Example #1
0
    def build_similarity_matrix(self, fingerprint_matrices):
        """Build a symmetric pairwise similarity matrix for a set of fingerprints.

        Parameters
        ----------
        fingerprint_matrices : dict or None
            Mapping of name -> fingerprint object.  Each pair of values is
            passed to ``self.compare`` and the result to
            ``self.similarity_measure``.

        Returns
        -------
        DataFrame or None
            Square frame whose index and columns are the fingerprint names.
            Diagonal cells are ``-1``; every other cell holds the similarity
            measure of the corresponding pair.  ``None`` is returned when
            ``fingerprint_matrices`` is ``None``.

        Notes
        -----
        When ``self.sim_pickle_path`` points at an existing file, its pickled
        frame pre-fills the matrix and only cells that are still NaN are
        computed.  This method does not write the pickle itself.
        """
        if fingerprint_matrices is None:
            return None

        # A concrete list works as an index on both Python 2 and 3
        # (dict.keys() is a view object on Python 3).
        names = list(fingerprint_matrices)
        similarity_matrix = DataFrame(index=names, columns=names)

        pickle_path = self.sim_pickle_path
        if pickle_path is not None:
            if os.path.isfile(pickle_path):
                print("Found pickled similarity matrix at '" + pickle_path + "', importing...")
                # NOTE: unpickling executes arbitrary code -- only point
                # sim_pickle_path at files from a trusted source.
                with open(pickle_path, 'rb') as sim_pickle:
                    similarity_matrix.update(pickle.load(sim_pickle))
            else:
                print("Warning: was asked to look for similarity matrix at '" + pickle_path + "'")
                print("Couldn't find one -- new pickle file will be created.")

        for name1, fp1 in fingerprint_matrices.items():
            for name2, fp2 in fingerprint_matrices.items():
                if name1 == name2:
                    similarity_matrix.loc[name1, name2] = -1
                elif np.isnan(similarity_matrix.loc[name1, name2]):
                    # Compute each unordered pair once and mirror it, so the
                    # (name2, name1) visit finds a non-NaN cell and is skipped.
                    comparison_result = self.compare(fp1, fp2)
                    similarity_measure = self.similarity_measure(comparison_result)
                    similarity_matrix.loc[name1, name2] = similarity_measure
                    similarity_matrix.loc[name2, name1] = similarity_measure

        return similarity_matrix
    def test_update_raise_on_overlap(self):
        """``update(..., errors='raise')`` raises ValueError on overlapping data."""
        target = DataFrame([
            [1.5, 1, 3.],
            [1.5, nan, 3.],
            [1.5, nan, 3],
            [1.5, nan, 3],
        ])
        # Row 3 / column 2 is populated in both frames.
        incoming = DataFrame(
            [[2., nan], [nan, 7]],
            index=[1, 3],
            columns=[1, 2],
        )

        with pytest.raises(ValueError, match="Data overlaps"):
            target.update(incoming, errors='raise')
    def test_update_raise(self):
        """``update(..., raise_conflict=True)`` raises ValueError on overlapping data."""
        target = DataFrame([
            [1.5, 1, 3.],
            [1.5, nan, 3.],
            [1.5, nan, 3],
            [1.5, nan, 3],
        ])
        # Row 3 / column 2 is populated in both frames.
        incoming = DataFrame(
            [[2., nan], [nan, 7]],
            index=[1, 3],
            columns=[1, 2],
        )

        with assertRaisesRegexp(ValueError, "Data overlaps"):
            target.update(incoming, raise_conflict=True)
    def test_update_dtypes(self):

        # gh 3016
        df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
                       columns=['A', 'B', 'bool1', 'bool2'])

        other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
        df.update(other)

        expected = DataFrame([[45., 45., False, True], [4., 5., True, False]],
                             columns=['A', 'B', 'bool1', 'bool2'])
        assert_frame_equal(df, expected)
Example #5
0
class InfoTable(DataFrameWidget):
    """DataFrame-backed widget whose leading columns identify plate and well."""

    def __init__(self, samples=None):
        # ``samples`` is accepted but not used by this constructor.
        self.initVars()
        super(InfoTable, self).__init__(self.table)

    def initVars(self):
        """Set the identifier column names and create an empty table with them."""
        self.columns = ["Plate ID", "Plate Name", "Plate Kea", "Well"]
        self.table = DataFrame(columns=self.columns)

    ########################################################################
    def update(self):
        """Move the four identifier columns to the front and refresh the widget."""
        leading = ["Plate ID", "Plate Name", "Plate Kea", "Well"]
        saved = [self.table[name] for name in leading]
        self.table = self.table.drop(labels=leading, axis=1)
        # Re-insert in order so they occupy positions 0..3.
        for position, name in enumerate(leading):
            self.table.insert(position, name, saved[position])
        self.setDataFrame(self.table)

    def append(self, appendage):
        """Append ``appendage`` to the table (re-numbering rows) and refresh."""
        self.table = self.table.append(appendage, ignore_index=True)
        self.update()

    def editPlates(self, edits):
        """Overwrite table values from ``edits``, matching its "ID" to "Plate ID"."""
        self.table = self.table.set_index("Plate ID")
        self.table.update(edits.set_index("ID"))
        self.table = self.table.reset_index()

    def importPlateData(self, plateData, key):
        """Overwrite table values from ``plateData``, aligning both frames on ``key``."""
        keyed_data = plateData.set_index(key)
        self.table = self.table.set_index(key)
        self.table.update(keyed_data)
        self.table = self.table.reset_index()

    def importSampleData(self, sampleData, tableKey, importKey):
        """Join ``sampleData`` onto the table.

        ``sampleData[importKey]`` is copied into a ``tableKey`` column of
        ``sampleData`` (the caller's frame is modified), and both frames are
        joined on ``tableKey``.  Clashing column names coming from
        ``sampleData`` receive the suffix "_new".
        """
        sampleData[tableKey] = sampleData[importKey]
        keyed_samples = sampleData.set_index(tableKey)
        self.table = self.table.set_index(tableKey)
        self.table = self.table.join(keyed_samples, rsuffix="_new")
        self.table = self.table.reset_index()

    def getKeaSexTestingData(self):
        """Return sample ID and plant alt-name columns indexed by plate and well."""
        wanted = ["Plate ID", "Well", "Sample ID", "Plant Alt Names"]
        subset = self.table[wanted].set_index(["Plate ID", "Well"])
        return subset.rename(columns={"Plant Alt Names": "Plant AltName"})
    def test_update_filtered(self):
        df = DataFrame([[1.5, nan, 3.],
                        [1.5, nan, 3.],
                        [1.5, nan, 3],
                        [1.5, nan, 3]])

        other = DataFrame([[3.6, 2., np.nan],
                           [np.nan, np.nan, 7]], index=[1, 3])

        df.update(other, filter_func=lambda x: x > 2)

        expected = DataFrame([[1.5, nan, 3],
                              [1.5, nan, 3],
                              [1.5, nan, 3],
                              [1.5, nan, 7.]])
        assert_frame_equal(df, expected)
    def test_update_nooverwrite(self):
        df = DataFrame([[1.5, nan, 3.],
                        [1.5, nan, 3.],
                        [1.5, nan, 3],
                        [1.5, nan, 3]])

        other = DataFrame([[3.6, 2., np.nan],
                           [np.nan, np.nan, 7]], index=[1, 3])

        df.update(other, overwrite=False)

        expected = DataFrame([[1.5, nan, 3],
                              [1.5, 2, 3],
                              [1.5, nan, 3],
                              [1.5, nan, 3.]])
        assert_frame_equal(df, expected)
    def test_update_nan(self):
        # #15593 #15617
        # test 1
        df1 = DataFrame({'A': [1.0, 2, 3], 'B': date_range('2000', periods=3)})
        df2 = DataFrame({'A': [None, 2, 3]})
        expected = df1.copy()
        df1.update(df2, overwrite=False)

        tm.assert_frame_equal(df1, expected)

        # test 2
        df1 = DataFrame({'A': [1.0, None, 3],
                         'B': date_range('2000', periods=3)})
        df2 = DataFrame({'A': [None, 2, 3]})
        expected = DataFrame({'A': [1.0, 2, 3],
                              'B': date_range('2000', periods=3)})
        df1.update(df2, overwrite=False)

        tm.assert_frame_equal(df1, expected)
    def test_update_from_non_df(self):
        d = {'a': Series([1, 2, 3, 4]), 'b': Series([5, 6, 7, 8])}
        df = DataFrame(d)

        d['a'] = Series([5, 6, 7, 8])
        df.update(d)

        expected = DataFrame(d)

        assert_frame_equal(df, expected)

        d = {'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}
        df = DataFrame(d)

        d['a'] = [5, 6, 7, 8]
        df.update(d)

        expected = DataFrame(d)

        assert_frame_equal(df, expected)
 def test_update_deprecation(self, raise_conflict):
     """Passing ``raise_conflict`` to ``update`` is expected to emit a FutureWarning."""
     frame = DataFrame([[1.5, 1, 3.]])
     empty = DataFrame()
     with tm.assert_produces_warning(FutureWarning):
         frame.update(empty, raise_conflict=raise_conflict)
 def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg):
     """``update`` called with ``bad_kwarg`` raises ``exception`` matching ``msg``."""
     frame = DataFrame([[1.5, 1, 3.]])
     with pytest.raises(exception, match=msg):
         frame.update(frame, **bad_kwarg)
Example #12
0
# --- Reindexing -------------------------------------------------------------
# NOTE(review): ``original`` is not defined in this excerpt -- presumably it is
# created earlier in the script.  The bare expressions below only display
# their result when run interactively; nothing is assigned.
different = DataFrame([[1,1],[2,2],[3.0,3]],index=['c','d','e'], columns=['one','two'])
original.reindex_like(different)
original.reindex_axis(['two','one'], axis = 1)

# --- Merging on the shared column 'one' with each join type -----------------
left = DataFrame([[1,2],[3,4],[5,6]],columns=['one','two'])
right = DataFrame([[1,2],[3,4],[7,8]],columns=['one','three'])
left.merge(right,on='one') # Same as how='inner'
left.merge(right,on='one', how='left')
left.merge(right,on='one', how='right')
left.merge(right,on='one', how='outer')

# --- In-place update ----------------------------------------------------------
# ``right`` is indexed 1..3 and contains NaN cells; ``left`` is modified in
# place by ``update``.
left = DataFrame([[1,2],[3,4],[5,6]],columns=['one','two'])
left
right = DataFrame([[nan,12],[13,nan],[nan,8]],columns=['one','two'],index=[1,2,3])
right
left.update(right) # Updates values in left
left

# --- Group-by on the 'region' column ----------------------------------------
# NOTE(review): ``state_gdp`` is not defined in this excerpt.
subset = state_gdp[['gdp_growth_2009','gdp_growth_2010','region']]
subset.head()
grouped_data = subset.groupby(by='region')
grouped_data.groups # Lists group names and index labels for group membership
grouped_data.mean()  # Same as a pivot table

# --- Applying a function column-wise and row-wise ---------------------------
# NOTE(review): ``mean`` is a bare name here -- presumably imported from numpy.
subset = state_gdp[['gdp_growth_2009','gdp_growth_2010','gdp_growth_2011','gdp_growth_2012']]
subset.index = state_gdp['state_code'].values
subset.head()
subset.apply(mean) # Same as subset.mean()
subset.apply(mean, axis=1).head() # Same as subset.mean(axis=1)

subset = state_gdp[['gdp_growth_2009','gdp_growth_2010','region']]
def create_fip(temporary_store = None, year = None):
    """
    Create the 'fipDat' table containing the 'fip individuals'.

    fip stands for "fichier d'imposition des personnes".  Some individuals are
    declared as dependents ('pac') on tax forms but are not present in the erf
    or eec tables.  They are added here to keep the concepts consistent.

    Parameters
    ----------
    temporary_store :
        Mapping-like store.  'indivim_<year>' is read from it; 'pacIndiv_<year>'
        and 'fipDat_<year>' are written to it.
    year :
        Survey year; selects the 'erfs_<year>' survey and the store keys.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If an argument is None or one of the internal consistency checks fails.
    """
    assert temporary_store is not None
    assert year is not None

    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)

    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    log.info(u"Démarrage de 03_fip")

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table = year_specific_by_generic["foyer"], variables = erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace = True)

    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u"    1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    # Each pac takes 5 characters (1 letter + 4-digit year) in anaisenf.
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))

    # Separating the string coding the pac of each "declaration".
    # Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    assert int(nb_pac_max) == nb_pac_max, "nb_pac_max = {} which is not an integer".format(nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [
            (i, 'declaration'),
            (i, 'type_pac'),
            (i, 'naia')
            ]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(
        multi_index_columns,
        names = ['pac_number', 'variable']
        )
    # The random values are placeholders only: every column is overwritten below.
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns = columns)

    for i in range(1, nb_pac_max + 1):  # TODO: using values to deal with mismatching indexes
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1: 5 * i].values

    fip = fip.stack("pac_number")
    fip.reset_index(inplace = True)
    fip.drop(['level_0'], axis = 1, inplace = True)

    log.info(u"    1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy()
    fip = fip.sort(columns = ['declaration', 'naia', 'type_pac'])
    fip.set_index(["declaration", "pac_number"], inplace = True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis = 1, inplace = True)
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), \
        "Certains types de PAC ne sont pas des cases connues"

    # control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    log.info(u"    1.3 : on enlève les individus F pour lesquels il existe un individu G")
    type_FG = fip[fip.type_pac.isin(['F', 'G'])].copy()  # Restrict to F & G only

    type_FG['same_pair'] = type_FG.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_FG['is_twin'] = type_FG.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    # Note: rows with a distinct (declaration, naia) pair and twins are kept;
    #       the others (declared both as F and G) are removed.
    fip['to_keep'] = np.nan
    fip.update(type_FG)
    log.info(u"    1.4 : on enlève les H pour lesquels il y a un I")
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_HI['is_twin'] = type_HI.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values

    fip.update(type_HI)
    # Rows that are neither F/G nor H/I were never flagged: keep them.
    fip['to_keep'] = fip['to_keep'].fillna(True)
    log.info(u"{} F, G, H or I non redundant pac kept over {} potential candidates".format(
        fip['to_keep'].sum(), len(fip))
        )
    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI
    #
    # control(indivifip, debug=True)

    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(), "Il y a des valeurs manquantes de la variable naia"

    # For safety enforce pac.naia and indivifip.naia dtypes
    pac['naia'] = pac.naia.astype('int32')
    indivifip['naia'] = indivifip.naia.astype('int32')
    # Matching key: (year of birth, first 29 characters of the declaration id)
    pac['key1'] = zip(pac.naia, pac['declar1'].str[:29])
    pac['key2'] = zip(pac.naia, pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip.naia.values, indivifip['declaration'].str[:29].values)
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(pac.naia.dtype, indivifip.naia.dtype)

    # fip keeps only the declared pac that match no individual already in the eec
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()

    log.info(u"    2.1 new fip created")
    # We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    log.info(u"{} pac dans les 1ères déclarations".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    log.info(u"{} pac dans les 2èms déclarations".format(len(pac_ind2)))

    log.info("{} duplicated pac_ind1".format(pac_ind1.duplicated().sum()))
    log.info("{} duplicated pac_ind2".format(pac_ind2.duplicated().sum()))

    del pac_ind1['key1'], pac_ind2['key2']

    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
            log.info(u"Warning : no link between pac and noindiv for both pacInd1&2")
            # pacInd must still be bound (as an empty frame with the expected
            # columns); otherwise the assert below fails with a NameError.
            pacInd = concat([pac_ind2, pac_ind1])
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    assert len(pac_ind1) + len(pac_ind2) == len(pacInd)
    log.info("{} null pac_ind2.type_pac".format(pac_ind2.type_pac.isnull().sum()))
    log.info("pacInd.type_pac.value_counts()) \n {}".format(pacInd.type_pac.value_counts(dropna = False)))

    log.info(u"    2.2 : pacInd created")
    log.info(u"doublons noindiv, type_pac {}".format(pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    # pacIndiv.reset_index(inplace=True)
    log.info("{}".format(pacIndiv.columns))

    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv

    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()

    # We keep the fip in the menage of their parents because it is used in to
    # build the famille. We should build an individual ident (menage) for the fip that are
    # older than 18 since they are not in their parents' menage according to the eec
    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype('int32')  # To be used later to set idfoy

    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi.persfip == "vous")]
    individec1 = individec1[["declar1", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec1 = individec1.rename(columns = {'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on = 'declaration')
    log.info(u"    2.3 : fip1 created")

    individec2 = indivi.loc[
        (indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous"),
        ["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]
        ].copy()
    individec2.rename(columns = {'declar2': 'declaration'}, inplace = True)
    fip2 = fip.merge(individec2)
    log.info(u"    2.4 : fip2 created")

    # NOTE(review): the results of these two expressions are discarded.
    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

    fip = concat([fip1, fip2])

    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float')  # BUG: no year column in the DataFrame
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration'].copy()
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration'].copy()
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    # lpr is 3 for agepf <= 20, otherwise 4
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['agepr'] = None
    # actrec is 9 for agepf <= 15, otherwise 5
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5

    # TODO: actrec issue for fip children aged 16 to 20: we cannot tell whether they are students or employees
    # TODO: issue with the birth months of FIP children: check whether these values can be recovered (Alexis: clearly not)

    # Reassigning noi for fip children if they are more than one per foyer fiscal
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    # Decrement noi on duplicated (noi, ident) rows until every pair is unique.
    while any(fip.duplicated(subset = ['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        log.info("{}".format(len(tmp)))
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0

    log.info("Number of duplicated fip: {}".format(fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    del fip, fip1, individec1, indivifip, indivi, pac
    log.info(u"fip sauvegardé")
Example #14
0
class ResultsTable(DataFrameWidget):
	'''The Class implementing the table in the Results tab of fyd2.

	The data lives in ``self.table`` (a DataFrame); ``self.columns`` holds
	the display order of its columns.'''
	def __init__(self,samples=None):
		# ``samples`` is accepted but not used by this constructor.
		self.initVars()
		super(ResultsTable,self).__init__(self.table)
		
	def initVars(self):
		'''Initialises variables.'''
		self.columns					= ['Plate ID','Plate Name','Plate Kea','Well','Population',
											'Crop','Experiment','LC Well','Result','Group',
											'Exists','Grind','Concentration','Include']
		self.table						= DataFrame(columns=self.columns)
	
########################################################################
	def update(self):
		'''Resets the booleans to booleans and reorders the columns.'''
		self.table['Exists']			= self.table['Exists'].map(bool)
		self.table['Grind']				= self.table['Grind'].map(bool)
		self.table['Include']			= self.table['Include'].map(bool)
		self.table						= self.table[self.columns]
		self.setDataFrame(self.table)
		
	def append(self,appendage):
		'''Append the samples in appendage to the table.'''
		self.table						= self.table.append(appendage,ignore_index=True)

	def editPlates(self,edits):
		'''Finds plates by Plate ID and edits data. Used in the 'Edit Data'
		menu item. The "ID" column of edits is matched against "Plate ID".'''
		self.table				= self.table.set_index('Plate ID')
		edits					= edits.set_index('ID')
		self.table.update(edits)
		self.table				= self.table.reset_index()

########################################################################		
	def importPlateData(self,plateData,key):
		'''Updates the results table with the data read from the Plates
		Records spreadsheet, aligning both frames on the column ``key``,
		then applies the non-harvested and bad-grind information.'''
		plateData				= plateData.set_index(key)
		self.table				= self.table.set_index(key)
		self.table.update(plateData)
		self.table				= self.table.reset_index()
		plateData				= plateData.reset_index()
	
		self.importPlateDataNonSamples(plateData)
		self.importPlateDataBadGrinds(plateData)
		
	def importPlateDataNonSamples(self,plateData):
		'''Updates the non-existing sample column from plateData, which 
		has been read from the Plates Records spreadsheet.'''
		plateData				= plateData[['Plate ID','Non-harvested plants']]
		exists					= plateData.dropna(how='all',subset=['Non-harvested plants'])
		# One row per (plate, well): the cell is a space-separated list of wells.
		exists					= DataFrame(exists['Non-harvested plants'].str.split(' ').tolist(),index=exists['Plate ID']).stack()
		exists					= exists.reset_index().drop('level_1',axis=1)
		exists.columns			= ['Plate ID','Well']
		exists['Exists']		= False
		exists					= exists.set_index(['Plate ID','Well'])
		self.table				= self.table.set_index(['Plate ID','Well'])
		self.table.update(exists)
		self.table				= self.table.reset_index()
		
	def importPlateDataBadGrinds(self,plateData):
		'''Updates the Bad Grinds column from plateData, which has been
		read from the Plates Records spreadsheet.'''
		plateData				= plateData[['Plate ID','Bad Grinds']]
		badGrinds				= plateData.dropna(how='all',subset=['Bad Grinds'])
		# One row per (plate, well): the cell is a space-separated list of wells.
		badGrinds				= DataFrame(badGrinds['Bad Grinds'].str.split(' ').tolist(),index=badGrinds['Plate ID']).stack()
		badGrinds				= badGrinds.reset_index().drop('level_1',axis=1)
		badGrinds.columns		= ['Plate ID','Well']
		badGrinds['Grind']		= False
		badGrinds				= badGrinds.set_index(['Plate ID','Well'])
		self.table				= self.table.set_index(['Plate ID','Well'])
		self.table.update(badGrinds)
		self.table				= self.table.reset_index()

########################################################################		
	def setCrop(self,crop):
		'''Sets the item in the crop menu for all samples.'''
		self.table['Crop']		= crop
		
	def addLCFiles(self,fileDataLists):
		'''Adds lightcyler results for given fileDataLists. Each entry is a
		(name, plate, exp, robot, pos) tuple; ``name`` is a tab-separated file.'''
		lc							= DataFrame(columns=['LC Well','Result','Experiment','Plate','Well'])
		for fDL in fileDataLists:
			name,plate,exp,robot,pos= fDL
			lcFrame					= read_table(name,sep='\t',header=1)
			lcFrame					= lcFrame.drop(['Include','Color','Name','Status'],axis=1)
			lcFrame['Experiment']	= exp
			lcFrame['Plate']		= plate
			lcFrame['Well']			= lcFrame['Pos'].map(lambda x: convert[robot][pos][x])
			lcFrame['Group']		= lcFrame['Group'].astype(str)
			# NOTE(review): assumes exactly five columns remain at this point,
			# in the order listed -- confirm against the file format.
			lcFrame.columns			= ['LC Well','Result','Experiment','Plate','Well',]
			lc						= lc.append(lcFrame)
		lc							= lc.set_index(['Plate','Well'])
		self.table					= self.table.set_index(['Plate ID','Well'])
		self.table.update(lc)
		self.table					= self.table.reset_index()
		
	def addTaqFiles(self,fileDataLists):
		'''Adds Taqman results for given fileDataLists. Each entry is a
		(name, plate, exp, robot, pos) tuple; ``name`` is a tab-separated file.'''
		lc							= DataFrame(columns=['LC Well','Result','Experiment','Plate','Well'])
		for fDL in fileDataLists:
			name,plate,exp,robot,pos= fDL
			lcFrame					= read_table(name,sep='\t',header=1)
			lcFrame					= lcFrame.drop(['Include','Color','465-510','618-660','Score','Status'],axis=1)
			lcFrame['Experiment']	= exp
			lcFrame['Plate']		= plate
			lcFrame['Well']			= lcFrame['Pos'].map(lambda x: convert[robot][pos][x])
			lcFrame['Call']			= lcFrame['Call'].astype(str)
			lcFrame					= lcFrame[['Pos','Call','Experiment','Plate','Well']]
			lcFrame.columns			= ['LC Well','Result','Experiment','Plate','Well',]
			lc						= lc.append(lcFrame)
		lc							= lc.set_index(['Plate','Well'])
		self.table					= self.table.set_index(['Plate ID','Well'])
		self.table.update(lc)
		self.table					= self.table.reset_index()

########################################################################
	def negativiseUnknowns(self):
		'''Sets all Unknown results to Negative. Used for Brassica/Ryegrass.'''
		self.table['Result']		= self.table['Result'].map(lambda res: 'Negative' if res == 'Unknown' else res)
		
	def setNonExistsToNegative(self):
		'''Reads the Exists column, and if it is False, sets the corresponding
		item in the Group column to Negative.'''
		self.table['Group']			= self.table.apply(lambda x: x['Group'] if x['Exists'] else 'Negative',axis=1)
	
	def setNonExistsToNoSample(self):
		'''Reads the Exists column, and if it is False, sets the corresponding
		item in the Group column to No sample.'''
		self.table['Group']			= self.table.apply(lambda x: x['Group'] if x['Exists'] else 'No sample',axis=1)

	def includeAll(self):
		'''Sets the Include column to True for all samples.'''
		self.table['Include']		= True
		
	def excludeFailGrinds(self):
		'''Sets the Include column to False for failed grinds.'''
		self.table['Include']		= self.table.apply(lambda x: False if not(x['Grind']) else x['Include'],axis=1)
		
	def excludeNegativeFailGrinds(self):
		'''Sets the Include column to False for failed grinds whose Group
		is Negative.'''
		self.table['Include']		= self.table.apply(lambda x: False if not(x['Grind']) and x['Group']=='Negative' else x['Include'],axis=1)
		
	def excludeNonExists(self):
		'''Sets the Include column to False for samples that don't exist.'''
		self.table['Include']		= self.table.apply(lambda x: False if not(x['Exists']) else x['Include'],axis=1)

	def missingResults(self):
		'''Determines if there are missing entries in Results or Groups.
		Returns a pair (any Result missing, any Group missing).'''
		return self.table['Result'].isnull().any(), self.table['Group'].isnull().any()
		
########################################################################
	def getPopulations(self):
		'''Returns the unique values of the Population column.'''
		return unique(self.table.Population.ravel())
		
	def getGroups(self):
		'''Returns the unique values of the Group column.'''
		return unique(self.table.Group.ravel())
		
	def getExperiments(self):
		'''Returns the unique values of the Experiment column.'''
		return unique(self.table.Experiment.ravel())
		
	def getCherriesByPop(self,cherryData):
		'''Gets cherrypicking data for passed populations/results.
		cherryData yields (population, groups, samples) triples; at most
		``samples`` rows are taken per population.'''
		cherries					= DataFrame(columns=['Source plate','Dest plate',
														'Source Position','Source Well',
														'Destination Position','Destination Well',
														'Volume (ul)','Run'])
		for pop,groups,samples in cherryData:
			cherry					= self.table[['Plate ID','Well','Population','Group']]
			cherry					= cherry.loc[(cherry.Population==pop) & (cherry.Group.isin(groups))]
			cherry					= cherry.reset_index().head(samples)[['Plate ID','Well']]
			cherry.columns			= ['Source plate','Source Well']
			cherries				= cherries.append(cherry)
		cherries					= self.fillCherryData(cherries)
		return cherries
		
	def getCherriesNU(self):
		'''Gets cherrypicking data for Negatives and Unknowns.'''
		cherries					= DataFrame(columns=['Source plate','Dest plate',
														'Source Position','Source Well',
														'Destination Position','Destination Well',
														'Volume (ul)','Run'])
		cherry						= self.table[['Plate ID','Well','Group']]
		cherry						= cherry.loc[cherry.Group.isin(['Negative','Unknown'])]
		cherry						= cherry.reset_index()[['Plate ID','Well']]
		cherry.columns				= ['Source plate','Source Well']
		cherries					= cherries.append(cherry)
		cherries					= self.fillCherryData(cherries)
		return cherries
	
	def fillCherryData(self,cherries):
		'''Fills out the cherrypicking datatables, with Source Position,
		Destination Position, Destination Well, Run, and Volume.'''
		controls					= 2
		plateNames					= 'CP'
		sources						= [4,5,7,8]
		dests						= [10,11]
		# Column-major well names A1..H12, minus the first ``controls`` wells.
		wellsList					= [l+str(n) for n in range(1,13) for l in 'ABCDEFGH'][controls:]
		wells						= len(wellsList)
		rows						= len(cherries)
		cherries['Volume (ul)']		= 50
		# Floor division keeps these integer on Python 3 as well as Python 2.
		cherries['Destination Well']= wellsList * (rows//wells) + wellsList[:rows%wells]
		cherries['Dest plate']		= [plateNames + str(i//wells+1) for i in range(rows)]
		sourcePlates				= unique(cherries['Source plate'].ravel())
		destPlates					= unique(cherries['Dest plate'].ravel())
		# Plates are assigned positions round-robin; zip stops at the finite plate list.
		sourceDict 					= dict(zip(sourcePlates,itertools.cycle(sources)))
		destDict					= dict(zip(destPlates,itertools.cycle(dests)))
		cherries['Source Position']	= cherries['Source plate'].apply(lambda x: sourceDict[x],1)
		cherries['Destination Position']	= cherries['Dest plate'].apply(lambda x: destDict[x],1)
		self.run					= 1
		self.sources				= []
		self.dests					= []
		Run							= []
		for row in cherries.itertuples():
			# NOTE(review): row[5] and row[2] are positional fields, so which
			# columns they address depends on the column order of ``cherries``
			# at this point -- confirm they are the source/destination positions.
			source					= row[5]
			dest					= row[2]
			if not self.sources or source != self.sources[-1]:
				self.sources.append(source)
			if not self.dests or dest != self.dests[-1]:
				self.dests.append(dest)
			# Start a new run once more than 4 sources or 2 destinations are in use.
			if len(self.sources) > 4 or len(self.dests) > 2:
				self.run = self.run + 1
				self.sources		= [source]
				self.dests			= [dest]
			Run.append('Run ' + format(self.run,'03d'))
		cherries['Run']				= Run
		cherries['Source Position']	= cherries['Source Position'].apply(lambda x: 'P'+str(x)) 
		cherries['Destination Position']	= cherries['Destination Position'].apply(lambda x: 'P'+str(x))
		cherries					= cherries[['Source plate','Dest plate','Source Position','Source Well',
												'Destination Position','Destination Well','Volume (ul)','Run']]
		return cherries
		
########################################################################
	def getKeaSexTestingData(self):
		'''Gets the data required by the Kea Sex testing process run.
		Group values are converted to single-letter codes; anything not in
		the conversion table becomes 'U'. The result is indexed by
		(Plate ID, Well).'''
		kea							= {'Male': 		'M',
										'Female':	'F',
										'1':		'1',
										'2':		'2',
										'3':		'3',
										'4':		'4',
										'Negative':	'U',
										'Unknown':	'U',
										'No Sample':'U',}
		data						= ['Plate ID','Plate Kea','Well','Experiment','Group']
		
		# Work on an explicit copy so the assignment below never targets a
		# selection of self.table (chained assignment).
		table						= self.table[data].copy()
		table['Group']				= table['Group'].apply(lambda x: kea.get(x,'U'))
		# 'Plant Alt Names' is not among the selected columns, so that entry
		# has no effect here.
		table						= table.rename(columns={'Plate Kea':			'Plate',
								'Experiment':		'Slipstream Expt No',
								'Plant Alt Names':	'Plant AltName',
								'Group':			'Sex Marker Results'})
		table						= table.set_index(['Plate ID','Well'])
		return table
Example #15
0
def create_fip(year = 2006): # message('03_fip')
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """

    df = DataCollection(year=year)

    print 'Démarrer 03_fip'
# # anaisenf: année de naissance des PAC
# erfFoyVar <- c('anaisenf','declar')
# foyer <- LoadIn(erfFoyFil)
# foyer <- LoadIn(erfFoyFil,erfFoyVar)

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = df.get_values(table="foyer", variables=erfFoyVar)
    print_id(foyer)
#    control(foyer, verbose=True, verbose_length=10, debug=True)


# #***********************************************************************************************************
# # print "Step 1 : on recupere les personnes à charge des foyers"
# #**********************************************************************************************************
# # On traite les cas de declarations multiples pour ne pas créer de doublon de pac
#
#
# # On récupère toutes les pac des foyers
# L <- max(nchar(foyer$anaisenf))/5 # nombre de pac maximal
# fip <-data.frame(declar = foyer$declar)
# for (i in c(1:L)){
#   eval(parse(text = paste('fip$typ.',as.character(i),'<- substr(foyer$anaisenf,5*(i-1)+1,5*(i-1)+1)',sep = '')))
#   eval(parse(text = paste('fip$naia.',as.character(i),'<- as.numeric(substr(foyer$anaisenf,5*(i-1)+2,5*(i-1)+5))',sep = '')))
# }
# fip <- fip[!is.na(fip$typ.1),]
# fip <- reshape(fip,direction ='long', varying=2:17, sep=".")
# fip <- fip[!is.na(fip$naia),]
# fip <- fip[order(fip$declar,-rank(fip$typ),fip$naia),c('declar','naia','typ')]
# fip$N <- row(fip)[,1]
# str(fip$N)

    print "Etape 1 : on recupere les personnes à charge des foyers"
    print "    1.1 : Création des codes des enfants"
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    nb_pac_max = len(max(foyer['anaisenf'], key=len))/5
    print "il ya a au maximum %s pac par foyer" %nb_pac_max

# Separating the string coding the pac of each "déclaration".
# Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [(i, 'declaration'), (i, 'type_pac'), (i, 'naia')]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(multi_index_columns, names=['pac_number', 'variable'])
    fip = DataFrame(randn(len(foyer), 3*nb_pac_max), columns=columns)
    fip.fillna(NaN, inplace=True) # inutile a cause de la ligne précédente, to remove
    for i in range(1,nb_pac_max+1):
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i,'type_pac')] = foyer['anaisenf'].str[5*(i-1)]
        fip[(i,'naia')] = foyer['anaisenf'].str[5*(i-1)+1:5*(i)]

    fip = fip.stack("pac_number")
    fip.reset_index(inplace=True)
    del fip["level_0"]

#     print fip.describe()
#     print fip.head().to_string()
    print "    1.2 : elimination des foyers fiscaux sans pac"
    #Clearing missing values and changing data format
    fip = fip[(fip['type_pac'].notnull()) & (fip['naia'] != 'an')  & (fip['naia'] != '')]
    fip = fip.sort(columns=['declaration','naia','type_pac'])
    # TODO: check if useful
    fip.set_index(["declaration","pac_number"], inplace=True)
    fip = fip.reset_index()

    del fip['pac_number']
#    control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    print "    1.3 : on enlève les individus F pour lesquels il existe un individu G"
    tyFG = fip[fip.type_pac.isin(['F', 'G'])] #Filtre pour ne travailler que sur F & G

    tyFG['same_pair'] = tyFG.duplicated(cols=['declaration', 'naia'], take_last=True)
    tyFG['is_twin'] = tyFG.duplicated(cols=['declaration', 'naia', 'type_pac'])
    tyFG['to_keep'] = (~(tyFG['same_pair']) | (tyFG['is_twin']))
    #Note : On conserve ceux qui ont des couples déclar/naia différents et les jumeaux
    #puis on retire les autres (à la fois F et G)
    print len(tyFG),'/', len(tyFG[tyFG['to_keep']])
    print 'longueur fip', len(fip)

    fip['to_keep'] = NaN
    fip.update(tyFG)
    print 'enfants F & G traités'

    print "    1.4 : on enlève les H pour lesquels il y a un I"
    tyHI = fip[fip.type_pac.isin(['H', 'I'])]
    tyHI['same_pair'] = tyHI.duplicated(cols=['declaration', 'naia'], take_last=True)
    tyHI['is_twin'] = tyHI.duplicated(cols=['declaration', 'naia', 'type_pac'])
    tyHI['to_keep'] = ~(tyHI['same_pair']) | (tyHI['is_twin'])

    fip.update(tyHI)
    fip['to_keep'] = fip['to_keep'].fillna(True)
    print 'nb lines to keep/nb initial lines'
    print len(fip[fip['to_keep']]), '/', len(fip)

    indivifip = fip[fip['to_keep']]; del indivifip['to_keep'], fip, tyFG, tyHI

#    control(indivifip, debug=True)


# #************************************************************************************************************/
    print ''
    print 'Step 2 : matching indivifip with eec file'
# #************************************************************************************************************/

    indivi = load_temp(name="indivim", year=year) #TODO: USE THIS INSTEAD OF PREVIOUS LINES


# pac <- indivi[!is.na(indivi$persfip) & indivi$persfip == 'pac',]
# pac$key1 <- paste(pac$naia,pac$declar1)
# pac$key2 <- paste(pac$naia,pac$declar2)
# indivifip$key <- paste(indivifip$naia,indivifip$declar)

    #TODO: replace Indivi['persfip'] is not NaN by indivi['persfip'].notnull()
    import pdb
    pdb.set_trace()
    pac = indivi[(indivi['persfip'] is not NaN) & (indivi['persfip']=='pac')]

    pac['naia'] = pac['naia'].astype('int32') # TODO: was float in pac fix upstream
    indivifip['naia'] = indivifip['naia'].astype('int32')
    pac['key1'] = zip(pac['naia'], pac['declar1'].str[:29])
    pac['key2'] = zip(pac['naia'], pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip['naia'], indivifip['declaration'].str[:29])
    assert pac.naia.dtype == indivifip.naia.dtype, 'types %s , %s are different' %(pac.naia.dtype, indivifip.naia.dtype)

# fip <- indivifip[!indivifip$key %in% pac$key1,]
# fip <- fip[!fip$key %in% pac$key2,]
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))]
    fip = fip[~(fip.key.isin(pac.key2.values))]


    print "    2.1 new fip created"
# We build a dataframe to link the pac to their type and noindiv
# table(duplicated(pac[,c("noindiv")]))
    countInd = pac.noindiv.value_counts()

# pacInd1 <- merge(pac[,c("noindiv","key1","naia")],
#                 indivifip[,c("key","typ")], by.x="key1", by.y="key")
# pacInd2 <- merge(pac[,c("noindiv","key2","naia")],
#                 indivifip[,c("key","typ")], by.x="key2", by.y="key")

    tmp_pac1 = pac[['noindiv', 'key1']]
    tmp_pac2 = pac[['noindiv', 'key2']]
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']]

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    print 'longueur pacInd1' , len(pac_ind1)
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    print 'longueur pacInd2', len(pac_ind2)
    print "pacInd1&2 créés"

# table(duplicated(pacInd1))
# table(duplicated(pacInd2))

    print pac_ind1.duplicated().sum()
    print pac_ind2.duplicated().sum()

# pacInd1 <-rename(pacInd1,c("key1" = "key"))
# pacInd2 <-rename(pacInd2,c("key2" = "key"))
# pacInd <- rbind(pacInd1,pacInd2)
# rm(pacInd1,pacInd2)

#     pacInd1.rename(columns={'key1':'key'}, inplace=True)
#     pacInd2.rename(columns={'key2':'key'}, inplace=True)
    del pac_ind1['key1'], pac_ind2['key2']
    print pac_ind1.columns
    print pac_ind2.columns

    if pac_ind1.index == []:
        if pac_ind2.index == []:
                print "Warning : no link between pac and noindiv for both pacInd1&2"
        else:
            print "Warning : pacInd1 is an empty data frame"
            pacInd = pac_ind2
    elif pac_ind2.index == []:
        print "Warning : pacInd2 is an empty data frame"
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    print len(pac_ind1), len(pac_ind2), len(pacInd)
    print pac_ind2.type_pac.isnull().sum()
    print pacInd.type_pac.value_counts()

    print '    2.2 : pacInd created'

# table(duplicated(pacInd[,c("noindiv","typ")]))
# table(duplicated(pacInd$noindiv))

    print 'doublons noindiv, type_pac', pacInd.duplicated(['noindiv', 'type_pac']).sum()
    print 'doublons noindiv seulement', pacInd.duplicated('noindiv').sum()
    print 'nb de NaN', pacInd.type_pac.isnull().sum()

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))]
#     pacIndiv.reset_index(inplace=True)
    print pacIndiv.columns

    save_temp(pacIndiv, name="pacIndiv", year=year)

    print pacIndiv.type_pac.value_counts()
    gc.collect()

# # We keep the fip in the menage of their parents because it is used in to
# # build the famille. We should build an individual ident for the fip that are
# # older than 18 since they are not in their parents' menage according to the eec

# individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous"))
# individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")]
# individec1 <- upData(individec1,rename=c(declar1="declar"))
# fip1       <- merge(fip,individec1)
    # indivi$noidec <- as.numeric(substr(indivi$declar1,1,2))
    indivi['noidec'] = indivi['declar1'].str[0:2].astype('float16') # To be used later to set idfoy
    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi['persfip']=="vous")]
    individec1 = individec1.loc[:, ["declar1","noidec","ident","rga","ztsai","ztsao"]]
    individec1 = individec1.rename(columns={'declar1':'declaration'})
    fip1 = fip.merge(individec1, on='declaration')
    print '    2.3 : fip1 created'

# # TODO: On ne s'occupe pas des declar2 pour l'instant
# # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous"))
# # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")]
# # individec2 <- upData(individec2,rename=c(declar2="declar"))
# # fip2 <-merge(fip,individec2)

    individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip']=="vous")]
    individec2 = individec2.loc[:, ["declar2","noidec","ident","rga","ztsai","ztsao"]]
    individec2.rename(columns={'declar2':'declaration'}, inplace=True)
    print individec2.head()
    fip2 = fip.merge(individec2)
    print '    2.4 : fip2 created'


    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

# #fip <- rbind(fip1,fip2)
# fip <- fip1
# table(fip$typ)

    fip = concat([fip1, fip2])
#     fip = fip1 #TODO: Pourquoi cette ligne ?
    fip.type_pac.value_counts()

    print fip.columns
    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float') # BUG; pas de colonne année dans la DF
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration']
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration'] #TODO declar ?
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip['naia'].astype('float')
    fip['lpr'] = where(fip['agepf'] <=20, 3, 4) # TODO pas très propre d'après Mahdi/Clément
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = where(fip['agepf']<=15, 9, 5)

## TODO: probleme actrec des enfants fip entre 16 et 20 ans : on ne sait pas s'ils sont étudiants ou salariés */
## TODO problème avec les mois des enfants FIP : voir si on ne peut pas remonter à ces valeurs: Alexis : clairement non

# Reassigning noi for fip children if they are more than one per foyer fiscal
# while ( any(duplicated( fip[,c("noi","ident")]) ) ) {
#   dup <- duplicated( fip[, c("noi","ident")])
#   tmp <- fip[dup,"noi"]
#   fip[dup, "noi"] <- (tmp-1)
# }
    #TODO: Le vecteur dup est-il correct
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    fip_tmp = fip[['noi','ident']]

    while any(fip.duplicated(cols=['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        print len(tmp)
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100*fip['ident'] + fip['noidec']
    fip['noindiv'] = 100*fip['ident'] + fip['noi']
    fip['type_pac'] = 0 ; fip['key'] = 0

    print fip.duplicated('noindiv').value_counts()
    save_temp(fip, name="fipDat", year=year)
    del fip, fip1, individec1, indivifip, indivi, pac
    print 'fip sauvegardé'
Example #16
0
 def test_update_datetime_tz(self):
     # GH 25807
     result = DataFrame([pd.Timestamp('2019', tz='UTC')])
     result.update(result)
     expected = DataFrame([pd.Timestamp('2019', tz='UTC')])
     assert_frame_equal(result, expected)