def drop_repeat01():
    data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                      'k2': [1, 1, 2, 3, 3, 4, 4]})
    print(data)
    print(data.duplicated())
    print(data.drop_duplicates())
    data['v1'] = range(7)
    print(data.drop_duplicates(['k1']))
    print(data)
    print(data.drop_duplicates(['k1', 'k2'], keep='last'))
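
The keep argument is what distinguishes the calls above; a minimal sketch (added here, not part of the original example) of the three options:

# Hedged sketch: what `keep` selects, assuming pandas >= 0.17
import pandas as pd

df = pd.DataFrame({'k1': ['one', 'one', 'two'], 'k2': [1, 1, 2]})
print(df.duplicated(keep='first'))  # [False, True, False]: later occurrences flagged
print(df.duplicated(keep='last'))   # [True, False, False]: earlier occurrences flagged
print(df.duplicated(keep=False))    # [True, True, False]: every member of a duplicate group flagged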
Example #2
def test_frame_datetime64_duplicated():
    dates = date_range('2010-07-01', end='2010-08-05')

    tst = DataFrame({'symbol': 'AAA', 'date': dates})
    result = tst.duplicated(['date', 'symbol'])
    assert (~result).all()

    tst = DataFrame({'date': dates})
    result = tst.duplicated()
    assert (~result).all()
Example #3
def test_duplicated_with_misspelled_column_name(subset):
    # GH 19730
    df = DataFrame({'A': [0, 0, 1],
                    'B': [0, 0, 1],
                    'C': [0, 0, 1]})

    with pytest.raises(KeyError):
        df.duplicated(subset)

    with pytest.raises(KeyError):
        df.drop_duplicates(subset)
def slide_10():
    data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                      'k2': [1, 1, 2, 3, 3, 4, 4]})
    print(data)
    print(data.duplicated())
    print(data.duplicated('k1'))
    print(data.drop_duplicates())

    data['v1'] = range(7)
    print(data)
    print(data.drop_duplicates(['k1']))
    print(data.drop_duplicates(['k1', 'k2'], keep='last'))  # take_last=True in pandas < 0.17
def slide_21():
    import json
    db = json.load(open(FOODJSONPATH))
    print(len(db))

    print(db[0].keys())
    print(db[0]['nutrients'][0])

    nutrients = DataFrame(db[0]['nutrients'])
    print(nutrients[:7])

    info_keys = ['description', 'group', 'id', 'manufacturer']
    info = DataFrame(db, columns=info_keys)
    print(info[:5])

    print(pd.value_counts(info.group)[:10])

    print("Now processing all of the nutrients")
    nutrients = []

    for rec in db:
        fnuts = DataFrame(rec['nutrients'])
        fnuts['id'] = rec['id']
        nutrients.append(fnuts)

    nutrients = pd.concat(nutrients, ignore_index=True)
    print("There are quite a few duplicates")
    print(nutrients.duplicated().sum())
    nutrients = nutrients.drop_duplicates()

    print("Both info and nutrients have 'description' and 'group' columns, so rename them")
    col_mapping = {'description': 'food', 'group': 'fgroup'}
    info = info.rename(columns=col_mapping, copy=False)

    col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}
    nutrients = nutrients.rename(columns=col_mapping, copy=False)

    ndata = pd.merge(nutrients, info, on='id', how='outer')
    print(ndata.loc[30000])

    result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
    result['Zinc, Zn'].sort_values().plot(kind='barh')  # Series.order() in pandas < 0.17
    plt.show()

    by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])
    get_maximum = lambda x: x.xs(x.value.idxmax())
    get_minimum = lambda x: x.xs(x.value.idxmin())

    max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]

    # Truncate the food names so the output stays readable
    max_foods.food = max_foods.food.str[:50]

    print(max_foods.loc['Amino Acids']['food'])
Example #6
    def submit(self, df: pd.DataFrame, job_opts: JobOpts, delay=0.02, progressbar=True):
        """Submit jobs to the cluster.

        You have to establish a connection first (explicit is better than implicit).

        Examples:
            >>> with js.connect():
            ...     js.submit([(0, 'echo "Hello world!"'), (1, 'echo "Goodbye world!"')])
        """
        assert 'system_command' in df
        assert not df.duplicated().any()

        job_opts.working_dir.joinpath(job_opts.job_id).mkdir(parents=True, exist_ok=True)

        if self.host_opts.scheme in ['local']:
            worker = functools.partial(self._local_worker, job_opts=job_opts)
        else:
            worker = functools.partial(self._remote_worker, job_opts=job_opts)

        # Submit multiple jobs in parallel
        futures = []
        pool = concurrent.futures.ThreadPoolExecutor()
        for row in self._itertuples(df, progressbar=progressbar):
            future = pool.submit(worker, row)
            futures.append(future)
            time.sleep(delay)
        pool.shutdown(wait=False)
        return futures
Example #7
class Duplicated(object):

    def setup(self):
        n = (1 << 20)
        t = date_range('2015-01-01', freq='S', periods=(n // 64))
        xs = np.random.randn(n // 64).round(2)
        self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n),
                             'b': np.random.choice(t, n),
                             'c': np.random.choice(xs, n)})
        self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T

    def time_frame_duplicated(self):
        self.df.duplicated()

    def time_frame_duplicated_wide(self):
        self.df2.duplicated()
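
The class above follows the asv (airspeed velocity) benchmark convention of setup/time_* methods; a hedged sketch of exercising it directly with the standard library's timeit instead of the asv runner:

# Hedged sketch: timing the benchmark methods by hand with timeit
import timeit

bench = Duplicated()
bench.setup()
print(timeit.timeit(bench.time_frame_duplicated, number=10))
print(timeit.timeit(bench.time_frame_duplicated_wide, number=10))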
def process_duplicated_entries(dfm_stk_strc: DataFrame, stockid):
    # Collect the rows whose 变动日期 (change date) is duplicated, keep only the
    # first occurrence in the main frame, then fold the duplicates' 变动原因
    # (change reason) back into the surviving row.
    dfm_duplicated = dfm_stk_strc[dfm_stk_strc.duplicated(['变动日期'])]
    dfm_stk_strc.drop_duplicates('变动日期', inplace=True)
    for index, row in dfm_duplicated.iterrows():
        dfm_stk_strc.loc[index, '变动原因'] = dfm_stk_strc.loc[index]['变动原因'] + '|' + row['变动原因']
        logprint('Stock %s: record for 变动日期 %s merged into the master record. %s'
                 % (stockid, row['变动日期'], tuple(row)))
Example #9
def test_duplicated_on_empty_frame():
    # GH 25184

    df = DataFrame(columns=['a', 'b'])
    dupes = df.duplicated('a')

    result = df[dupes]
    expected = df.copy()
    tm.assert_frame_equal(result, expected)
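
The regression being guarded here is that duplicated() on an empty frame returns an empty boolean Series that can be used as a mask; a small sketch restating the guarantee:

# Hedged sketch of the guarantee behind GH 25184
df = DataFrame(columns=['a', 'b'])
mask = df.duplicated('a')
assert mask.empty and mask.dtype == bool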
def deal_string02():
    import json
    # Use a raw string so the backslashes in the Windows path are not treated as escapes
    db = json.load(open(r'D:\study\书籍\python\pydata-book-master\pydata-book-master\ch07\foods-2011-10-03.json'))
    print(len(db))
    print(db[0])
    print(db[0].keys())
    print(db[0]['nutrients'][0])
    nutrients = DataFrame(db[0]['nutrients'])
    print(nutrients[:7])
    info_keys = ['description', 'group', 'id', 'manufacturer']
    info = DataFrame(db, columns=info_keys)
    print(info[:5])
    print(pd.value_counts(info.group)[:10])

    nutrients = []
    for rec in db:
        fnuts = DataFrame(rec['nutrients'])
        fnuts['id'] = rec['id']
        nutrients.append(fnuts)
    nutrients = pd.concat(nutrients, ignore_index=True)
    print(nutrients)
    print(nutrients.duplicated().sum())
    nutrients = nutrients.drop_duplicates()
    col_mapping = {'description': 'food', 'group': 'fgroup'}
    info = info.rename(columns=col_mapping, copy=False)
    print(info)
    col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}
    nutrients = nutrients.rename(columns=col_mapping, copy=False)
    print(nutrients)
    ndata = pd.merge(nutrients, info, on='id', how='outer')
    print(ndata)
    print(ndata.loc[3000])
    result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
    # print(result)
    result['Zinc, Zn'].sort_values().plot(kind='barh')
    by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])
    get_maximum = lambda x: x.xs(x.value.idxmax())
    get_minimum = lambda x: x.xs(x.value.idxmin())  # the original had a typo: idmin()
    max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]
    max_foods.food = max_foods.food.str[:50]
    print(max_foods.loc['Amino Acids']['food'])
Example #11
def test_duplicated_do_not_fail_on_wide_dataframes():
    # gh-21524
    # Given a wide dataframe with a lot of columns
    # with different (important!) values
    data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
            for i in range(100)}
    df = DataFrame(data).T
    result = df.duplicated()

    # Then duplicated() produces a bool Series and doesn't fail during
    # calculation. The actual values don't matter here, though usually
    # they are all False in this case
    assert isinstance(result, Series)
    assert result.dtype == np.bool_
Example #12
def test_duplicated_subset(subset, keep):
    df = DataFrame({'A': [0, 1, 1, 2, 0],
                    'B': ['a', 'b', 'b', 'c', 'a'],
                    'C': [np.nan, 3, 3, None, np.nan]})

    if subset is None:
        subset = list(df.columns)
    elif isinstance(subset, str):
        # need to have a DataFrame, not a Series
        # -> select columns with singleton list, not string
        subset = [subset]

    expected = df[subset].duplicated(keep=keep)
    result = df.duplicated(keep=keep, subset=subset)
    tm.assert_series_equal(result, expected)
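
The subset and keep arguments arrive via pytest parametrization defined elsewhere in the suite; a plausible sketch of that plumbing (the concrete parameter values are an assumption, not the original fixtures):

# Hedged sketch of the parametrization driving the test above
import pytest

@pytest.mark.parametrize('subset', [None, 'A', ['A', 'B']])
@pytest.mark.parametrize('keep', ['first', 'last', False])
def test_duplicated_subset_sketch(subset, keep):
    ...  # body as in test_duplicated_subset above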
def process_duplicated_entries(dfm_stk_info: DataFrame, stockid):
    # Rows sharing the same 股权登记日 (record date) are merged into the first
    # occurrence: amounts are combined with add_considering_None, text fields
    # are concatenated with '|', and a parse-error flag ('E') clears the numeric fields.
    dfm_duplicated = dfm_stk_info[dfm_stk_info.duplicated(['股权登记日'])]
    dfm_stk_info.drop_duplicates('股权登记日', inplace=True)
    for index, row in dfm_duplicated.iterrows():
        dfm_stk_info.loc[index, '分红年度'] = add_considering_None(dfm_stk_info.loc[index]['分红年度'], row['分红年度'])
        dfm_stk_info.loc[index, '分红方案'] = dfm_stk_info.loc[index]['分红方案'] + '|' + row['分红方案']
        if dfm_stk_info.loc[index]['方案文本解析错误标识位'] != 'E':
            if row['方案文本解析错误标识位'] == 'E':
                dfm_stk_info.loc[index, '方案文本解析错误标识位'] = 'E'
                dfm_stk_info.loc[index, '派息(税前)(元)/10股'] = None
                dfm_stk_info.loc[index, '转增(股)/10股'] = None
                dfm_stk_info.loc[index, '送股(股)/10股'] = None
            else:
                dfm_stk_info.loc[index, '派息(税前)(元)/10股'] = add_considering_None(dfm_stk_info.loc[index]['派息(税前)(元)/10股'], row['派息(税前)(元)/10股'])
                dfm_stk_info.loc[index, '转增(股)/10股'] = add_considering_None(dfm_stk_info.loc[index]['转增(股)/10股'], row['转增(股)/10股'])
                dfm_stk_info.loc[index, '送股(股)/10股'] = add_considering_None(dfm_stk_info.loc[index]['送股(股)/10股'], row['送股(股)/10股'])
        logprint('Stock %s: record for 股权登记日 %s merged into the master record. %s'
                 % (stockid, row['股权登记日'], tuple(row)))
from pandas.io.parsers import TextParser
from numpy import NaN as NA
from lxml.html import parse
from urllib.request import urlopen
from lxml import objectify
from io import StringIO

###############################################################

data = DataFrame({'k1':['one'] * 3 + ['two'] * 4,
                  'k2':[1,1,2,3,3,4,4]})

print(data)
print('\n')

print(data.duplicated())
print('\n')

print(data.drop_duplicates())
print('\n')

data['v1'] = range(7)
print(data.drop_duplicates(['k1']))
print('\n')

print(data.drop_duplicates(['k1','k2'], keep='last'))  # take_last=True in pandas < 0.17
print('\n')

data = DataFrame({'food':['bacon','pulled pork','bacon','Pastrami','corned beef','Bacon','pastrami','honey ham','nova lox'],
                  'ounces':[4,3,12,6,7.5,8,3,5,6]})
print(data)
Example #15
def test_duplicated_nan_none(keep, expected):
    df = DataFrame({'C': [np.nan, 3, 3, None, np.nan]}, dtype=object)

    result = df.duplicated(keep=keep)
    tm.assert_series_equal(result, expected)
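
The keep/expected pairs also come from parametrization; since object-dtype duplicated() treats np.nan and None as the same missing value, the column [nan, 3, 3, None, nan] gives a plausible sketch of the expected Series (values reasoned out here, not copied from the suite):

# Hedged sketch of the parametrization for test_duplicated_nan_none
import pytest
import numpy as np
from pandas import Series

@pytest.mark.parametrize('keep, expected', [
    ('first', Series([False, False, True, True, True])),
    ('last', Series([True, True, True, False, False])),
    (False, Series([True, True, True, True, True])),
])
def test_duplicated_nan_none_sketch(keep, expected):
    ...  # body as in test_duplicated_nan_none above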
Example #16
b[-1] = np.nan
# NumPy's vectorized if-else: where a is null, take the value from b instead
np.where(pd.isnull(a), b, a)
# The pandas analogue: nulls in b are filled from a
b[:-2].combine_first(a[2:])
# Usage with a DataFrame
df1 = DataFrame({'a': [1., np.nan, 5., np.nan],
                 'b': [np.nan, 2., np.nan, 6.],
                 'c': range(2, 18, 4)})
df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.],
                 'b': [np.nan, 3., 4, 6., 8.]})
df1.combine_first(df2)

## Removing duplicate rows
data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
data.duplicated()
# Drop duplicate rows, keeping the first occurrence by default
data.drop_duplicates()
# Drop duplicates judged on a single column
data['v1'] = range(7)
data.drop_duplicates(['k1'])
# Keep the last occurrence instead
data.drop_duplicates(['k1', 'k2'], keep='last')  # take_last=True in pandas < 0.17

## Transforming data with a function or mapping
data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                           'corned beef', 'Bacon', 'pastrami', 'honey ham',
                           'nova lox'],
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
def create_fip(temporary_store = None, year = None):
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """
    # fip: fichier d'imposition des personnes (the personal tax file)
    assert temporary_store is not None
    assert year is not None
    # Some individuals are declared as 'personne à charge' (pac) on 'tax forms'
    # but are not present in the erf or eec tables.
    # We add them to ensure consistency between concepts.

    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)

    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    log.info(u"Démarrage de 03_fip")

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table = year_specific_by_generic["foyer"], variables = erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace = True)

    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u"    1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))

    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    assert int(nb_pac_max) == nb_pac_max, "nb_pac_max = {} which is not an integer".format(nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [
            (i, 'declaration'),
            (i, 'type_pac'),
            (i, 'naia')
            ]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(
        multi_index_columns,
        names = ['pac_number', 'variable']
        )
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns = columns)

    for i in range(1, nb_pac_max + 1):  # TODO: using values to deal with mismatching indexes
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1: 5 * i].values

    fip = fip.stack("pac_number")
    fip.reset_index(inplace = True)
    fip.drop(['level_0'], axis = 1, inplace = True)

    log.info(u"    1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy()
    fip = fip.sort_values(by = ['declaration', 'naia', 'type_pac'])
    fip.set_index(["declaration", "pac_number"], inplace = True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis = 1, inplace = True)
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), \
        "Some PAC types are not known codes"

    # control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    log.info(u"    1.3 : on enlève les individus F pour lesquels il existe un individu G")
    type_FG = fip[fip.type_pac.isin(['F', 'G'])].copy()  # Work only on F & G

    type_FG['same_pair'] = type_FG.duplicated(subset = ['declaration', 'naia'], keep = 'last')
    type_FG['is_twin'] = type_FG.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    # Note: keep those whose declar/naia pairs differ, plus the twins,
    #       then drop the others (both F and G)
    fip['to_keep'] = np.nan
    fip.update(type_FG)
    log.info(u"    1.4 : on enlève les H pour lesquels il y a un I")
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset = ['declaration', 'naia'], keep = 'last')
    type_HI['is_twin'] = type_HI.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values

    fip.update(type_HI)
    fip['to_keep'] = fip['to_keep'].fillna(True)
    log.info(u"{} F, G, H or I non redundant pac kept over {} potential candidates".format(
        fip['to_keep'].sum(), len(fip))
        )
    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI
    #
    # control(indivifip, debug=True)

    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(), "The naia variable has missing values"

    # For safety enforce pac.naia and indivifip.naia dtypes
    pac['naia'] = pac.naia.astype('int32')
    indivifip['naia'] = indivifip.naia.astype('int32')
    pac['key1'] = list(zip(pac.naia, pac['declar1'].str[:29]))
    pac['key2'] = list(zip(pac.naia, pac['declar2'].str[:29]))
    indivifip['key'] = list(zip(indivifip.naia.values, indivifip['declaration'].str[:29].values))
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(pac.naia.dtype, indivifip.naia.dtype)

    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()

    log.info(u"    2.1 new fip created")
    # We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    log.info(u"{} pac dans les 1ères déclarations".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    log.info(u"{} pac dans les 2èms déclarations".format(len(pac_ind2)))

    log.info("{} duplicated pac_ind1".format(pac_ind1.duplicated().sum()))
    log.info("{} duplicated pac_ind2".format(pac_ind2.duplicated().sum()))

    del pac_ind1['key1'], pac_ind2['key2']

    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
            log.info(u"Warning : no link between pac and noindiv for both pacInd1&2")
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    assert len(pac_ind1) + len(pac_ind2) == len(pacInd)
    log.info("{} null pac_ind2.type_pac".format(pac_ind2.type_pac.isnull().sum()))
    log.info("pacInd.type_pac.value_counts()) \n {}".format(pacInd.type_pac.value_counts(dropna = False)))

    log.info(u"    2.2 : pacInd created")
    log.info(u"doublons noindiv, type_pac {}".format(pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    # pacIndiv.reset_index(inplace=True)
    log.info("{}".format(pacIndiv.columns))

    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv

    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()

    # We keep the fip in the menage of their parents because it is used in to
    # build the famille. We should build an individual ident (ménage) for the fip that are
    # older than 18 since they are not in their parents' menage according to the eec
    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype('int32')  # To be used later to set idfoy

    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi.persfip == "vous")]
    individec1 = individec1[["declar1", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec1 = individec1.rename(columns = {'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on = 'declaration')
    log.info(u"    2.3 : fip1 created")

    individec2 = indivi.loc[
        (indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous"),
        ["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]
        ].copy()
    individec2.rename(columns = {'declar2': 'declaration'}, inplace = True)
    fip2 = fip.merge(individec2)
    log.info(u"    2.4 : fip2 created")

    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

    fip = concat([fip1, fip2])

    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float')  # BUG: the DF otherwise has no year column
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration'].copy()
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration'].copy()
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5

    # TODO: actrec problem for fip children between 16 and 20: we do not know whether they are students or employed
    # TODO: problem with the birth months of FIP children: check whether those values can be recovered. Alexis: clearly not

    # Reassigning noi for fip children if they are more than one per foyer fiscal
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    fip_tmp = fip[['noi', 'ident']]

    while any(fip.duplicated(subset = ['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        log.info("{}".format(len(tmp)))
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0

    log.info("Number of duplicated fip: {}".format(fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    del fip, fip1, individec1, indivifip, indivi, pac
    log.info(u"fip sauvegardé")
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

# DUPLICATED VALUES -------------------------

## create new dataframe
zip3 = list(zip(['red', 'green', 'blue', 'orange']*3, [5, 10, 20, 40]*3,
                [':(', ':D', ':D']*4))

df3 = DataFrame(zip3, columns = ['A', 'B', 'C'])

## pandas method `duplicated`
df3.duplicated() # searches from top to bottom by default
df3.duplicated(keep = 'last') # searches bottom to top (take_last=True in pandas < 0.17)

## subset duplicated values
df3.duplicated(subset = ['A', 'B'])
df3.duplicated(['A', 'B'])

## HOW to get all values that have duplicates (purging)
t_b = df3.duplicated()
b_t = df3.duplicated(keep = 'last')
unique = ~(t_b | b_t) # complement where either is true
unique = ~t_b & ~b_t # same thing, by De Morgan's law
unique

df3[unique]

# DROPPING DUPLICATES --------------------------------------------
df3.drop_duplicates()
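
Since pandas 0.17 the two-pass mask above has a single-call equivalent; a short sketch of the shortcut (an addition, not in the original snippet):

# keep=False flags every member of a duplicate group in one pass
unique = ~df3.duplicated(keep=False)
df3[unique]                      # same rows as the t_b/b_t construction above
df3.drop_duplicates(keep=False)  # equivalent: drop all rows that have any duplicate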
Example #19
f3 = pd.read_csv('rcs/macrodata.csv')
periods = pd.PeriodIndex(year=f3.year, quarter=f3.quarter, name='date')
f3 = DataFrame(f3.to_records(),
                 columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'),
                 index=periods.to_timestamp('D', 'end'))
ldata = f3.stack().reset_index().rename(columns={0: 'value'})
wdata = ldata.pivot(index='date', columns='item', values='value')
# print(ldata)
# print(wdata)

data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
# Removing duplicate rows
# data.duplicated() returns a boolean Series flagging whether each row repeats an earlier one
s1 = data.duplicated()
f4 = data.loc[~s1]
# print(f4)
# drop_duplicates directly returns a DataFrame with the duplicate rows removed
f5 = data.drop_duplicates()
# print(f5)
# Filter on a chosen column, keeping the last occurrence
f6 = data.drop_duplicates(['k1'], keep='last')
# print(f6)


# Add a corresponding column to the DataFrame; map can translate the values of the original Series
data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                           'corned beef', 'Bacon', 'pastrami', 'honey ham',
                           'nova lox'],
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
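
The comment above gestures at Series.map; a minimal sketch of the intended lookup (the meat_to_animal mapping is illustrative, and lowercasing first makes 'Bacon' and 'bacon' match):

# Hedged sketch: mapping each food to its source animal via Series.map
meat_to_animal = {'bacon': 'pig', 'pulled pork': 'pig', 'pastrami': 'cow',
                  'corned beef': 'cow', 'honey ham': 'pig', 'nova lox': 'salmon'}
data['animal'] = data['food'].str.lower().map(meat_to_animal)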
Example #20
def test_drop_duplicates():
    df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar',
                            'foo', 'bar', 'bar', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': [1, 1, 2, 2, 2, 2, 1, 2],
                    'D': list(range(8))})  # lrange came from pandas.compat

    # single column
    result = df.drop_duplicates('AAA')
    expected = df[:2]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('AAA', keep='last')
    expected = df.loc[[6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates('AAA', keep=False)
    expected = df.loc[[]]
    tm.assert_frame_equal(result, expected)
    assert len(result) == 0

    # multi column
    expected = df.loc[[0, 1, 2, 3]]
    result = df.drop_duplicates(np.array(['AAA', 'B']))
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates(['AAA', 'B'])
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(('AAA', 'B'), keep='last')
    expected = df.loc[[0, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(('AAA', 'B'), keep=False)
    expected = df.loc[[0]]
    tm.assert_frame_equal(result, expected)

    # consider everything
    df2 = df.loc[:, ['AAA', 'B', 'C']]

    result = df2.drop_duplicates()
    # in this case only
    expected = df2.drop_duplicates(['AAA', 'B'])
    tm.assert_frame_equal(result, expected)

    result = df2.drop_duplicates(keep='last')
    expected = df2.drop_duplicates(['AAA', 'B'], keep='last')
    tm.assert_frame_equal(result, expected)

    result = df2.drop_duplicates(keep=False)
    expected = df2.drop_duplicates(['AAA', 'B'], keep=False)
    tm.assert_frame_equal(result, expected)

    # integers
    result = df.drop_duplicates('C')
    expected = df.iloc[[0, 2]]
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates('C', keep='last')
    expected = df.iloc[[-2, -1]]
    tm.assert_frame_equal(result, expected)

    df['E'] = df['C'].astype('int8')
    result = df.drop_duplicates('E')
    expected = df.iloc[[0, 2]]
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates('E', keep='last')
    expected = df.iloc[[-2, -1]]
    tm.assert_frame_equal(result, expected)

    # GH 11376
    df = DataFrame({'x': [7, 6, 3, 3, 4, 8, 0],
                    'y': [0, 6, 5, 5, 9, 1, 2]})
    expected = df.loc[df.index != 3]
    tm.assert_frame_equal(df.drop_duplicates(), expected)

    df = DataFrame([[1, 0], [0, 2]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    df = DataFrame([[-2, 0], [0, -4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    x = np.iinfo(np.int64).max / 3 * 2
    df = DataFrame([[-x, x], [0, x + 4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    df = DataFrame([[-x, x], [x, x + 4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    # GH 11864
    df = DataFrame([i] * 9 for i in range(16))
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent
    df = pd.concat([df, DataFrame([[1] + [0] * 8])], ignore_index=True)

    for keep in ['first', 'last', False]:
        assert df.duplicated(keep=keep).sum() == 0
Example #21
def create_fip(year = 2006): # message('03_fip')
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """

    df = DataCollection(year=year)

    print 'Starting 03_fip'
# # anaisenf: the PAC's year of birth
# erfFoyVar <- c('anaisenf','declar')
# foyer <- LoadIn(erfFoyFil)
# foyer <- LoadIn(erfFoyFil,erfFoyVar)

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = df.get_values(table="foyer", variables=erfFoyVar)
    print_id(foyer)
#    control(foyer, verbose=True, verbose_length=10, debug=True)


# #***********************************************************************************************************
# # print "Step 1 : on recupere les personnes à charge des foyers"
# #**********************************************************************************************************
# # On traite les cas de declarations multiples pour ne pas créer de doublon de pac
#
#
# # On récupère toutes les pac des foyers
# L <- max(nchar(foyer$anaisenf))/5 # nombre de pac maximal
# fip <-data.frame(declar = foyer$declar)
# for (i in c(1:L)){
#   eval(parse(text = paste('fip$typ.',as.character(i),'<- substr(foyer$anaisenf,5*(i-1)+1,5*(i-1)+1)',sep = '')))
#   eval(parse(text = paste('fip$naia.',as.character(i),'<- as.numeric(substr(foyer$anaisenf,5*(i-1)+2,5*(i-1)+5))',sep = '')))
# }
# fip <- fip[!is.na(fip$typ.1),]
# fip <- reshape(fip,direction ='long', varying=2:17, sep=".")
# fip <- fip[!is.na(fip$naia),]
# fip <- fip[order(fip$declar,-rank(fip$typ),fip$naia),c('declar','naia','typ')]
# fip$N <- row(fip)[,1]
# str(fip$N)

    print "Etape 1 : on recupere les personnes à charge des foyers"
    print "    1.1 : Création des codes des enfants"
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    nb_pac_max = len(max(foyer['anaisenf'], key=len))/5
    print "il ya a au maximum %s pac par foyer" %nb_pac_max

# Separating the string coding the pac of each "déclaration".
# Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [(i, 'declaration'), (i, 'type_pac'), (i, 'naia')]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(multi_index_columns, names=['pac_number', 'variable'])
    fip = DataFrame(randn(len(foyer), 3*nb_pac_max), columns=columns)
    fip.fillna(NaN, inplace=True) # redundant given the previous line, to remove
    for i in range(1,nb_pac_max+1):
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i,'type_pac')] = foyer['anaisenf'].str[5*(i-1)]
        fip[(i,'naia')] = foyer['anaisenf'].str[5*(i-1)+1:5*(i)]

    fip = fip.stack("pac_number")
    fip.reset_index(inplace=True)
    del fip["level_0"]

#     print fip.describe()
#     print fip.head().to_string()
    print "    1.2 : elimination des foyers fiscaux sans pac"
    #Clearing missing values and changing data format
    fip = fip[(fip['type_pac'].notnull()) & (fip['naia'] != 'an')  & (fip['naia'] != '')]
    fip = fip.sort(columns=['declaration','naia','type_pac'])
    # TODO: check if useful
    fip.set_index(["declaration","pac_number"], inplace=True)
    fip = fip.reset_index()

    del fip['pac_number']
#    control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    print "    1.3 : on enlève les individus F pour lesquels il existe un individu G"
    tyFG = fip[fip.type_pac.isin(['F', 'G'])] #Filtre pour ne travailler que sur F & G

    tyFG['same_pair'] = tyFG.duplicated(cols=['declaration', 'naia'], take_last=True)
    tyFG['is_twin'] = tyFG.duplicated(cols=['declaration', 'naia', 'type_pac'])
    tyFG['to_keep'] = (~(tyFG['same_pair']) | (tyFG['is_twin']))
    #Note: keep those whose declar/naia pairs differ, plus the twins,
    #then drop the others (both F and G)
    print len(tyFG), '/', len(tyFG[tyFG['to_keep']])
    print 'fip length', len(fip)

    fip['to_keep'] = NaN
    fip.update(tyFG)
    print 'F & G children processed'

    print "    1.4 : on enlève les H pour lesquels il y a un I"
    tyHI = fip[fip.type_pac.isin(['H', 'I'])]
    tyHI['same_pair'] = tyHI.duplicated(cols=['declaration', 'naia'], take_last=True)
    tyHI['is_twin'] = tyHI.duplicated(cols=['declaration', 'naia', 'type_pac'])
    tyHI['to_keep'] = ~(tyHI['same_pair']) | (tyHI['is_twin'])

    fip.update(tyHI)
    fip['to_keep'] = fip['to_keep'].fillna(True)
    print 'nb lines to keep/nb initial lines'
    print len(fip[fip['to_keep']]), '/', len(fip)

    indivifip = fip[fip['to_keep']]; del indivifip['to_keep'], fip, tyFG, tyHI

#    control(indivifip, debug=True)


# #************************************************************************************************************/
    print ''
    print 'Step 2 : matching indivifip with eec file'
# #************************************************************************************************************/

    indivi = load_temp(name="indivim", year=year) #TODO: USE THIS INSTEAD OF PREVIOUS LINES


# pac <- indivi[!is.na(indivi$persfip) & indivi$persfip == 'pac',]
# pac$key1 <- paste(pac$naia,pac$declar1)
# pac$key2 <- paste(pac$naia,pac$declar2)
# indivifip$key <- paste(indivifip$naia,indivifip$declar)

    # Keep only the individuals declared as 'pac' (persfip not NaN and equal to 'pac')
    pac = indivi[indivi['persfip'].notnull() & (indivi['persfip'] == 'pac')]

    pac['naia'] = pac['naia'].astype('int32') # TODO: was float in pac fix upstream
    indivifip['naia'] = indivifip['naia'].astype('int32')
    pac['key1'] = zip(pac['naia'], pac['declar1'].str[:29])
    pac['key2'] = zip(pac['naia'], pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip['naia'], indivifip['declaration'].str[:29])
    assert pac.naia.dtype == indivifip.naia.dtype, 'types %s , %s are different' %(pac.naia.dtype, indivifip.naia.dtype)

# fip <- indivifip[!indivifip$key %in% pac$key1,]
# fip <- fip[!fip$key %in% pac$key2,]
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))]
    fip = fip[~(fip.key.isin(pac.key2.values))]


    print "    2.1 new fip created"
# We build a dataframe to link the pac to their type and noindiv
# table(duplicated(pac[,c("noindiv")]))
    countInd = pac.noindiv.value_counts()

# pacInd1 <- merge(pac[,c("noindiv","key1","naia")],
#                 indivifip[,c("key","typ")], by.x="key1", by.y="key")
# pacInd2 <- merge(pac[,c("noindiv","key2","naia")],
#                 indivifip[,c("key","typ")], by.x="key2", by.y="key")

    tmp_pac1 = pac[['noindiv', 'key1']]
    tmp_pac2 = pac[['noindiv', 'key2']]
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']]

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    print 'pacInd1 length', len(pac_ind1)
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    print 'pacInd2 length', len(pac_ind2)
    print "pacInd1&2 created"

# table(duplicated(pacInd1))
# table(duplicated(pacInd2))

    print pac_ind1.duplicated().sum()
    print pac_ind2.duplicated().sum()

# pacInd1 <-rename(pacInd1,c("key1" = "key"))
# pacInd2 <-rename(pacInd2,c("key2" = "key"))
# pacInd <- rbind(pacInd1,pacInd2)
# rm(pacInd1,pacInd2)

#     pacInd1.rename(columns={'key1':'key'}, inplace=True)
#     pacInd2.rename(columns={'key2':'key'}, inplace=True)
    del pac_ind1['key1'], pac_ind2['key2']
    print pac_ind1.columns
    print pac_ind2.columns

    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
            print "Warning : no link between pac and noindiv for both pacInd1&2"
        else:
            print "Warning : pacInd1 is an empty data frame"
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        print "Warning : pacInd2 is an empty data frame"
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    print len(pac_ind1), len(pac_ind2), len(pacInd)
    print pac_ind2.type_pac.isnull().sum()
    print pacInd.type_pac.value_counts()

    print '    2.2 : pacInd created'

# table(duplicated(pacInd[,c("noindiv","typ")]))
# table(duplicated(pacInd$noindiv))

    print 'duplicates on noindiv, type_pac', pacInd.duplicated(['noindiv', 'type_pac']).sum()
    print 'duplicates on noindiv alone', pacInd.duplicated('noindiv').sum()
    print 'number of NaN', pacInd.type_pac.isnull().sum()

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))]
#     pacIndiv.reset_index(inplace=True)
    print pacIndiv.columns

    save_temp(pacIndiv, name="pacIndiv", year=year)

    print pacIndiv.type_pac.value_counts()
    gc.collect()

# # We keep the fip in the menage of their parents because it is used in to
# # build the famille. We should build an individual ident for the fip that are
# # older than 18 since they are not in their parents' menage according to the eec

# individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous"))
# individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")]
# individec1 <- upData(individec1,rename=c(declar1="declar"))
# fip1       <- merge(fip,individec1)
    # indivi$noidec <- as.numeric(substr(indivi$declar1,1,2))
    indivi['noidec'] = indivi['declar1'].str[0:2].astype('float16') # To be used later to set idfoy
    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi['persfip']=="vous")]
    individec1 = individec1.loc[:, ["declar1","noidec","ident","rga","ztsai","ztsao"]]
    individec1 = individec1.rename(columns={'declar1':'declaration'})
    fip1 = fip.merge(individec1, on='declaration')
    print '    2.3 : fip1 created'

# # TODO: On ne s'occupe pas des declar2 pour l'instant
# # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous"))
# # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")]
# # individec2 <- upData(individec2,rename=c(declar2="declar"))
# # fip2 <-merge(fip,individec2)

    individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip']=="vous")]
    individec2 = individec2.loc[:, ["declar2","noidec","ident","rga","ztsai","ztsao"]]
    individec2.rename(columns={'declar2':'declaration'}, inplace=True)
    print individec2.head()
    fip2 = fip.merge(individec2)
    print '    2.4 : fip2 created'


    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

# #fip <- rbind(fip1,fip2)
# fip <- fip1
# table(fip$typ)

    fip = concat([fip1, fip2])
#     fip = fip1 #TODO: why this line?
    fip.type_pac.value_counts()

    print fip.columns
    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float') # BUG; pas de colonne année dans la DF
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration']
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration'] #TODO declar ?
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip['naia'].astype('float')
    fip['lpr'] = where(fip['agepf'] <= 20, 3, 4) # TODO: not very clean, according to Mahdi/Clément
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = where(fip['agepf']<=15, 9, 5)

## TODO: actrec problem for fip children between 16 and 20: we do not know whether they are students or employed
## TODO: problem with the birth months of FIP children: check whether those values can be recovered. Alexis: clearly not

# Reassigning noi for fip children if they are more than one per foyer fiscal
# while ( any(duplicated( fip[,c("noi","ident")]) ) ) {
#   dup <- duplicated( fip[, c("noi","ident")])
#   tmp <- fip[dup,"noi"]
#   fip[dup, "noi"] <- (tmp-1)
# }
    #TODO: is the dup vector correct?
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    fip_tmp = fip[['noi','ident']]

    while any(fip.duplicated(cols=['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        print len(tmp)
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100*fip['ident'] + fip['noidec']
    fip['noindiv'] = 100*fip['ident'] + fip['noi']
    fip['type_pac'] = 0 ; fip['key'] = 0

    print fip.duplicated('noindiv').value_counts()
    save_temp(fip, name="fipDat", year=year)
    del fip, fip1, individec1, indivifip, indivi, pac
    print 'fip saved'
Example #22
pivoted = ldata.pivot(index='date', columns='item')
pivoted[:5]

pivoted['value'][:5]

unstacked = ldata.set_index(['date', 'item']).unstack('item')
unstacked[:7]


### Removing duplicate rows
data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
data

data.duplicated()

data.drop_duplicates()

data['v1'] = range(7)
data.drop_duplicates(['k1'])

data.drop_duplicates(['k1', 'k2'], keep='last')  # take_last=True in pandas < 0.17


### Transforming data with a function or mapping
#1
data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                           'corned beef', 'Bacon', 'pastrami', 'honey ham',
                           'nova lox'],
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
Example #23
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 17 09:44:35 2017

@author: HanKin
"""

from pandas import Series, DataFrame  
  
data = DataFrame({'k': [1, 1, 2, 2],'y':[2,2,4,1]})  
  
print(data)  
  
IsDuplicated = data.duplicated()  
  
print(IsDuplicated)  
print(type(IsDuplicated)) 
  
data = data.drop_duplicates()  
print(data)  
Example #24
def test_duplicated_keep(keep, expected):
    df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']})

    result = df.duplicated(keep=keep)
    tm.assert_series_equal(result, expected)
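
Here too the keep/expected pairs are parametrized elsewhere; for the rows (0,'a'), (1,'b'), (1,'b'), (2,'c'), (0,'a') a plausible sketch (values reasoned out here, not copied from the suite):

# Hedged sketch of the parametrization for test_duplicated_keep
import pytest
from pandas import Series

@pytest.mark.parametrize('keep, expected', [
    ('first', Series([False, False, True, False, True])),
    ('last', Series([True, True, False, False, False])),
    (False, Series([True, True, True, False, True])),
])
def test_duplicated_keep_sketch(keep, expected):
    ...  # body as in test_duplicated_keep above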
Example #25
nutrients = DataFrame(db[0]['nutrients'])
print(nutrients[:7])

info_keys = ['description', 'group', 'id', 'manufacturer']
info = DataFrame(db, columns=info_keys)
print(pd.value_counts(info.group)[:10])

nutrients = []
for rec in db:
    fnuts = DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)
nutrients = pd.concat(nutrients, ignore_index=True)
print(nutrients[:10])
print(nutrients.duplicated().sum())

nutrients = nutrients.drop_duplicates()

col_mapping = {'description': 'food',
               'group': 'fgroup'}
info = info.rename(columns=col_mapping, copy=False)
print(info[:10])

col_mapping = {
    'description': 'nutrient',
    'group': 'nutgroup'
}
nutrients = nutrients.rename(columns=col_mapping, copy=False)
print(nutrients[:10])
# -*- coding: utf-8 -*-
###########################################################
#                Handling duplicate data
###########################################################
#%%
import pandas as pd
from pandas import Series, DataFrame
d1 = DataFrame({'a': ['a', 'b'] * 6, 'b': [1, 2, 3, 4, 5, 6] * 2, 'c': [1, 3, 5] * 4})

#%% List the duplicated rows
d1.duplicated()
#%% Select the non-duplicated rows
d1[d1.duplicated() == False]
#%% Duplicates judged on a single column
d1.duplicated('a')
#%% Or on two or more columns
d1.duplicated(['a', 'c'])
#%% Keep the last element instead
d1.duplicated('a', keep='last')  # take_last=True in pandas < 0.17

#%% Dropping duplicates
# drop_duplicates() is equivalent to d1[d1.duplicated() == False]
d1.drop_duplicates()
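
#%% A quick check of the equivalence noted above, plus its keep='last' mirror
# (an added sketch, assuming pandas.testing is available)
from pandas.testing import assert_frame_equal
assert_frame_equal(d1.drop_duplicates(), d1[~d1.duplicated()])
assert_frame_equal(d1.drop_duplicates(keep='last'), d1[~d1.duplicated(keep='last')])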

Example #27
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df2 = DataFrame({'key': ['a', 'b', 'd'], 'data2': range(3)})
# dfMerged = pd.merge(df1, df2, on='key')
# print dfMerged
# dfMergedOuter = pd.merge(df1, df2, how='outer')
# print dfMergedOuter

df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df4 = DataFrame({'rkey': ['a', 'b', 'd'], 'data2': range(3)})
# dfMerged = pd.merge(df3, df4, left_on='lkey', right_on='rkey')
# print dfMerged

left = DataFrame({'key1':['foo', 'foo', 'bar'], 'key2':['one', 'foo', 'one'], 'lval':[1, 2, 3]})

right = DataFrame({'key1':['foo', 'foo', 'bar', 'bar'], 'key2':['one', 'foo', 'one', 'one'], 'rval':[4, 5, 6, 7]})

dfMergedOuter = pd.merge(left, right, how='outer')
# print dfMergedOuter

arr = np.arange(12).reshape((6,2))
# print arr
# print np.arange(12)
arrConcat = np.concatenate([arr, arr], axis = 1)
# print arrConcat

data = DataFrame({'k1': ['one']*3 + ['two']*4, 'k2': [1,1,2,3,3,4,4]})
# print data
dataDuplicate = data.duplicated()
print(dataDuplicate)
dropDuplicate = data.drop_duplicates()
print(dropDuplicate)
Example #28
match = regex_email.findall(email)


df = DataFrame(match)

# ---------------------------------------- pandas course notes

# df: table of students with email addresses
# DataFrame.rename renames the columns of a DF
df = df.rename(columns = {'old_name': 'new_name'})

# a lambda over the index can likewise relabel the rows
df.index = df.index.map(lambda x: 'Student ' + str(x))

# duplicates can be removed
df.duplicated() # check whether there are any duplicates
df.drop_duplicates() # and drop them

# -------------------------- processing the aliments.csv file
aliments = pd.read_csv('aliments.csv')

# build a foods x contained-traces indicator matrix
aliments['traces'].isnull()
aliments_with_traces = aliments['traces'].dropna()

traces_iter = (set(x.split(',')) for x in aliments_with_traces)
traces = set.union(*traces_iter)  # the sets must be unpacked into set.union

dummies = DataFrame(np.zeros((len(aliments_with_traces), len(traces))), columns = sorted(traces))
for i, tr in enumerate(aliments_with_traces):
    dummies.loc[i, tr.split(',')] = 1
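
pandas can build the same indicator matrix in one call through the vectorized string API; a sketch of the shortcut (an addition to the snippet above):

# Equivalent foods x traces matrix via Series.str.get_dummies
dummies2 = aliments_with_traces.str.get_dummies(sep=',')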
Example #29
info = DataFrame(db, columns=info_keys)
print(info[:5])
print(info)

print(pd.value_counts(info.group)[:10])

nutrients = []

for rec in db:
    fnuts = DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)

nutrients = pd.concat(nutrients, ignore_index=True)
print(nutrients)
print(nutrients.duplicated().sum())

nutrients = nutrients.drop_duplicates()

col_mapping = {'description': 'food',
               'group': 'fgroup'}

info = info.rename(columns=col_mapping, copy=False)
print(info)

col_mapping = {'description': 'nutrient',
               'group': 'nutgroup'}
nutrients = nutrients.rename(columns=col_mapping, copy=False)
print(nutrients)

ndata = pd.merge(nutrients, info, on='id', how='outer')
from pandas import Series, DataFrame
import pandas as pd
import numpy as np


# DUPLICATED VALUES -----------------------------------

## create a new data frame
zip3 = list(zip(['red', 'green', 'blue', 'orange']*4, [5, 10, 20, 40]*3, [':(', ':D', ':D']*4))
df3 = DataFrame(zip3, columns = ['A', 'B', 'C'])
df3

## returns a boolean vector of duplicated rows for the whole DataFrame or a subset, via the method `duplicated`
## IMPORTANT: pandas, by default, searches for duplicated values from top to bottom
## and will not mark a row as "duplicated" until it has already seen an identical row
df3.duplicated() # defaults to using all columns, searching top-to-bottom
df3.duplicated(keep = 'last') # keep='last' (take_last=True in pandas < 0.17) searches bottom-to-top

## SUBSET duplicates
# if we want the duplicated criteria to be of a subset, we can do that too
df3.duplicated(subset = ['A', 'B'])
df3.duplicated(['A', 'B']) # same as before


## HOW to get all values that have a duplicate
t_b = df3.duplicated()
b_t = df3.duplicated(keep = 'last')
unique = ~(t_b | b_t) # negate where either is true
unique
unique = ~t_b & ~b_t # same as above
unique