def get_link(url): link_exr = re.compile(r'<a.*?\s*href=\"(.*?)\".*?>(.*?)</a>') links = [] # open web content f = urllib2.urlopen(url) content = f.read() # versi find html tag : find all url and save to links # soup = BeautifulSoup(content, "lxml") # for a in soup.find_all('a', href=True): # if "detik.com" in a['href']: # if "http:" not in a['href']: # a['href'] = "http:" + a['href'] # print "Found the URL:", a['href'] # links.append(a['href']) # versi regex : find all url and save to links for link in link_exr.findall(content): if "detik.com" in link[0]: link_detik = link[0] if "http:" not in link_detik: link_detik = "http:" + link_detik links.append(link_detik) # save to DataFrame df = DataFrame(links, columns=['detik url']) df.drop_duplicates() print df.head(0) # create and save to sqlite database detik_db = create_engine("mysql://*****:*****@localhost/data_detik") df.to_sql('url_detik', detik_db, if_exists='replace')
def removeDuplicate(file): """Removes duplicate points based on X, Y coordinates Returns a numpy array""" df = DataFrame(np.vstack((file.x, file.y, file.z)).transpose(), columns=['X', 'Y', 'Z']) df.drop_duplicates(subset=['X','Y'], inplace=True) return df.values
def test_drop_duplicates_tuple(): df = DataFrame({('AA', 'AB'): ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'bar', 'foo'], 'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'], 'C': [1, 1, 2, 2, 2, 2, 1, 2], 'D': lrange(8)}) # single column result = df.drop_duplicates(('AA', 'AB')) expected = df[:2] tm.assert_frame_equal(result, expected) result = df.drop_duplicates(('AA', 'AB'), keep='last') expected = df.loc[[6, 7]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates(('AA', 'AB'), keep=False) expected = df.loc[[]] # empty df assert len(result) == 0 tm.assert_frame_equal(result, expected) # multi column expected = df.loc[[0, 1, 2, 3]] result = df.drop_duplicates((('AA', 'AB'), 'B')) tm.assert_frame_equal(result, expected)
def process_duplicated_entries(dfm_stk_strc:DataFrame,stockid): dfm_duplicated = dfm_stk_strc[dfm_stk_strc.duplicated(['变动日期'])] # print(dfm_duplicated) dfm_stk_strc.drop_duplicates('变动日期',inplace=True) for index, row in dfm_duplicated.iterrows(): # dfm_stk_strc.loc[index]['变动原因'] = dfm_stk_strc.loc[index]['变动原因'] +'|'+row['变动原因'] dfm_stk_strc.loc[index,'变动原因'] = dfm_stk_strc.loc[index]['变动原因'] + '|' + row['变动原因'] logprint('Stock %s 变动日期 %s 记录合并到主记录中. %s' %(stockid,row['变动日期'],tuple(row)))
def test_duplicated_with_misspelled_column_name(subset): # GH 19730 df = DataFrame({'A': [0, 0, 1], 'B': [0, 0, 1], 'C': [0, 0, 1]}) with pytest.raises(KeyError): df.duplicated(subset) with pytest.raises(KeyError): df.drop_duplicates(subset)
def _decode_solutions(self, solutions): decoded_solutions = DataFrame(columns=["targets", "fitness"]) index = 0 for solution in solutions: combinations = self._decoder(solution.candidate, flat=True, decompose=True) for targets in combinations: if len(targets) > 0: decoded_solutions.loc[index] = [tuple(targets), solution.fitness] index += 1 decoded_solutions.drop_duplicates(inplace=True, subset="targets") decoded_solutions.reset_index(inplace=True) return decoded_solutions
def test_drop_duplicates_with_duplicate_column_names(): # GH17836 df = DataFrame([ [1, 2, 5], [3, 4, 6], [3, 4, 7] ], columns=['a', 'a', 'b']) result0 = df.drop_duplicates() tm.assert_frame_equal(result0, df) result1 = df.drop_duplicates('a') expected1 = df[:2] tm.assert_frame_equal(result1, expected1)
def get_dominant_alleles( sample_ids, marker_ids = None ): t1 = aliased( Allele ) t2 = aliased( Allele ) s1 = aliased( AlleleSet ) marker_ids = [ int(x) for x in marker_ids ] # I honestly forgot how the mundane thing below works !!! # I was smarter when I was younger 8-( q = dbsession.query( t1.marker_id, s1.sample_id, t1.value, t1.size, t1.height ).\ join( s1 ).\ outerjoin( t2, and_( t1.marker_id == t2.marker_id, t1.alleleset_id == t2.alleleset_id, t1.height < t2.height) ).\ filter( t2.marker_id == None ).order_by( t1.marker_id, s1.sample_id ).\ filter( s1.sample_id.in_( sample_ids ) ).filter( t1.marker_id.in_( marker_ids )) df = DataFrame( [ (marker_id, sample_id, value, size, height) for marker_id, sample_id, value, size, height in q ] ) if len(df) == 0: return None df.columns = ( 'marker_id', 'sample_id', 'value', 'size', 'height' ) return df.drop_duplicates( ['marker_id', 'sample_id'] )
def recursive_add_consumers(consumer_id, seen = set([])): if consumer_id is None: return seen.add(consumer_id) consumer_key = sample[sample.Consumer == consumer_id] IP = df.drop_duplicates(df(consumer_key.IP)) n = np.array(np.arange(len(IP))) IP_Map = set([]) for i in n: value = sample[sample.IP.isin([IP.iloc[i,0]])] IP_Map.add(value) #print IP_Map print consumer_id print seen consumer_list = [] #list of unique consumers that are linked to this one [consumer_list.extend(y.Consumer.iloc[l].tolist()) for l in [range(len(y.Consumer)) for y in IP_Map]] #print consumer_list #print [x for x in set(consumer_list).difference([consumer_id])] #unique_consumer_list = [] #print [ x for x in set([y.Consumer.iloc[0] for y in IP_Map])] #tuples of ips and unique consumers attached to them print [(y.IP.iloc[0],set(y.Consumer.iloc[l].tolist())) for l in [range(len(y.Consumer)) for y in IP_Map]]
def getDateTimeSeries(self,instrument=None): if instrument is None: __dateTime = DataFrame() for element in self.__instrument: __dateTime = __dateTime.append(self.__feed[element].getPriceDataSeries().getDateTimes()) __dateTime = __dateTime.drop_duplicates([0]) return __dateTime.values #此时返回的为二维数组 return self.__feed[instrument].getPriceDataSeries().getDateTimes()
def process_duplicated_entries(dfm_stk_info:DataFrame,stockid): dfm_duplicated = dfm_stk_info[dfm_stk_info.duplicated(['股权登记日'])] # print(dfm_duplicated) dfm_stk_info.drop_duplicates('股权登记日',inplace=True) for index, row in dfm_duplicated.iterrows(): dfm_stk_info.loc[index,'分红年度'] = add_considering_None(dfm_stk_info.loc[index]['分红年度'],row['分红年度']) dfm_stk_info.loc[index,'分红方案'] = dfm_stk_info.loc[index]['分红方案'] + '|' + row['分红方案'] if dfm_stk_info.loc[index]['方案文本解析错误标识位'] !='E': if row['方案文本解析错误标识位'] == 'E': dfm_stk_info.loc[index, '方案文本解析错误标识位'] = 'E' dfm_stk_info.loc[index, '派息(税前)(元)/10股'] = None dfm_stk_info.loc[index, '转增(股)/10股'] = None dfm_stk_info.loc[index, '送股(股)/10股'] = None else: dfm_stk_info.loc[index,'派息(税前)(元)/10股'] = add_considering_None(dfm_stk_info.loc[index]['派息(税前)(元)/10股'],row['派息(税前)(元)/10股']) dfm_stk_info.loc[index,'转增(股)/10股'] = add_considering_None(dfm_stk_info.loc[index]['转增(股)/10股'] , row['转增(股)/10股']) dfm_stk_info.loc[index,'送股(股)/10股'] = add_considering_None(dfm_stk_info.loc[index]['送股(股)/10股'] , row['送股(股)/10股']) logprint('Stock %s 股权登记日 %s 记录合并到主记录中. %s' %(stockid,row['股权登记日'],tuple(row)))
def compute_ambiguity(self): results = self.cursor.execute('SELECT type, entity FROM entities') tmp = results.fetchall() freq_df = DataFrame(tmp, columns=['e_type', 'entity']) freq_df['ambiguity'] = 10 freq_df = freq_df.drop_duplicates() result_computed_location = self._compute_location_ambiguity(freq_df) result_computed_name = self._compute_name_ambiguity(result_computed_location) return result_computed_name
def IP_Weight_Calc(consumer_id): if consumer_id == None: return consumer_key = sample[sample.Consumer == consumer_id] IP = df.drop_duplicates(df(consumer_key.IP)) n = np.array(np.arange(len(IP))) IP_Weight_List = [] for i in n: value = sample[sample.IP.isin([IP.iloc[i,0]])] value2 = len(df.drop_duplicates(df(value.Consumer))) value3 = (1/(value2**2)) IP_Weight_List.append(value3) return sum(IP_Weight_List)
def _compute_consistency(self): results = self.cursor.execute('SELECT did, type, entity FROM entities') tmp = results.fetchall() df = DataFrame(tmp, columns=['did', 'e_type', 'entity']) df = df.drop_duplicates() tmp = df.groupby(['e_type', 'entity']).size().reset_index() tmp.rename(columns={0: 'consistency'}, inplace=True) return tmp
def getSubwayDate(filepath2): allFiles = DataFrame(os.listdir(filepath2)) allFiles.columns = ['FileNames'] allFiles = allFiles[allFiles.FileNames.str.slice(0, 6) == 'gtfs-2'] allFiles['datetime1'] = allFiles.FileNames.str.slice(5) allFiles.datetime1 = allFiles.datetime1.map(parser.parse) allFiles.datetime1 = pd.DatetimeIndex(allFiles.datetime1).tz_convert('America/New_York') times1 = [] for dt in allFiles.datetime1: tm = dt.time() times1.append(tm) allFiles['time1'] = Series(times1) allFiles = allFiles[allFiles.time1 >= time(6)] allFiles = allFiles[allFiles.time1 <= time(9, 30)] df_allTrains = DataFrame(np.zeros(0, dtype = [('current_status', 'O')])) df_trips = DataFrame(np.zeros(0, dtype = [('route_id', 'O')])) #old_df = getTrains(allFiles.FileNames.iloc[0], test_df) for fileName0 in allFiles.FileNames: fileName1 = os.path.join(filepath2,fileName0) try: df_allTrains, df_trips = getTrains(fileName1, df_allTrains, df_trips) except: print 'file ' + fileName0 + ' did not work' continue df_allTrains = df_allTrains.sort_values(['trip_id', 'timestamp', 'timestamp1']) unique_ids = ['start_date','route_id','trip_id', 'train_id', 'stop_id', 'stop_name', 'current_status', 'timestamp'] df_allTrains.drop_duplicates(unique_ids, keep = 'last', inplace = True) unique_ids.remove('timestamp') df_grouped = df_allTrains.groupby(unique_ids) df_grouped1 = df_grouped['timestamp'].agg({'minTS1': min, 'maxTS1':max}) df_grouped2 = df_grouped['timestamp1'].agg(max) df_grouped1['maxTS2'] = df_grouped2 df_allTrains = df_grouped1.reset_index() df_allTrains.sort_values(['trip_id', 'minTS1'], inplace = True) df_trips = df_trips.sort_values(['trip_id', 'stop_id', 'timestamp1']) shutil.rmtree(filepath2) return(df_allTrains, df_trips)
def slide_21(): import json db = json.load(open(FOODJSONPATH)) print len(db) print db[0].keys() print db[0]['nutrients'][0] nutrients = DataFrame(db[0]['nutrients']) print nutrients[:7] info_keys = ['description', 'group', 'id', 'manufacturer'] info = DataFrame(db, columns=info_keys) print info[:5] print pd.value_counts(info.group)[:10] print "今から全部のnutrientsを扱うよ" nutrients = [] for rec in db: fnuts = DataFrame(rec['nutrients']) fnuts['id'] = rec['id'] nutrients.append(fnuts) nutrients = pd.concat(nutrients, ignore_index=True) print "なんか重複多い" print nutrients.duplicated().sum() nutrients = nutrients.drop_duplicates() print "infoとnutrients両方にdescriptionとgroupがあるから変えよう" col_mapping = {'description': 'food', 'group': 'fgroup'} info = info.rename(columns=col_mapping, copy=False) col_mapping = {'description': 'nutrient', 'group': 'nutgroup'} nutrients = nutrients.rename(columns=col_mapping, copy=False) ndata = pd.merge(nutrients, info, on='id', how='outer') print ndata.ix[30000] result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5) result['Zinc, Zn'].order().plot(kind='barh') plt.show() by_nutrient = ndata.groupby(['nutgroup', 'nutrient']) get_maximum = lambda x: x.xs(x.value.idxmax()) get_minimum = lambda x: x.xs(x.value.idxmin()) max_foods = by_nutrient.apply(get_maximum)[['value', 'food']] max_foods.food = max_foods.food.str[:50] print max_foods.ix['Amino Acids']['food']
def bulk_download(year, bucket): bucket = expanduser(bucket) filepath = bucket + 'latvia' + str(year) + '.csv' data = DataFrame() for scheme in SCHEMES: fragment = Fragment(scheme, year, bucket) if not fragment.is_cached: fragment.download() fragment.load_from_csv() fragment.cleanup() data = concat([data, fragment.data], ignore_index=True) log.debug('Added %s rows to bulk dataframe', len(fragment.data)) data.drop_duplicates(inplace=True) data.to_csv(filepath, encoding='utf-8', mode='w+') log.info('Bulk download saved to %s (%s rows)', filepath, len(data))
def test_drop_duplicates_for_take_all(): df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar', 'foo', 'bar', 'qux', 'foo'], 'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'], 'C': [1, 1, 2, 2, 2, 2, 1, 2], 'D': lrange(8)}) # single column result = df.drop_duplicates('AAA') expected = df.iloc[[0, 1, 2, 6]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates('AAA', keep='last') expected = df.iloc[[2, 5, 6, 7]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates('AAA', keep=False) expected = df.iloc[[2, 6]] tm.assert_frame_equal(result, expected) # multiple columns result = df.drop_duplicates(['AAA', 'B']) expected = df.iloc[[0, 1, 2, 3, 4, 6]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates(['AAA', 'B'], keep='last') expected = df.iloc[[0, 1, 2, 5, 6, 7]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates(['AAA', 'B'], keep=False) expected = df.iloc[[0, 1, 2, 6]] tm.assert_frame_equal(result, expected)
def test_drop_duplicates_NA_for_take_all(): # none df = DataFrame({'A': [None, None, 'foo', 'bar', 'foo', 'baz', 'bar', 'qux'], 'C': [1.0, np.nan, np.nan, np.nan, 1., 2., 3, 1.]}) # single column result = df.drop_duplicates('A') expected = df.iloc[[0, 2, 3, 5, 7]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates('A', keep='last') expected = df.iloc[[1, 4, 5, 6, 7]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates('A', keep=False) expected = df.iloc[[5, 7]] tm.assert_frame_equal(result, expected) # nan # single column result = df.drop_duplicates('C') expected = df.iloc[[0, 1, 5, 6]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates('C', keep='last') expected = df.iloc[[3, 5, 6, 7]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates('C', keep=False) expected = df.iloc[[5, 6]] tm.assert_frame_equal(result, expected)
def get_components_yahoo(idx_sym): """ Returns DataFrame containing list of component information for index represented in idx_sym from yahoo. Includes component symbol (ticker), exchange, and name. Parameters ---------- idx_sym : str Stock index symbol Examples: '^DJI' (Dow Jones Industrial Average) '^NYA' (NYSE Composite) '^IXIC' (NASDAQ Composite) See: http://finance.yahoo.com/indices for other index symbols Returns ------- idx_df : DataFrame """ stats = 'snx' #URL of form: #http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv url = 'http://download.finance.yahoo.com/d/quotes.csv?s={0}&f={1}' \ '&e=.csv&h={2}' idx_mod = idx_sym.replace('^', '@%5E') urlStr = url.format(idx_mod, stats, 1) idx_df = DataFrame() mask = [True] comp_idx = 1 #LOOP across component index structure, #break when no new components are found while (True in mask): urlStr = url.format(idx_mod, stats, comp_idx) lines = (urllib.urlopen(urlStr).read().decode('utf-8').strip(). strip('"').split('"\r\n"')) lines = [line.strip().split('","') for line in lines] temp_df = DataFrame(lines, columns=['ticker', 'name', 'exchange']) temp_df = temp_df.drop_duplicates() temp_df = temp_df.set_index('ticker') mask = ~temp_df.index.isin(idx_df.index) comp_idx = comp_idx + 50 idx_df = idx_df.append(temp_df[mask]) return idx_df
def get_components_yahoo(idx_sym): """ Returns DataFrame containing list of component information for index represented in idx_sym from yahoo. Includes component symbol (ticker), exchange, and name. Parameters ---------- idx_sym : str Stock index symbol Examples: '^DJI' (Dow Jones Industrial Average) '^NYA' (NYSE Composite) '^IXIC' (NASDAQ Composite) See: http://finance.yahoo.com/indices for other index symbols Returns ------- idx_df : DataFrame """ stats = "snx" # URL of form: # http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv url = _YAHOO_COMPONENTS_URL + "s={0}&f={1}&e=.csv&h={2}" idx_mod = idx_sym.replace("^", "@%5E") url_str = url.format(idx_mod, stats, 1) idx_df = DataFrame() mask = [True] comp_idx = 1 # LOOP across component index structure, # break when no new components are found while True in mask: url_str = url.format(idx_mod, stats, comp_idx) with urlopen(url_str) as resp: raw = resp.read() lines = raw.decode("utf-8").strip().strip('"').split('"\r\n"') lines = [line.strip().split('","') for line in lines] temp_df = DataFrame(lines, columns=["ticker", "name", "exchange"]) temp_df = temp_df.drop_duplicates() temp_df = temp_df.set_index("ticker") mask = ~temp_df.index.isin(idx_df.index) comp_idx = comp_idx + 50 idx_df = idx_df.append(temp_df[mask]) return idx_df
def drop_reqpeat01(): data=DataFrame({'k1':['one']*3+['two']*4,'k2':[1,1,2,3,3,4,4]}) print data print data.duplicated() print data.drop_duplicates() data['v1']=range(7) print data.drop_duplicates(['k1']) print data print data.drop_duplicates(['k1','k2'],keep='last')
def read_thermo_dat(model='nox', runDir=os.getcwd(), sample_int='1min', start=None, end=None): ''' Reads thermo data from .dat file type. model = one of: nox, sox, or o3 returns the number of files read and DataSeries containing all data munged and organized for the user for the Thermo Scientific line of atmospheric gas analyzers >>>filecount, no = read_thermo_dat('o3', runDir=dataDir) ''' # If the data is in a different directory, change the directory if os.getcwd() != runDir: os.chdir(runDir) # Set the model name based on input if model =='nox': instrument = '42I' elif model == 'sox': instrument = '43I' elif model == 'o3': instrument = '49I' else: sys.exit("The model you defined is not valid or supported yet.") # grab all files in the directory for a given instrument with the .dat file extension files = get_files(instrument,fileType='dat',start=start, end=end, runDir=runDir) fileNo = 1 data = DataFrame() # Concatenate the data from each file together to build one big dataframe for each in files: newData = pd.read_table(each, sep='\s+', skiprows=4, header=5, parse_dates=[[1,0]], keep_date_col=True, index_col='Date_Time', warn_bad_lines=True) data = pd.concat([data, newData]) fileNo += 1 # Create a duplicate column containing the index to easily drop all duplicate rows from merging files containing # the same data data['stamp'] = data.index data = data.drop_duplicates(cols='stamp') # Depending on the model, do some stuff to clean it up if model == 'nox': data['no2'] = data['nox'] - data['no'] # resample the data based on chosen imput data = data.resample(sample_int) return (fileNo, data)
def PrepeareAndSave(uid, dataToSearch, data, fileName="data.csv"): people = [] address = [] poi = [] activity = [] timeB = [] timeE = [] entities = [] taxonomy = [] concepts = [] txt = [] for i in xrange(0, len(dataToSearch["people"])): for j in xrange(0, len(data[i][2])): people.append(dataToSearch["people"][i]) address.append(dataToSearch["address"][i]) poi.append(dataToSearch["poi"][i]) activity.append(dataToSearch["activity"][i]) timeB.append(dataToSearch["timeB"][i]) timeE.append(dataToSearch["timeE"][i]) entities.append(data[i][2][j]) taxonomy.append(data[i][3][j]) concepts.append(data[i][4][j]) txt.append(base64.b64encode(data[i][1][j])) df = DataFrame() df["people"] = people df["address"] = address df["poi"] = poi df["activity"] = activity df["entities"] = entities df["taxonomy"] = taxonomy df["concepts"] = concepts df["timeB"] = timeB df["timeE"] = timeE df["txt"] = txt df = df.drop_duplicates(take_last=True).reset_index().drop(["index"], axis=1) # print df try: df_old = read_csv(ROOT + str(uid) + "/" + fileName, ";") df_new = [df_old, df] df = pd.concat(df_new).drop(["Unnamed: 0"], axis=1) except: print ("New Data") df.to_csv(ROOT + str(uid) + "/" + fileName, sep=";") return df
def slide_10(): data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4, 'k2': [1, 1, 2, 3, 3, 4, 4]}) print data print data.duplicated() print data.duplicated('k1') print data.drop_duplicates() data['v1'] = range(7) print data print data.drop_duplicates(['k1']) print data.drop_duplicates(['k1', 'k2'], take_last=True)
def compute_tf_idf(self): # Find total number of document results = self.cursor.execute('SELECT seq FROM sqlite_sequence WHERE name=\'{}\''.format('documents')) tmp = results.fetchone() total_doc = tmp[0] results = self.cursor.execute('SELECT did, type, entity FROM entities') tmp = results.fetchall() df = DataFrame(tmp, columns=['did', 'e_type', 'entity']) base_df = df[['e_type', 'entity']] base_df = base_df.drop_duplicates() doc_t_df = df.drop_duplicates().groupby('entity').size() results = self.cursor.execute('SELECT did, total_word FROM documents') tmp = results.fetchall() df2 = DataFrame(tmp, columns=['did', 'total_word']) tmp = df[['did', 'entity']].groupby(['did', 'entity']).size().reset_index() tmp.rename(columns={0: 'term_freq'}, inplace=True) tf_idf_list = [] for row in tmp.iterrows(): values = row[1] did = values[0] entity = values[1] term_freq = values[2] total_word = df2[df2['did'] == did]['total_word'].get_values()[0] tf = float(term_freq) / total_word doc_t = doc_t_df.get_value(entity) idf = np.log(total_doc / doc_t) tf_idf = tf * idf tf_idf_list.append([entity, tf_idf]) tf_idf_df = DataFrame(tf_idf_list, columns=['entity', 'tf_idf']) tf_idf_df = tf_idf_df.groupby('entity').agg('sum') base_df.loc[:, 'tf_idf'] = base_df['entity'].apply(lambda x: tf_idf_df['tf_idf'][x]) return base_df
def deal_string02(): import json db=json.load(open(u'D:\study\书籍\python\pydata-book-master\pydata-book-master\ch07\\foods-2011-10-03.json')) print len(db) print db[0] print db[0].keys() print db[0]['nutrients'][0] nutrients=DataFrame(db[0]['nutrients']) print nutrients[:7] info_keys=['description','group','id','manufacturer'] info=DataFrame(db,columns=info_keys) print info[:5] print pd.value_counts(info.group)[:10] nutrients=[] for rec in db: fnuts=DataFrame(rec['nutrients']) fnuts['id']=rec['id'] nutrients.append(fnuts) nutrients=pd.concat(nutrients,ignore_index=True) print nutrients print nutrients.duplicated().sum() nutrients=nutrients.drop_duplicates() col_mapping={'description':'food','group':'fgroup'} info=info.rename(columns=col_mapping,copy=False) print info col_mapping={'description':'nutrient','group':'nutgroup'} nutrients=nutrients.rename(columns=col_mapping,copy=False) print nutrients ndata=pd.merge(nutrients,info,on='id',how='outer') print ndata print ndata.ix[3000] result=ndata.groupby(['nutrient','fgroup'])['value'].quantile(0.5) # print result result['Zinc, Zn'].sort_values().plot(kind='barh') by_nutrient=ndata.groupby(['nutgroup','nutrient']) get_maximum=lambda x:x.xs(x.value.idxmax()) get_minimum=lambda x:x.xs(x.value.idmin()) max_foods=by_nutrient.apply(get_maximum)[['value','food']] max_foods.food=max_foods.food.str[:50] print max_foods.ix['Amino Acids']['food']
def hierarchical_clusters( log, show_plot=None ): """Translates traces to Parikh vectors and computes in the vector space a hierarchical clustering.""" def get_parikh(case,alphabet): v = zeros(len(alphabet),dtype=int) for act in case: v[alphabet[act]] = v[alphabet[act]] +1 # canonical representation m = min(v) return v - m actsind = {} i = 0 for act in log.get_alphabet(): actsind[act] = i i = i +1 uniq_cases = log.get_uniq_cases() N = len(uniq_cases) M = len(actsind) data = zeros((N,M),dtype=int) i = 0 parikhdict = {} for case in uniq_cases.keys(): data[i] = get_parikh(case,actsind) str_i = ','.join(map(str,data[i])) if str_i not in parikhdict: parikhdict[str_i] = [i] else: parikhdict[str_i].append(i) i = i + 1 df = DataFrame(data) data_uniq = df.drop_duplicates() Y = pdist(data_uniq,metric='euclidean') Z = linkage(Y,method='average') dendrogram(Z) show()
def fix_event_type(df: DataFrame): ''' Not sure yet. :param df: Dataframe object. :return: Modified Dataframe. ''' a = time.time() colsf = df['id'].ravel() # list of all IDs unique = pd.Series(colsf).unique() # get unique IDs u_counts = [] # list of unique counts (UNUSED) counts_bucket = [] # bucket of counts (UNUSED) df = pd.get_dummies(df) # create dummy variables todrop = df.sum() < 50 # get columns where sum of dummy column < 50 dropcols = df.columns[todrop] # get those column names df = df.drop(dropcols, axis=1) # drop those columns df['num_events'] = 0 # create number of events columns, set to 0 # print(df.columns) print(str(len(unique))) for ii in range(0,len(unique)): # loop through all the unique IDs subset = df.loc[df['id'] == unique[ii]] # subset by that ID the_dummies = subset.columns != 'id' # get all columns that do not equal that ID aa = subset.iloc[:, subset.columns != 'id'].sum().tolist() # get all of those columns to list event_sum = np.sum(aa) # sum all of those # aa = aa.set_index([[subset.index[0]]]) # subset.iloc[:,subset.columns != 'id'] = aa df = df.set_value(subset.index, the_dummies, aa) df = df.set_value(subset.index, 'num_events', event_sum) # df.loc[subset.index] = subset df = df.drop_duplicates('id') print(df) b = time.time() print(b-a) return df
class DropDuplicates(object): goal_time = 0.2 params = [True, False] param_names = ['inplace'] def setup(self, inplace): N = 10000 K = 10 key1 = tm.makeStringIndex(N).values.repeat(K) key2 = tm.makeStringIndex(N).values.repeat(K) self.df = DataFrame({'key1': key1, 'key2': key2, 'value': np.random.randn(N * K)}) self.df_nan = self.df.copy() self.df_nan.iloc[:10000, :] = np.nan self.s = Series(np.random.randint(0, 1000, size=10000)) self.s_str = Series(np.tile(tm.makeStringIndex(1000).values, 10)) N = 1000000 K = 10000 key1 = np.random.randint(0, K, size=N) self.df_int = DataFrame({'key1': key1}) self.df_bool = DataFrame(np.random.randint(0, 2, size=(K, 10), dtype=bool)) def time_frame_drop_dups(self, inplace): self.df.drop_duplicates(['key1', 'key2'], inplace=inplace) def time_frame_drop_dups_na(self, inplace): self.df_nan.drop_duplicates(['key1', 'key2'], inplace=inplace) def time_series_drop_dups_int(self, inplace): self.s.drop_duplicates(inplace=inplace) def time_series_drop_dups_string(self, inplace): self.s_str.drop_duplicates(inplace=inplace) def time_frame_drop_dups_int(self, inplace): self.df_int.drop_duplicates(inplace=inplace) def time_frame_drop_dups_bool(self, inplace): self.df_bool.drop_duplicates(inplace=inplace)
def _drop_duplicates_by_timestamp(self, heating_obj_df: pd.DataFrame ) -> pd.DataFrame: return heating_obj_df.drop_duplicates(column_names.TIMESTAMP, ignore_index=True, keep="last")
import numpy as np import pandas as pd from pandas import Series,DataFrame DF_obj=DataFrame({'column 1':[1,1,2,2,3,3,3], 'column 2':['a','a','b','b','c','c','c'], 'column 3':['A','A','B','B','C','C','C']}) print(DF_obj) print(DF_obj.duplicated()) DF_obj=DF_obj.drop_duplicates() print(DF_obj) print(DF_obj.drop_duplicates(['column 3']))
def load_terms(df: DataFrame, term_creation_mode: str = 'ignore') -> int: """Creates Term objects from an input Pandas DataFrame and adds the newly created Terms to the database. Args: df (pandas.DataFrame): Input data. Contains columns: * 'Term Locale' * 'Term Category' * 'Term' * 'Case Sensitive'. term_creation_mode (str): A logical flag for handling duplicate Term conflicts. Can be 'add', 'ignore', or 'replace'. Defaults to 'ignore'. Only relevant if a Term already exists in the database. * 'add': adds the new Terms while keeping the existing Terms. * 'ignore': skips adding new Terms. * 'replace': deletes the old Term(s) and then adds the new Term. Returns: int: The number of Terms added to the database. Raises: ValueError: invalid `term_creation_mode` argument. """ @unique class _TermCreationMode(Enum): ADD = 'add' IGNORE = 'ignore' REPLACE = 'replace' @classmethod def get_modes(self, *args) -> list: def _ga(mode, *args) -> tuple: return tuple([getattr(mode, arg) for arg in args]) if len(args) == 1: return [getattr(mode, *args) for mode in self] elif len(args) == 0: args = ('name', 'value') return [_ga(mode, args) for mode in self] def _prepare_term( dataframe_row: Series, term_creation_mode: _TermCreationMode) -> Union[Term, None]: """Instantiates and returns a new Term objects based on input data. Args: dataframe_row (pandas.Series): Input data for Term creation. term_creation_mode (_TermCreationMode): A logical flag; determines return behavior. Returns: new_db_term (Term): A new Term object from the input row. None: if term_creation_mode is IGNORE. """ term = dataframe_row['Term'].strip() qs_terms_of_this_term = Term.objects.filter(term=term) # instantiate a new Term new_db_term = Term(term=term, source=dataframe_row['Term Category'], definition_url=dataframe_row['Term Locale']) # handle term creation modes if qs_terms_of_this_term.exists(): if term_creation_mode == _TermCreationMode.REPLACE: qs_terms_of_this_term.delete() elif term_creation_mode == _TermCreationMode.IGNORE: return None return new_db_term try: # set term creation mode term_creation_mode = _TermCreationMode(term_creation_mode) except ValueError: # raise custom argument error raise ValueError( f"Argument `term_creation_mode` must be one of: {_TermCreationMode.get_modes('values')}" ) # preprocess DataFrame df.drop_duplicates(inplace=True) df.loc[df['Case Sensitive'] == False, 'Term'] = df.loc[df['Case Sensitive'] == False, 'Term'].str.lower() df = df.drop_duplicates(subset='Term').dropna(subset=['Term']) # create new Terms. Filter: if _prepare_term() returns None, it is not added to this list new_db_terms = list( filter(None, (_prepare_term(row, term_creation_mode) for _, row in df.iterrows()))) # cache "global" term stems step - should be cached here via model manager Term.objects.bulk_create(new_db_terms) return len(new_db_terms)
def test_drop_duplicates_NA(): # none df = DataFrame({ "A": [None, None, "foo", "bar", "foo", "bar", "bar", "foo"], "B": ["one", "one", "two", "two", "two", "two", "one", "two"], "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0], "D": range(8), }) # single column result = df.drop_duplicates("A") expected = df.loc[[0, 2, 3]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates("A", keep="last") expected = df.loc[[1, 6, 7]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates("A", keep=False) expected = df.loc[[]] # empty df tm.assert_frame_equal(result, expected) assert len(result) == 0 # multi column result = df.drop_duplicates(["A", "B"]) expected = df.loc[[0, 2, 3, 6]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates(["A", "B"], keep="last") expected = df.loc[[1, 5, 6, 7]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates(["A", "B"], keep=False) expected = df.loc[[6]] tm.assert_frame_equal(result, expected) # nan df = DataFrame({ "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], "B": ["one", "one", "two", "two", "two", "two", "one", "two"], "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0], "D": range(8), }) # single column result = df.drop_duplicates("C") expected = df[:2] tm.assert_frame_equal(result, expected) result = df.drop_duplicates("C", keep="last") expected = df.loc[[3, 7]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates("C", keep=False) expected = df.loc[[]] # empty df tm.assert_frame_equal(result, expected) assert len(result) == 0 # multi column result = df.drop_duplicates(["C", "B"]) expected = df.loc[[0, 1, 2, 4]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates(["C", "B"], keep="last") expected = df.loc[[1, 3, 6, 7]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates(["C", "B"], keep=False) expected = df.loc[[1]] tm.assert_frame_equal(result, expected)
def test_drop_duplicates(): df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'bar', 'foo'], 'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'], 'C': [1, 1, 2, 2, 2, 2, 1, 2], 'D': lrange(8)}) # single column result = df.drop_duplicates('AAA') expected = df[:2] tm.assert_frame_equal(result, expected) result = df.drop_duplicates('AAA', keep='last') expected = df.loc[[6, 7]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates('AAA', keep=False) expected = df.loc[[]] tm.assert_frame_equal(result, expected) assert len(result) == 0 # multi column expected = df.loc[[0, 1, 2, 3]] result = df.drop_duplicates(np.array(['AAA', 'B'])) tm.assert_frame_equal(result, expected) result = df.drop_duplicates(['AAA', 'B']) tm.assert_frame_equal(result, expected) result = df.drop_duplicates(('AAA', 'B'), keep='last') expected = df.loc[[0, 5, 6, 7]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates(('AAA', 'B'), keep=False) expected = df.loc[[0]] tm.assert_frame_equal(result, expected) # consider everything df2 = df.loc[:, ['AAA', 'B', 'C']] result = df2.drop_duplicates() # in this case only expected = df2.drop_duplicates(['AAA', 'B']) tm.assert_frame_equal(result, expected) result = df2.drop_duplicates(keep='last') expected = df2.drop_duplicates(['AAA', 'B'], keep='last') tm.assert_frame_equal(result, expected) result = df2.drop_duplicates(keep=False) expected = df2.drop_duplicates(['AAA', 'B'], keep=False) tm.assert_frame_equal(result, expected) # integers result = df.drop_duplicates('C') expected = df.iloc[[0, 2]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates('C', keep='last') expected = df.iloc[[-2, -1]] tm.assert_frame_equal(result, expected) df['E'] = df['C'].astype('int8') result = df.drop_duplicates('E') expected = df.iloc[[0, 2]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates('E', keep='last') expected = df.iloc[[-2, -1]] tm.assert_frame_equal(result, expected) # GH 11376 df = DataFrame({'x': [7, 6, 3, 3, 4, 8, 0], 'y': [0, 6, 5, 5, 9, 1, 2]}) expected = df.loc[df.index != 3] tm.assert_frame_equal(df.drop_duplicates(), expected) df = DataFrame([[1, 0], [0, 2]]) tm.assert_frame_equal(df.drop_duplicates(), df) df = DataFrame([[-2, 0], [0, -4]]) tm.assert_frame_equal(df.drop_duplicates(), df) x = np.iinfo(np.int64).max / 3 * 2 df = DataFrame([[-x, x], [0, x + 4]]) tm.assert_frame_equal(df.drop_duplicates(), df) df = DataFrame([[-x, x], [x, x + 4]]) tm.assert_frame_equal(df.drop_duplicates(), df) # GH 11864 df = DataFrame([i] * 9 for i in range(16)) df = df.append([[1] + [0] * 8], ignore_index=True) for keep in ['first', 'last', False]: assert df.duplicated(keep=keep).sum() == 0
series_obj = Series( ['row 1', missing, 'row 3', 'row 4', 'row 5', missing, 'row 7', 'row 8']) print(series_obj) DF_obj = DataFrame({ 'column 1': [1, 1, 2, 2, 3, 3, 3], 'column 2': ['a', 'a', 'b', 'b', 'c', 'c', 'c'], 'column 3': ['A', 'A', 'B', 'B', 'C', 'C', 'C'] }) print(DF_obj) print(DF_obj.duplicated()) #Check duplicates print(DF_obj.drop_duplicates()) DF_obj = DataFrame({ 'column 1': [1, 1, 2, 2, 3, 3, 3], 'column 2': ['a', 'a', 'b', 'b', 'c', 'c', 'c'], 'column 3': ['A', 'A', 'B', 'B', 'C', 'C', 'C'] }) print(DF_obj) print(DF_obj.drop_duplicates( ['column 3'])) #look for dupicates in col 3 and drop those rows ####################################Concatenate########################################### DF_obj = DataFrame(np.arange(36).reshape(6, 6))
def df_drop_duplicates(df: pd.DataFrame) -> pd.DataFrame: return df.drop_duplicates( subset=[Column.YM_TRACK_ID.value, Column.YM_ALBUM_ID.value])
# 现在,为了对全部营养数据做一些分析,最简单的办法是将所有食物的营养成分整合到一个大表中 # 我们分几个步骤来实现该目的 # 首先,将各食物的营养成分列表转换为一个DataFrame,并添加一个表示编号的列, # 然后将该DataFrame添加到一个列表中 # 最后通过concat将这些东西连接起来就可以了 nutrients = [] for rec in db: fnuts = DataFrame(rec['nutrients']) fnuts['id'] = rec['id'] nutrients.append(fnuts) nutrients = pd.concat(nutrients, ignore_index=True) # 如果一切顺利的话,nutrients应该是下面这样的 print(nutrients) # 我们发现这个DataFrame中无论如何都会有一些重复项,所以直接丢弃就可以了 print(nutrients.duplicated().sum()) nutrients = nutrients.drop_duplicates() # 由于两个DataFrame对象中都有“group”和“description”,所以为了明确到底谁是谁 # 需要对它们进行重命名 col_mapping = {'description': 'food', 'group': 'fgroup'} info = info.rename(columns=col_mapping, copy=False) print(info) col_mapping = {'description': 'nutrient', 'group': 'nutgroup'} nutrients = nutrients.rename(columns=col_mapping, copy=False) print(nutrients) # 做完这些事情之后,就可以将info和nutrients合并起来 ndata = pd.merge(nutrients, info, on='id', how='outer') print(ndata) print(ndata.ix[3000]) # 根据食物分类和营养类型画出一张中位值图 result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5) result['Zinc, Zn'].order().plot(kind='barh')
def df_to_db(df: pd.DataFrame, ref_df: pd.DataFrame, region: Region, data_schema: DeclarativeMeta, provider: Provider, drop_duplicates: bool = True, fix_duplicate_way: str = 'ignore', force_update=False) -> object: step1 = time.time() precision_str = '{' + ':>{},.{}f'.format(8, 4) + '}' if not pd_is_not_null(df): return 0 if drop_duplicates and df.duplicated(subset='id').any(): df.drop_duplicates(subset='id', keep='last', inplace=True) schema_cols = get_schema_columns(data_schema) cols = list(set(df.columns.tolist()) & set(schema_cols)) if not cols: logger.error("{} get columns failed".format(data_schema.__tablename__)) return 0 df = df[cols] # force update mode, delete duplicate id data in db, and write new data back if force_update: ids = df["id"].tolist() if len(ids) == 1: sql = f"delete from {data_schema.__tablename__} where id = '{ids[0]}'" else: sql = f"delete from {data_schema.__tablename__} where id in {tuple(ids)}" session = get_db_session(region=region, provider=provider, data_schema=data_schema) session.execute(sql) session.commit() df_new = df else: if ref_df is None: ref_df = get_data(region=region, provider=provider, columns=['id', 'timestamp'], data_schema=data_schema, return_type='df') if ref_df.empty: df_new = df else: df_new = df[~df.id.isin(ref_df.index)] # 不能单靠ID决定是否新增,要全量比对 # if fix_duplicate_way == 'add': # df_add = df[df.id.isin(ref_df.index)] # if not df_add.empty: # df_add.id = uuid.uuid1() # df_new = pd.concat([df_new, df_add]) cost = precision_str.format(time.time() - step1) logger.debug("remove duplicated: {}".format(cost)) saved = 0 if pd_is_not_null(df_new): saved = to_postgresql(region, df_new, data_schema.__tablename__) cost = precision_str.format(time.time() - step1) logger.debug("write db: {}, size: {}".format(cost, saved)) return saved
# bunch of keys for testing keys: List[Any] = [ np.random.randint(0, 11, m), np.random.choice(list("abcdefghijk"), m), np.random.choice(pd.date_range("20141009", periods=11).tolist(), m), np.random.choice(list("ZYXWVUTSRQP"), m), ] keys = list(map(tuple, zip(*keys))) keys += list(map(lambda t: t[:-1], vals[:: n // m])) # covers both unique index and non-unique index df = DataFrame(vals, columns=cols) a = pd.concat([df, df]) b = df.drop_duplicates(subset=cols[:-1]) def validate(mi, df, key): # check indexing into a multi-index before & past the lexsort depth mask = np.ones(len(df)).astype("bool") # test for all partials of this key for i, k in enumerate(key): mask &= df.iloc[:, i] == k if not mask.any(): assert key[: i + 1] not in mi.index continue
def _remove_duplicate_entries(df: pd.DataFrame, column_name: str) -> pd.DataFrame: logger.info('Removing duplicate entries') df.drop_duplicates(subset=[column_name], keep='first', inplace=True) return df
class DFTrack: def __init__(self, df_points=None, columns=None): if df_points is None: self.df = DataFrame() if isinstance(df_points, pd.DataFrame): self.df = df_points else: if columns is None: columns = [ 'CodeRoute', 'Latitude', 'Longitude', 'Altitude', 'Date', 'Speed', 'TimeDifference', 'Distance', 'FileName' ] self.df = DataFrame(df_points, columns=columns) def export(self, filename='exported_file', export_format='csv'): """ Export a data frame of DFTrack to JSON or CSV. Parameters ---------- export_format: string Format to export: JSON or CSV filename: string Name of the exported file """ if export_format.lower() == 'json': self.df.reset_index().to_json(orient='records', path_or_buf=filename + '.json') elif export_format.lower() == 'csv': self.df.to_csv(path_or_buf=filename + '.csv') else: raise TrackException('Must specify a valid format to export', "'%s'" % export_format) def getTracks(self): """ Makes a copy of the DFTrack. Explanation: http://stackoverflow.com/questions/27673231/why-should-i-make-a-copy-of-a-data-frame-in-pandas Returns ------- copy: DFTrack The copy of DFTrack. """ return self.__class__(self.df.copy(), list(self.df)) def sort(self, column_name): """ Sorts the data frame by the specified column. Parameters ---------- column_name: string Column name to sort Returns ------- sort: DFTrack DFTrack sorted """ if isinstance(column_name, list): for column in column_name: if column not in self.df: raise TrackException('Column name not found', "'%s'" % column) else: if column_name not in self.df: raise TrackException('Column name not found', "'%s'" % column_name) return self.__class__(self.df.sort_values(column_name), list(self.df)) def getTracksByPlace(self, place, timeout=10, only_points=True): """ Gets the points of the specified place searching in Google's API and, if it does not get anything, it tries with OpenStreetMap's API. Parameters ---------- place: string Place to get the points timeout: int Time, in seconds, to wait for the geocoding service to respond before returning a None value. only_points: boolean True to retrieve only the points that cross a place. False to retrive all the points of the tracks that cross a place. Returns ------- place: DFTrack A DFTrack with the points of the specified place or None if anything is found. """ track_place = self.getTracksByPlaceGoogle(place, timeout=timeout, only_points=only_points) if track_place is not None: return track_place track_place = self.getTracksByPlaceOSM(place, timeout=timeout, only_points=only_points) if track_place is not None: return track_place return None def getTracksByPlaceGoogle(self, place, timeout=10, only_points=True): """ Gets the points of the specified place searching in Google's API. Parameters ---------- place: string Place to get the points timeout: int Time, in seconds, to wait for the geocoding service to respond before returning a None value. only_points: boolean True to retrieve only the points that cross a place. False to retrive all the points of the tracks that cross a place. Returns ------- place: DFTrack A DFTrack with the points of the specified place or None if anything is found. """ try: geolocator = geopy.GoogleV3() location = geolocator.geocode(place, timeout=timeout) except geopy.exc.GeopyError as geo_error: return None southwest_lat = float( location.raw['geometry']['bounds']['southwest']['lat']) northeast_lat = float( location.raw['geometry']['bounds']['northeast']['lat']) southwest_lng = float( location.raw['geometry']['bounds']['southwest']['lng']) northeast_lng = float( location.raw['geometry']['bounds']['northeast']['lng']) df_place = self.df[(self.df['Latitude'] < northeast_lat) & (self.df['Longitude'] < northeast_lng) & (self.df['Latitude'] > southwest_lat) & (self.df['Longitude'] > southwest_lng)] if only_points: return self.__class__(df_place) track_list = df_place['CodeRoute'].unique().tolist() return self.__class__(self.df[self.df['CodeRoute'].isin(track_list)]) def getTracksByPlaceOSM(self, place, timeout=10, only_points=True): """ Gets the points of the specified place searching in OpenStreetMap's API. Parameters ---------- place: string Place to get the points timeout: int Time, in seconds, to wait for the geocoding service to respond before returning a None value. only_points: boolean True to retrieve only the points that cross a place. False to retrive all the points of the tracks that cross a place. Returns ------- place: DFTrack A DFTrack with the points of the specified place or None if anything is found. """ try: geolocator = geopy.Nominatim() location = geolocator.geocode(place, timeout=timeout) except geopy.exc.GeopyError as geo_error: return None southwest_lat = float(location.raw['boundingbox'][0]) northeast_lat = float(location.raw['boundingbox'][1]) southwest_lng = float(location.raw['boundingbox'][2]) northeast_lng = float(location.raw['boundingbox'][3]) df_place = self.df[(self.df['Latitude'] < northeast_lat) & (self.df['Longitude'] < northeast_lng) & (self.df['Latitude'] > southwest_lat) & (self.df['Longitude'] > southwest_lng)] if only_points: return self.__class__(df_place) track_list = df_place['CodeRoute'].unique().tolist() return self.__class__(self.df[self.df['CodeRoute'].isin(track_list)]) def getTracksByDate(self, start=None, end=None, periods=None, freq='D'): """ Gets the points of the specified date range using various combinations of parameters. 2 of 'start', 'end', or 'periods' must be specified. Date format recommended: 'yyyy-mm-dd' Parameters ---------- start: date Date start period end: date Date end period periods: int Number of periods. If None, must specify 'start' and 'end' freq: string Frequency of the date range Returns ------- df_date: DFTrack A DFTrack with the points of the specified date range. """ if trk_utils.isTimeFormat(start) or trk_utils.isTimeFormat(end): raise TrackException('Must specify an appropiate date format', 'Time format found') rng = pd.date_range(start=start, end=end, periods=periods, freq=freq) df_date = self.df.copy() df_date['Date'] = pd.to_datetime(df_date['Date']) df_date['ShortDate'] = df_date['Date'].apply( lambda date: date.date().strftime('%Y-%m-%d')) df_date = df_date[df_date['ShortDate'].apply(lambda date: date in rng)] del df_date['ShortDate'] df_date = df_date.reset_index(drop=True) return self.__class__(df_date, list(df_date)) def getTracksByTime(self, start, end, include_start=True, include_end=True): """ Gets the points between the specified time range. Parameters ---------- start: datetime.time Time start period end: datetime.time Time end period include_start: boolean include_end: boolean Returns ------- df_time: DFTrack A DFTrack with the points of the specified date and time periods. """ if not trk_utils.isTimeFormat(start) or not trk_utils.isTimeFormat( end): raise TrackException('Must specify an appropiate time format', trk_utils.TIME_FORMATS) df_time = self.df.copy() index = pd.DatetimeIndex(df_time['Date']) df_time = df_time.iloc[index.indexer_between_time( start_time=start, end_time=end, include_start=include_start, include_end=include_end)] df_time = df_time.reset_index(drop=True) return self.__class__(df_time, list(df_time)) def pointVideoNormalize(self): df = self.df.copy() df_norm = pd.DataFrame() group_size = df.groupby('CodeRoute').size() max_value = group_size.max() name_max_value = group_size.idxmax() grouped = df['CodeRoute'].unique() for name in tqdm(grouped, desc='Groups'): df_slice = df[df['CodeRoute'] == name] df_slice = df_slice.reset_index(drop=True) div = int(max_value / len(df_slice)) + 1 df_index = DataFrame(df_slice.index) df_slice['VideoFrame'] = df_index.apply( lambda x: x + 1 if name_max_value == name else x * div) df_norm = pd.concat([df_norm, df_slice]) df_norm = df_norm.reset_index(drop=True) return self.__class__(df_norm, list(df_norm)) def timeVideoNormalize(self, time, framerate=5): df = self.df.copy() if time == 0: df['VideoFrame'] = 0 df = df.reset_index(drop=True) return self.__class__(df, list(df)) n_fps = time * framerate df = df.sort_values('Date') df_cum = trk_utils.calculateCumTimeDiff(df) grouped = df_cum['CodeRoute'].unique() df_norm = pd.DataFrame() point_idx = 1 for name in tqdm(grouped, desc='Groups'): df_slice = df_cum[df_cum['CodeRoute'] == name] time_diff = float( (df_slice[['TimeDifference']].sum() / time) / framerate) # Track duration divided by time and framerate df_range = df_slice[df_slice['CumTimeDiff'] == 0] df_range = df_range.reset_index(drop=True) df_range['VideoFrame'] = 0 df_norm = pd.concat([df_norm, df_range]) for i in tqdm(range(1, n_fps + 1), desc='Num FPS', leave=False): x_start = time_diff * (i - 1) x_end = time_diff * i df_range = df_slice[(df_slice['CumTimeDiff'] > x_start) & (df_slice['CumTimeDiff'] <= x_end)] df_range = df_range.reset_index(drop=True) if df_range.empty: df_start = df_slice[ df_slice['CumTimeDiff'] <= x_start].tail(1) df_end = df_slice[df_slice['CumTimeDiff'] > x_end].head(1) if not df_start.empty and not df_end.empty: df_middlePoint = trk_utils.getPointInTheMiddle( df_start, df_end, time_diff, point_idx) df_range = DataFrame(df_middlePoint, columns=list(df_cum)) point_idx = point_idx + 1 else: point_idx = 1 df_range['VideoFrame'] = i df_norm = pd.concat([df_norm, df_range]) df_norm = df_norm.reset_index(drop=True) return self.__class__(df_norm, list(df_norm)) def setColors(self, column_name, individual_tracks=True): if column_name not in self.df: raise TrackException('Column name not found', "'%s'" % column_name) df = self.df.copy() df_colors = pd.DataFrame() if individual_tracks: grouped = df['CodeRoute'].unique() for name in grouped: df_slice = df[df['CodeRoute'] == name] df_slice = df_slice.reset_index(drop=True) min = df_slice[column_name].min() max = df_slice[column_name].max() df_slice['Color'] = df_slice[column_name].apply(trk_utils.rgb, minimum=min, maximum=max) df_colors = pd.concat([df_colors, df_slice]) df_colors = df_colors.reset_index(drop=True) return self.__class__(df_colors, list(df_colors)) else: min = df[column_name].min() max = df[column_name].max() df['Color'] = df[column_name].apply(trk_utils.rgb, minimum=min, maximum=max) df = df.reset_index(drop=True) return self.__class__(df, list(df)) def dropDuplicates(self): """ Drop points of the same track with the same Latitude and Longitude. """ return self.__class__( self.df.drop_duplicates(['CodeRoute', 'Latitude', 'Longitude'])) def toDict(self): """ Convert de data frame to a dictionary like [{column -> value}, ... , {column -> value}] """ return self.df.to_dict('records') def getBounds(self): """ Get the bounds of the DFTrack Returns ------- bounds: gpxpy.GPXBounds """ min_lat = self.df['Latitude'].min() max_lat = self.df['Latitude'].max() min_lng = self.df['Longitude'].min() max_lng = self.df['Longitude'].max() return GPXBounds(min_lat, max_lat, min_lng, max_lng) def concat(self, dfTrack): """ Concatenate DFTrack objects with 'self' Parameters ---------- dfTrack: DFTrack or list of DFTrack The ones that will be joined with 'self' Returns ------- df_concat: DFTrack A DFTrack with the all the DFTrack concatenated """ if not isinstance(dfTrack, list): # If it is not a list of DFTrack, make a list of one element dfTrack = [dfTrack] df_concat = [self.df] # First element is 'self' # From list of 'dfTrack', create a list of their dataframes for df in dfTrack: if not isinstance(df, DFTrack): raise TrackException("Parameter must be a 'DFTrack' object", '%s found' % type(df)) df_concat.append(df.df) return self.__class__(pd.concat(df_concat))
# 6 6 b # 7 6 b print(dupli_data.duplicated()) # 0 False # 1 False # 2 False # 3 False # 4 False # 5 True # 6 False # 7 True # dtype: bool dupli_data = dupli_data.drop_duplicates() print(dupli_data) # col1 col2 # 0 1 a # 1 1 b # 2 2 b # 3 3 b # 4 4 c # 6 6 b city_map ={"서울":"서울", "부산":"경상도", "대전":"충청도", "광주":"전라도",
def test_drop_duplicates(): df = DataFrame({ "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], "B": ["one", "one", "two", "two", "two", "two", "one", "two"], "C": [1, 1, 2, 2, 2, 2, 1, 2], "D": range(8), }) # single column result = df.drop_duplicates("AAA") expected = df[:2] tm.assert_frame_equal(result, expected) result = df.drop_duplicates("AAA", keep="last") expected = df.loc[[6, 7]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates("AAA", keep=False) expected = df.loc[[]] tm.assert_frame_equal(result, expected) assert len(result) == 0 # multi column expected = df.loc[[0, 1, 2, 3]] result = df.drop_duplicates(np.array(["AAA", "B"])) tm.assert_frame_equal(result, expected) result = df.drop_duplicates(["AAA", "B"]) tm.assert_frame_equal(result, expected) result = df.drop_duplicates(("AAA", "B"), keep="last") expected = df.loc[[0, 5, 6, 7]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates(("AAA", "B"), keep=False) expected = df.loc[[0]] tm.assert_frame_equal(result, expected) # consider everything df2 = df.loc[:, ["AAA", "B", "C"]] result = df2.drop_duplicates() # in this case only expected = df2.drop_duplicates(["AAA", "B"]) tm.assert_frame_equal(result, expected) result = df2.drop_duplicates(keep="last") expected = df2.drop_duplicates(["AAA", "B"], keep="last") tm.assert_frame_equal(result, expected) result = df2.drop_duplicates(keep=False) expected = df2.drop_duplicates(["AAA", "B"], keep=False) tm.assert_frame_equal(result, expected) # integers result = df.drop_duplicates("C") expected = df.iloc[[0, 2]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates("C", keep="last") expected = df.iloc[[-2, -1]] tm.assert_frame_equal(result, expected) df["E"] = df["C"].astype("int8") result = df.drop_duplicates("E") expected = df.iloc[[0, 2]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates("E", keep="last") expected = df.iloc[[-2, -1]] tm.assert_frame_equal(result, expected) # GH 11376 df = DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]}) expected = df.loc[df.index != 3] tm.assert_frame_equal(df.drop_duplicates(), expected) df = DataFrame([[1, 0], [0, 2]]) tm.assert_frame_equal(df.drop_duplicates(), df) df = DataFrame([[-2, 0], [0, -4]]) tm.assert_frame_equal(df.drop_duplicates(), df) x = np.iinfo(np.int64).max / 3 * 2 df = DataFrame([[-x, x], [0, x + 4]]) tm.assert_frame_equal(df.drop_duplicates(), df) df = DataFrame([[-x, x], [x, x + 4]]) tm.assert_frame_equal(df.drop_duplicates(), df) # GH 11864 df = DataFrame([i] * 9 for i in range(16)) df = df.append([[1] + [0] * 8], ignore_index=True) for keep in ["first", "last", False]: assert df.duplicated(keep=keep).sum() == 0
print("(attempt " + str(i + 1) + "/" + str(CHUNK_ATTEMPTS) + ")") if i < CHUNK_ATTEMPTS - 1: time.sleep(CHUNK_ATTEMPT_SLEEP) # time taken for this loop to execute # print("time taken to execute loop took ", time.time() - start_time) if error != None: sys.exit("max retries exceeded.") # f.write(json.dumps(messages)) # f.close() # print() # print("saving to " + file_name) # print("done2!") dic2['vid'] = vid dic2['commenter'] = commenter dic2['time'] = time_stamp dic2['comment'] = comment_list df2 = DataFrame(data = dic2) df2 = df2.append(df2) df2.drop_duplicates(subset = None, inplace = True) df2.to_csv(chosenDirectory+"/esports_videos_comments/" + name[0] + "_" + name[1] + "_" + str(time.time()) + "_comments.csv", sep=',') # Save for each channel # df = DataFrame(data = details) # df.to_csv(chosenDirectory+"/channel_comments/" + channel + "_most_viewes.csv") except: print(str(datetime.datetime.now())+": Video broken or something")
# data drop by column or index base.chapter('data drop by column & index') df3 = df2.drop(columns=['chinese']) df4 = df3.drop(index=['Zhangfei']) print("df3:\n", df3) print("df4:\n", df4) # rename columns base.chapter('rename columns') # same as df2.rename(columns={'chinese':'Yuwen', 'english':'Yingyu'}, inplace=True) df3 = df2.rename(columns={'chinese':'Yuwen', 'english':'Yingyu'}, inplace=False) print("df2:\n", df3) # drop duplicates base.chapter('drop duplicate') df1 = df1.drop_duplicates() # space character strip base.chapter('space character strip') # strip & lstrip & rstrip #df2['chinese'] = df2['chinese'].map(str.strip) # upper & lower & title df2.columns = df2.columns.str.upper() df2.columns = df2.columns.str.lower() # find null data = { "chinese": [66, 95, 93, 90, 80], "english": [65, None, 92, 88, 90], "math": [30, 98, 96, None, 90],
def detect_ts(df, max_anoms=0.10, direction='pos', alpha=0.05, threshold=None, e_value=False, longterm=False, piecewise_median_period_weeks=2, granularity='day', verbose=False, inplace=True): """ Anomaly Detection Using Seasonal Hybrid ESD Test A technique for detecting anomalies in seasonal univariate time series where the input is a series of <timestamp, value> pairs. Args: x: Time series as a two column data frame where the first column consists of the integer UTC Unix timestamps and the second column consists of the observations. max_anoms: Maximum number of anomalies that S-H-ESD will detect as a percentage of the data. direction: Directionality of the anomalies to be detected. Options are: ('pos' | 'neg' | 'both'). alpha: The level of statistical significance with which to accept or reject anomalies. only_last: Find and report anomalies only within the last day or hr in the time series. Options: (None | 'day' | 'hr') threshold: Only report positive going anoms above the threshold specified. Options are: (None | 'med_max' | 'p95' | 'p99') e_value: Add an additional column to the anoms output containing the expected value. longterm: Increase anom detection efficacy for time series that are greater than a month. See Details below. piecewise_median_period_weeks: The piecewise median time window as described in Vallis, Hochenbaum, and Kejariwal (2014). Defaults to 2. Details 'longterm' This option should be set when the input time series is longer than a month. The option enables the approach described in Vallis, Hochenbaum, and Kejariwal (2014). 'threshold' Filter all negative anomalies and those anomalies whose magnitude is smaller than one of the specified thresholds which include: the median of the daily max values (med_max), the 95th percentile of the daily max values (p95), and the 99th percentile of the daily max values (p99). The returned value is a dictionary with the following components: anoms: Data frame containing timestamps, values, and optionally expected values. plot: A graphical object if plotting was requested by the user. The plot contains the estimated anomalies annotated on the input time series """ if not isinstance(df, DataFrame): raise ValueError("data must be a single data frame.") else: if len(df.columns) != 2 or not df.iloc[:, 1].map(np.isreal).all(): raise ValueError( '''data must be a 2 column data.frame, with the first column being a set of timestamps, and the second coloumn being numeric values.''') if not (df.dtypes[0].type is np.float64) and not (df.dtypes[0].type is np.int64): raise ValueError( '''The input timestamp column must be a float or integer of the unix timestamp, not date time columns, date strings or pd.TimeStamp columns.''' ) if not inplace: df = copy.deepcopy(df) # change the column names in place, rather than copying the entire dataset, but save the headers to replace them. orig_header = df.columns.values df.rename(columns={ df.columns.values[0]: "timestamp", df.columns.values[1]: "value" }, inplace=True) # Sanity check all input parameters if max_anoms > 0.49: length = len(df.value) raise ValueError( "max_anoms must be less than 50%% of the data points (max_anoms =%f data_points =%s)." % (round(max_anoms * length, 0), length)) if direction not in ['pos', 'neg', 'both']: raise ValueError("direction options are: pos | neg | both.") if not (0.01 <= alpha or alpha <= 0.1): if verbose: import warnings warnings.warn( "alpha is the statistical signifigance, and is usually between 0.01 and 0.1" ) if threshold not in [None, 'med_max', 'p95', 'p99']: raise ValueError("threshold options are: None | med_max | p95 | p99") if not isinstance(e_value, bool): raise ValueError("e_value must be a boolean") if not isinstance(longterm, bool): raise ValueError("longterm must be a boolean") if piecewise_median_period_weeks < 2: raise ValueError( "piecewise_median_period_weeks must be at greater than 2 weeks") # if the data is daily, then we need to bump the period to weekly to get multiple examples gran = granularity gran_period = {'ms': 60000, 'sec': 3600, 'min': 1440, 'hr': 24, 'day': 7} period = gran_period.get(gran) if not period: raise ValueError( '%s granularity detected. This is currently not supported.' % (gran, )) # now convert the timestamp column into a proper timestamp df['timestamp'] = df['timestamp'].map( lambda x: datetime.datetime.utcfromtimestamp(x)) num_obs = len(df.value) clamp = (1 / float(num_obs)) if max_anoms < clamp: max_anoms = clamp if longterm: if gran == "day": num_obs_in_period = period * piecewise_median_period_weeks + 1 num_days_in_period = 7 * piecewise_median_period_weeks + 1 else: num_obs_in_period = period * 7 * piecewise_median_period_weeks num_days_in_period = 7 * piecewise_median_period_weeks last_date = df.timestamp.iget(-1) all_data = [] for j in range(0, len(df.timestamp), num_obs_in_period): start_date = df.timestamp.iget(j) end_date = min( start_date + datetime.timedelta(days=num_obs_in_period), df.timestamp.iget(-1)) # if there is at least 14 days left, subset it, otherwise subset last_date - 14days if (end_date - start_date).days == num_days_in_period: sub_df = df[(df.timestamp >= start_date) & (df.timestamp < end_date)] else: sub_df = df[(df.timestamp > ( last_date - datetime.timedelta(days=num_days_in_period))) & (df.timestamp <= last_date)] all_data.append(sub_df) else: all_data = [df] all_anoms = DataFrame(columns=['timestamp', 'value']) seasonal_plus_trend = DataFrame(columns=['timestamp', 'value']) # Detect anomalies on all data (either entire data in one-pass, or in 2 week blocks if longterm=TRUE) for i in range(len(all_data)): directions = { 'pos': Direction(True, True), 'neg': Direction(True, False), 'both': Direction(False, True) } anomaly_direction = directions[direction] # detect_anoms actually performs the anomaly detection and returns the result in a list containing the anomalies # as well as the decomposed components of the time series for further analysis. s_h_esd_timestamps = detect_anoms( all_data[i], k=max_anoms, alpha=alpha, num_obs_per_period=period, use_decomp=True, one_tail=anomaly_direction.one_tail, upper_tail=anomaly_direction.upper_tail, verbose=verbose) if s_h_esd_timestamps is None: return {'anoms': DataFrame(columns=["timestamp", "anoms"])} # store decomposed comps in local variable and overwrite s_h_esd_timestamps to contain only the anom timestamps data_decomp = s_h_esd_timestamps['stl'] s_h_esd_timestamps = s_h_esd_timestamps['anoms'] # -- Step 3: Use detected anomaly timestamps to extract the actual anomalies (timestamp and value) from the data if s_h_esd_timestamps: anoms = all_data[i][all_data[i].timestamp.isin(s_h_esd_timestamps)] else: anoms = DataFrame(columns=['timestamp', 'value']) # Filter the anomalies using one of the thresholding functions if applicable if threshold: # Calculate daily max values periodic_maxes = df.groupby(df.timestamp.map( Timestamp.date)).aggregate(np.max).value # Calculate the threshold set by the user thresh = 0.5 if threshold == 'med_max': thresh = periodic_maxes.median() elif threshold == 'p95': thresh = periodic_maxes.quantile(.95) elif threshold == 'p99': thresh = periodic_maxes.quantile(.99) # Remove any anoms below the threshold anoms = anoms[anoms.value >= thresh] all_anoms = all_anoms.append(anoms) seasonal_plus_trend = seasonal_plus_trend.append(data_decomp) # Cleanup potential duplicates try: all_anoms.drop_duplicates(subset=['timestamp']) seasonal_plus_trend.drop_duplicates(subset=['timestamp']) except TypeError: all_anoms.drop_duplicates(cols=['timestamp']) seasonal_plus_trend.drop_duplicates(cols=['timestamp']) # Calculate number of anomalies as a percentage anom_pct = (len(df.value) / float(num_obs)) * 100 # name the columns back df.rename(columns={ "timestamp": orig_header[0], "value": orig_header[1] }, inplace=True) if anom_pct == 0: return {"anoms": None} all_anoms.index = all_anoms.timestamp if e_value: d = { 'timestamp': all_anoms.timestamp, 'anoms': all_anoms.value, 'expected_value': seasonal_plus_trend[seasonal_plus_trend.timestamp.isin( all_anoms.timestamp)].value } else: d = {'timestamp': all_anoms.timestamp, 'anoms': all_anoms.value} anoms = DataFrame(d, index=d['timestamp'].index) # convert timestamps back to unix time anoms['timestamp'] = anoms['timestamp'].astype(np.int64) anoms['timestamp'] = anoms['timestamp'].map(lambda x: x * 10e-10) return {'anoms': anoms}
def get_different_coef_by_class(data_all, type_name, ds_list): # 定义参考的维度列,后期用户价值需要根据此计算 cols = [ 'ora_source', 'ora_channel', 'ora_activity_type', 'ora_parent_channel', 'ora_channel_type', 'ora_business_type' ] xishu_table = [] colList = get_descending_list_order(cols) # colList = [['ora_business_type']] # print colList for col in colList: # print col data = data_all[data_all['type'] == type_name].groupby(col + ['ds']).sum() data = data.reset_index() data = add_columns(data) # #测试 # if 'ora_channel' not in col: # continue # print data.head(1) for index1, row in data[(data['ds'] >= ds_list['ds_0']) & (data['ds'] <= ds_list['ds_1'] )][col].drop_duplicates().iterrows(): # print row contition = gen_condition_str_by_series(row).decode('utf8') # print contition # print a head_col = collections.OrderedDict() head_col = collections.OrderedDict.fromkeys(cols) for i in head_col: if i in col: head_col[i] = row[i] # data[i] = data[i].apply(lambda a: str(a).decode('utf8')) else: head_col[i] = '-' # print head_col data1 = data[eval(contition)] data1 = delete_exception_value(data1) if data1.empty: continue if type_name in [ 'e_supplier_profit', 'common_order_profit', 'tao2_order_profit', 'zy_order_profit' ]: coef_dict = get_coef_of_e_supplier(data1, ds_list) if type_name in [ 'ad_profit', 'ad_common_profit', 'ad_licai_profit', 'licai_order_profit' ]: coef_dict = get_coef_of_ad_or_licai(data1, ds_list) # print coef_dict coef_dict = collections.OrderedDict(head_col.items() + coef_dict.items()) xishu_table.append(coef_dict) # print xishu_table # break df = DataFrame(xishu_table) df = df.drop_duplicates() return df
# # print(df2) # # # df3 = pd.merge(df1, df2, on="name") # df3 = pd.merge(df1, df2, how='outer' ) # print(df3) # from pandasql import sqldf, load_births, load_meat # df1 = DataFrame({'name':['ZhangFei', 'GuanYu', 'a', 'b', 'c'], 'data1':range(5)}) # pysqldf = lambda sql: sqldf(sql, globals()) # sql = "select * from df1 where name ='ZhangFei'" # print(pysqldf(sql)) def sum(df: DataFrame): df['总分'] = df["中文"] + df["英语"] + df["数学"] return df df = DataFrame(pd.read_csv("data.txt")) df.rename(columns={ "Chinese": "中文", "English": "英语", "Math": "数学" }, inplace=True) df = df.drop_duplicates() print(df) df = df.apply(sum, axis=1) print("--------") print(df)
## subset duplicated values df3.duplicated(subset=['A', 'B']) df3.duplicated(['A', 'B']) ## HOW to get all values that have duplicates (purging) t_b = df3.duplicated() b_t = df3.duplicated(take_last=True) unique = ~(t_b | b_t) # complement where either is true unique = ~t_b & ~b_t unique df3[unique] # DROPPING DUPLICATES -------------------------------------------- df3.drop_duplicates() df3.drop_duplicates(take_last=True) ## this is the same as t_b = df3.duplicated() df3[~t_b] df3.drop_duplicates() == df3[~t_b] ## subset criteria df3.drop_duplicates(['A', 'B']) # WHEN TO USE ---------------------------------------------------------- ## if you want to keep the first duplicated value (from top) and remove others df3.drop_duplicates()
import pandas as pd from pandas import DataFrame df = DataFrame({ 'col1':['uber', 'uber', 'uber', 'uber', 'grab'], 'col2':[5,4,3,3,5] }) print df print df.duplicated() #drp duplicate print df.drop_duplicates() #drop column print df.drop_duplicates(['col1']) # print df.drop_duplicates(['col1'],keep='last')
def load_img_metadata(meta_df: pd.DataFrame, imgs_df: pd.DataFrame): imgs_df['id'] = imgs_df['id'].astype(int) imgs_df.drop_duplicates(subset=['cover url'], inplace=True) ans = meta_df.merge(imgs_df, left_on='imdb_id', right_on='id', how='inner') return ans
def IBTrading(app): #SetupLogger() Already setup in the main script app.logger_report.warning("Algorithm checktime: %s", datetime.datetime.now()) time.sleep(ib_api_include.sleeptime_trading) # Get the open position for the account. Through flex query delayed data #openPosition = GetOpenPosition() reqId = 1 # Reteieve the market data and news for the open options. # Get the open position from the portfolio portfolio = app.reqPortfolio() portfSummary = DataFrame(portfolio, columns=[ 'symbol', 'secType', 'exchange', 'currency', 'lastTradeDateOrContractMonth', 'multiplier', 'right', 'strike', 'position', 'marketPrice', 'marketValue', 'averageCost', 'unrealizedPNL', 'realizedPNL', 'accountName' ]) portfSummary = portfSummary.drop_duplicates(subset=[ 'symbol', 'secType', 'exchange', 'currency', 'lastTradeDateOrContractMonth', 'multiplier', 'right', 'strike' ], keep='first') print(portfSummary) reqArraySize = ib_api_include.reqArraySize reqIdList = list(range(1, reqArraySize * len(portfSummary) + 1)) # Not hard coded, replace 4 with REQ_ARRAY_SIZE symList = [] secTypeList = [] app.logger_report.warning("Portfolio:{}".format(portfSummary)) for i in range(0, len(portfSummary)): item = portfSummary.iloc[i] symList += [item['symbol']] * reqArraySize secTypeList += [item['secType']] * reqArraySize reqIdSymList = DataFrame({ "reqId": reqIdList, "symbol": symList, 'secType': secTypeList }) marketReturn = app.GetMarketDataNews(portfSummary, reqId, reqArraySize) # Sheet names, profile_stock_threshold, profile_opt_threshold, watchlist_stock_treshold, watchlist_opt_threshold profile_stock_threshold = pd.read_excel( ib_api_include.thresholdFileName, sheet_name='profile_stock_threshold') option_profit_threshold = pd.read_excel(ib_api_include.thresholdFileName, sheet_name='profile_opt_threshold') watchlist_stock_treshold = pd.read_excel( ib_api_include.thresholdFileName, sheet_name='watchlist_stock_treshold') watchlist_stock_treshold = DataFrame(watchlist_stock_treshold) tickTypeIdMapping = pd.read_csv(ib_api_include.tickTypeIdMapping, index_col=None) tickTypeIdMapping["TickId"] = tickTypeIdMapping["TickId"].astype(int) marketPrice = DataFrame( marketReturn[0], columns=["reqId", "tickTypeId", "price", "executeNow"]) marketSize = DataFrame(marketReturn[1], columns=["reqId", "tickTypeId", "volume"]) optionInfo = DataFrame(marketReturn[2], columns=[ "reqId", "tickTypeId", "optionInfo", "delta", "optPrice", "pvDividend", "gamma", "vega", "theta", "undPrice" ]) #Check if it is live trading if datetime.datetime.now() >= datetime.datetime.strptime(str(datetime.datetime.now().date()) + ' 15:55:00','%Y-%m-%d %H:%M:%S') \ and datetime.datetime.now() < datetime.datetime.strptime(str(datetime.datetime.now().date()) + ' 09:30:00','%Y-%m-%d %H:%M:%S'): print("Trading not alive!") return None news = DataFrame(marketReturn[3], columns=[ "tickerId", "timeStamp", "providerCode", "articleId", "headline", "effect" ]) portfSummary = marketReturn[4] marketPrice = pd.merge(marketPrice, tickTypeIdMapping, left_on="tickTypeId", right_on="TickId", how="left") marketSize = pd.merge(marketSize, tickTypeIdMapping, left_on="tickTypeId", right_on="TickId", how="left") optionInfo = pd.merge(optionInfo, tickTypeIdMapping, left_on="tickTypeId", right_on="TickId", how="left") print("Here is the portfolio") print(portfSummary) marketPrice = pd.merge(marketPrice, reqIdSymList, on="reqId", how="left") marketSize = pd.merge(marketSize, reqIdSymList, on="reqId", how="left") optionInfo = pd.merge(optionInfo, reqIdSymList, on="reqId", how="left") app.logger_report.warning("marketPrice:{}".format(marketPrice)) app.logger_report.warning("marketSize:{}".format(marketSize)) app.logger_report.warning("optionInfo:{}".format(optionInfo)) news = pd.merge(news, reqIdSymList, left_on="tickerId", right_on="reqId", how="left") # Use AI model to build up the model for the option or the stock. my_ib_trading = open(ib_api_include.generatedFileName, 'r') tradingFile = json.load(my_ib_trading) my_ib_trading.close() tradingFile["ready_flag"] = True tradingFile["previous_time"] = str(datetime.datetime.now() - datetime.timedelta(minutes=1))[0:-7] tradingFile["updated_time"] = str(datetime.datetime.now())[0:-7] tradingFile['transactions'] = [] print(portfSummary) commissionFee = ib_api_include.commissionFee for index, item in portfSummary.iterrows(): if item['secType'] == 'STK': Stock_Algo = ib_trading_algo_oop.Stock_Algo( profile_stock_threshold, item, tradingFile) tradingFile = Stock_Algo.tradingStock() for index, item in portfSummary.iterrows(): if item['secType'] == 'OPT': # Get the option information infoIndivOption = app.GetInfoIndivOption(optionInfo, item) print(infoIndivOption) if infoIndivOption != 'Yes': print(infoIndivOption) optPrice = infoIndivOption[0] undPrice = infoIndivOption[1] gamma = infoIndivOption[2] delta = infoIndivOption[3] theta = infoIndivOption[4] avgCost = infoIndivOption[5] multiplier = infoIndivOption[6] Option_Algo = ib_trading_algo_oop.Option_Algo( option_profit_threshold, optPrice, undPrice, gamma, delta, theta, avgCost, multiplier, item, tradingFile, commissionFee) # Merge the option in portfolio with the option price, size and the option Greek values. if item['position'] > 0 and item['right'] == 'P': tradingFile = Option_Algo.optionBuyPutAlgo() elif item['position'] < 0 and item['right'] == 'P': tradingFile = Option_Algo.optionSellPutAlgo() elif item['position'] > 0 and item['right'] == 'C': tradingFile = Option_Algo.optionBuyCallAlgo() elif item['position'] < 0 and item['right'] == 'C': tradingFile = Option_Algo.optionSellCallAlgo() elif infoIndivOption == 'Yes': pass app.logger_report.warning(tradingFile) writeExeFile = open(ib_api_include.generatedFileName, 'w+') writeExeFile.write(json.dumps(tradingFile, indent=4)) writeExeFile.close()
def test_drop_duplicates_NA(): # none df = DataFrame({'A': [None, None, 'foo', 'bar', 'foo', 'bar', 'bar', 'foo'], 'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'], 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], 'D': lrange(8)}) # single column result = df.drop_duplicates('A') expected = df.loc[[0, 2, 3]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates('A', keep='last') expected = df.loc[[1, 6, 7]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates('A', keep=False) expected = df.loc[[]] # empty df tm.assert_frame_equal(result, expected) assert len(result) == 0 # multi column result = df.drop_duplicates(['A', 'B']) expected = df.loc[[0, 2, 3, 6]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates(['A', 'B'], keep='last') expected = df.loc[[1, 5, 6, 7]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates(['A', 'B'], keep=False) expected = df.loc[[6]] tm.assert_frame_equal(result, expected) # nan df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'bar', 'foo'], 'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'], 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], 'D': lrange(8)}) # single column result = df.drop_duplicates('C') expected = df[:2] tm.assert_frame_equal(result, expected) result = df.drop_duplicates('C', keep='last') expected = df.loc[[3, 7]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates('C', keep=False) expected = df.loc[[]] # empty df tm.assert_frame_equal(result, expected) assert len(result) == 0 # multi column result = df.drop_duplicates(['C', 'B']) expected = df.loc[[0, 1, 2, 4]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates(['C', 'B'], keep='last') expected = df.loc[[1, 3, 6, 7]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates(['C', 'B'], keep=False) expected = df.loc[[1]] tm.assert_frame_equal(result, expected)
def detect_ts(df, max_anoms=0.10, direction='pos', alpha=0.05, only_last=None, threshold=None, e_value=False, longterm=False, piecewise_median_period=3, plot=False, y_log=False, xlabel='', ylabel='count', title=None, verbose=False, custom_period=None, use_period=True): """ Anomaly Detection Using Seasonal Hybrid ESD Test A technique for detecting anomalies in seasonal univariate time series where the input is a series of <timestamp, value> pairs. Args: x: Time series as a two column data frame where the first column consists of the timestamps and the second column consists of the observations. max_anoms: Maximum number of anomalies that S-H-ESD will detect as a percentage of the data. direction: Directionality of the anomalies to be detected. Options are: ('pos' | 'neg' | 'both'). alpha: The level of statistical significance with which to accept or reject anomalies. only_last: Find and report anomalies only within the last day or hr in the time series. Options: (None | 'day' | 'hr') threshold: Only report positive going anoms above the threshold specified. Options are: (None | 'med_max' | 'p95' | 'p99') e_value: Add an additional column to the anoms output containing the expected value. longterm: Increase anom detection efficacy for time series that are greater than a month. See Details below. piecewise_median_period_weeks: The piecewise median time window as described in Vallis, Hochenbaum, and Kejariwal (2014). Defaults to 2. plot: (Currently unsupported) A flag indicating if a plot with both the time series and the estimated anoms, indicated by circles, should also be returned. y_log: Apply log scaling to the y-axis. This helps with viewing plots that have extremely large positive anomalies relative to the rest of the data. xlabel: X-axis label to be added to the output plot. ylabel: Y-axis label to be added to the output plot. Details 'longterm' This option should be set when the input time series is longer than a month. The option enables the approach described in Vallis, Hochenbaum, and Kejariwal (2014). 'threshold' Filter all negative anomalies and those anomalies whose magnitude is smaller than one of the specified thresholds which include: the median of the daily max values (med_max), the 95th percentile of the daily max values (p95), and the 99th percentile of the daily max values (p99). 'title' Title for the output plot. 'verbose' Enable debug messages The returned value is a dictionary with the following components: anoms: Data frame containing timestamps, values, and optionally expected values. plot: A graphical object if plotting was requested by the user. The plot contains the estimated anomalies annotated on the input time series """ if not isinstance(df, DataFrame): raise ValueError("data must be a single data frame.") else: if len(df.columns) != 2 or not df.iloc[:, 1].map(np.isreal).all(): raise ValueError(("data must be a 2 column data.frame, with the" "first column being a set of timestamps, and " "the second coloumn being numeric values.")) if (not (df.dtypes[0].type is np.datetime64) and not (df.dtypes[0].type is np.int64)): df = format_timestamp(df) if list(df.columns.values) != ["timestamp", "value"]: df.columns = ["timestamp", "value"] # Sanity check all input parameters if max_anoms > 0.49: length = len(df.value) raise ValueError(("max_anoms must be less than 50% of " "the data points (max_anoms =%f data_points =%s).") % (round(max_anoms * length, 0), length)) if not direction in ['pos', 'neg', 'both']: raise ValueError("direction options are: pos | neg | both.") if not (0.01 <= alpha or alpha <= 0.1): if verbose: import warnings warnings.warn(("alpha is the statistical signifigance, " "and is usually between 0.01 and 0.1")) if only_last and not only_last in ['day', 'hr']: raise ValueError("only_last must be either 'day' or 'hr'") if not threshold in [None, 'med_max', 'p95', 'p99']: raise ValueError("threshold options are: None | med_max | p95 | p99") if not isinstance(e_value, bool): raise ValueError("e_value must be a boolean") if not isinstance(longterm, bool): raise ValueError("longterm must be a boolean") if piecewise_median_period < 3: raise ValueError( "piecewise_median_period must be at greater than 2 periods") if not isinstance(plot, bool): raise ValueError("plot must be a boolean") if not isinstance(y_log, bool): raise ValueError("y_log must be a boolean") if not isinstance(xlabel, basestring): raise ValueError("xlabel must be a string") if not isinstance(ylabel, basestring): raise ValueError("ylabel must be a string") if title and not isinstance(title, basestring): raise ValueError("title must be a string") if not title: title = '' else: title = title + " : " gran = get_gran(df) if gran == "day": num_days_per_line = 7 if isinstance(only_last, basestring) and only_last == 'hr': only_last = 'day' else: num_days_per_line = 1 if gran == 'sec': df.timestamp = date_format(df.timestamp, "%Y-%m-%d %H:%M:00") df = format_timestamp(df.groupby('timestamp').aggregate(np.sum)) # if the data is daily, then we need to bump # the period to weekly to get multiple examples gran_period = {'min': 1440, 'hr': 24, 'day': 7} if (custom_period == None): print "auto choose period" period = gran_period.get(gran) else: period = custom_period if not period: raise ValueError( '%s granularity detected. This is currently not supported.' % gran) num_obs = len(df.value) clamp = (1 / float(num_obs)) if max_anoms < clamp: max_anoms = clamp if longterm: if gran == "day": num_obs_in_period = period * piecewise_median_period + 1 # num_days_in_period = 7 * piecewise_median_period + 1 else: num_obs_in_period = period * piecewise_median_period # num_days_in_period = piecewise_median_period # dong nay can phai sua lai neu period ko phai la ngay last_date = df.timestamp.iget(-1) all_data = [] for j in range(0, len(df.timestamp), num_obs_in_period): start_date = df.timestamp.iget(j) end_date = min( start_date + datetime.timedelta(minutes=num_obs_in_period), df.timestamp.iget(-1)) # if there is at least 14 days left, subset it, # otherwise subset last_date - 14days if ((int)((end_date - start_date).total_seconds())) == num_obs_in_period * 60: sub_df = df[(df.timestamp >= start_date) & (df.timestamp < end_date)] else: sub_df = df[(df.timestamp > ( last_date - datetime.timedelta(minutes=num_obs_in_period))) & (df.timestamp <= last_date)] all_data.append(sub_df) else: all_data = [df] all_anoms = DataFrame(columns=['timestamp', 'value']) seasonal_plus_trend = DataFrame(columns=['timestamp', 'value']) # Detect anomalies on all data (either entire data in one-pass, # or in 2 week blocks if longterm=TRUE) for i in range(len(all_data)): directions = { 'pos': Direction(True, True), 'neg': Direction(True, False), 'both': Direction(False, True) } anomaly_direction = directions[direction] # detect_anoms actually performs the anomaly detection and # returns the results in a list containing the anomalies # as well as the decomposed components of the time series # for further analysis. s_h_esd_timestamps = detect_anoms( all_data[i], k=max_anoms, alpha=alpha, num_obs_per_period=period, use_decomp=True, one_tail=anomaly_direction.one_tail, upper_tail=anomaly_direction.upper_tail, verbose=verbose, use_period=use_period) # store decomposed components in local variable and overwrite # s_h_esd_timestamps to contain only the anom timestamps data_decomp = s_h_esd_timestamps['stl'] s_h_esd_timestamps = s_h_esd_timestamps['anoms'] # -- Step 3: Use detected anomaly timestamps to extract the actual # anomalies (timestamp and value) from the data if s_h_esd_timestamps: anoms = all_data[i][all_data[i].timestamp.isin(s_h_esd_timestamps)] else: anoms = DataFrame(columns=['timestamp', 'value']) # Filter the anomalies using one of the thresholding functions if applicable if threshold: # Calculate daily max values periodic_maxes = df.groupby(df.timestamp.map( Timestamp.date)).aggregate(np.max).value # Calculate the threshold set by the user if threshold == 'med_max': thresh = periodic_maxes.median() elif threshold == 'p95': thresh = periodic_maxes.quantile(.95) elif threshold == 'p99': thresh = periodic_maxes.quantile(.99) # Remove any anoms below the threshold anoms = anoms[anoms.value >= thresh] all_anoms = all_anoms.append(anoms) seasonal_plus_trend = seasonal_plus_trend.append(data_decomp) # Cleanup potential duplicates try: all_anoms.drop_duplicates(subset=['timestamp']) seasonal_plus_trend.drop_duplicates(subset=['timestamp']) except TypeError: all_anoms.drop_duplicates(cols=['timestamp']) seasonal_plus_trend.drop_duplicates(cols=['timestamp']) # -- If only_last was set by the user, # create subset of the data that represent the most recent day if only_last: start_date = df.timestamp.iget(-1) - datetime.timedelta(days=7) start_anoms = df.timestamp.iget(-1) - datetime.timedelta(days=1) if gran is "day": breaks = 3 * 12 num_days_per_line = 7 else: if only_last == 'day': breaks = 12 else: start_date = df.timestamp.iget(-1) - datetime.timedelta(days=2) # truncate to days start_date = datetime.date(start_date.year, start_date.month, start_date.day) start_anoms = (df.timestamp.iget(-1) - datetime.timedelta(hours=1)) breaks = 3 # subset the last days worth of data x_subset_single_day = df[df.timestamp > start_anoms] # When plotting anoms for the last day only # we only show the previous weeks data x_subset_week = df[(df.timestamp <= start_anoms) & (df.timestamp > start_date)] if len(all_anoms) > 0: all_anoms = all_anoms[ all_anoms.timestamp >= x_subset_single_day.timestamp.iget(0)] num_obs = len(x_subset_single_day.value) # Calculate number of anomalies as a percentage anom_pct = (len(df.value) / float(num_obs)) * 100 if anom_pct == 0: return {"anoms": None, "plot": None} # The original R implementation handles plotting here. # Plotting is currently not implemented in this version. # if plot: # plot_something() all_anoms.index = all_anoms.timestamp if e_value: d = { 'timestamp': all_anoms.timestamp, 'anoms': all_anoms.value, 'expected_value': seasonal_plus_trend[seasonal_plus_trend.timestamp.isin( all_anoms.timestamp)].value } else: d = {'timestamp': all_anoms.timestamp, 'anoms': all_anoms.value} anoms = DataFrame(d, index=d['timestamp'].index) return {'anoms': anoms, 'plot': None}
import pandas as pd from pandas import DataFrame ReadCsv = pd.read_csv(r'gloss_entryID_synonyms.csv', sep=';', header='infer') df_glosses = DataFrame(ReadCsv) df_glosses.head() new_df_glosses = df_glosses.drop_duplicates() df_glosses.to_csv('gloss_entryID_synonyms.csv', sep=',', index=False)
def test_basic(): cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"], ordered=True) data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True) expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index) result = data.groupby("b", observed=False).mean() tm.assert_frame_equal(result, expected) cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) # single grouper gb = df.groupby("A", observed=False) exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True) expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)}) result = gb.sum() tm.assert_frame_equal(result, expected) # GH 8623 x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'], [1, 'John P. Doe']], columns=['person_id', 'person_name']) x['person_name'] = Categorical(x.person_name) g = x.groupby(['person_id'], observed=False) result = g.transform(lambda x: x) tm.assert_frame_equal(result, x[['person_name']]) result = x.drop_duplicates('person_name') expected = x.iloc[[0, 1]] tm.assert_frame_equal(result, expected) def f(x): return x.drop_duplicates('person_name').iloc[0] result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name='person_id') expected['person_name'] = expected['person_name'].astype('object') tm.assert_frame_equal(result, expected) # GH 9921 # Monotonic df = DataFrame({"a": [5, 15, 25]}) c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) result = df.a.groupby(c, observed=False).transform(sum) tm.assert_series_equal(result, df['a']) tm.assert_series_equal( df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df['a']) tm.assert_frame_equal( df.groupby(c, observed=False).transform(sum), df[['a']]) tm.assert_frame_equal( df.groupby(c, observed=False).transform(lambda xs: np.max(xs)), df[['a']]) # Filter tm.assert_series_equal( df.a.groupby(c, observed=False).filter(np.all), df['a']) tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df) # Non-monotonic df = DataFrame({"a": [5, 15, 25, -5]}) c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) result = df.a.groupby(c, observed=False).transform(sum) tm.assert_series_equal(result, df['a']) tm.assert_series_equal( df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df['a']) tm.assert_frame_equal( df.groupby(c, observed=False).transform(sum), df[['a']]) tm.assert_frame_equal( df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df[['a']]) # GH 9603 df = DataFrame({'a': [1, 0, 0, 0]}) c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd'))) result = df.groupby(c, observed=False).apply(len) exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered) expected = Series([1, 0, 0, 0], index=exp_index) expected.index.name = 'a' tm.assert_series_equal(result, expected) # more basic levels = ['foo', 'bar', 'baz', 'qux'] codes = np.random.randint(0, 4, size=100) cats = Categorical.from_codes(codes, levels, ordered=True) data = DataFrame(np.random.randn(100, 4)) result = data.groupby(cats, observed=False).mean() expected = data.groupby(np.asarray(cats), observed=False).mean() exp_idx = CategoricalIndex(levels, categories=cats.categories, ordered=True) expected = expected.reindex(exp_idx) assert_frame_equal(result, expected) grouped = data.groupby(cats, observed=False) desc_result = grouped.describe() idx = cats.codes.argsort() ord_labels = np.asarray(cats).take(idx) ord_data = data.take(idx) exp_cats = Categorical(ord_labels, ordered=True, categories=['foo', 'bar', 'baz', 'qux']) expected = ord_data.groupby(exp_cats, sort=False, observed=False).describe() assert_frame_equal(desc_result, expected) # GH 10460 expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp) exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'] * 4) tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp)
def famille(temporary_store=None, year=None): assert temporary_store is not None assert year is not None log.info('step_04_famille: construction de la table famille') # On suit la méthode décrite dans le Guide ERF_2002_rétropolée page 135 # TODO: extraire ces valeurs d'un fichier de paramètres de législation if year == 2006: smic = 1254 elif year == 2007: smic = 1280 elif year == 2008: smic = 1308 elif year == 2009: smic = 1337 else: log.info("smic non défini") # TODO check if we can remove acteu forter etc since dealt with in 01_pre_proc log.info('Etape 1 : préparation de base') log.info(' 1.1 : récupération de indivi') indivi = temporary_store['indivim_{}'.format(year)] indivi['year'] = year indivi["noidec"] = indivi["declar1"].str[0:2].copy( ) # Not converted to int because some NaN are present indivi["agepf"] = ( (indivi.naim < 7) * (indivi.year - indivi.naia) + (indivi.naim >= 7) * (indivi.year - indivi.naia - 1)).astype( object ) # TODO: naia has some NaN but naim do not and then should be an int indivi = indivi[~((indivi.lien == 6) & (indivi.agepf < 16) & (indivi.quelfic == "EE"))].copy() assert_dtype(indivi.year, "int64") for series_name in ['agepf', 'noidec']: # integer with NaN assert_dtype(indivi[series_name], "object") log.info(' 1.2 : récupération des enfants à naître') individual_variables = [ 'acteu', 'actrec', 'agepf', 'agepr', 'cohab', 'contra', 'declar1', 'forter', 'ident', 'lien', 'lpr', 'mrec', 'naia', 'naim', 'noi', 'noicon', 'noidec', 'noimer', 'noindiv', 'noiper', 'persfip', 'quelfic', 'retrai', 'rga', 'rstg', 'sexe', 'stc', 'titc', 'year', 'ztsai', ] enfants_a_naitre = temporary_store['enfants_a_naitre_{}'.format( year)][individual_variables].copy() enfants_a_naitre.drop_duplicates('noindiv', inplace=True) log.info(u"""" Il y a {} enfants à naitre avant de retirer ceux qui ne sont pas enfants de la personne de référence """.format(len(enfants_a_naitre.index))) enfants_a_naitre = enfants_a_naitre[enfants_a_naitre.lpr == 3].copy() enfants_a_naitre = enfants_a_naitre[~( enfants_a_naitre.noindiv.isin(indivi.noindiv.values))].copy() log.info(u"""" Il y a {} enfants à naitre après avoir retiré ceux qui ne sont pas enfants de la personne de référence """.format(len(enfants_a_naitre.index))) # PB with vars "agepf" "noidec" "year" NOTE: quels problèmes ? JS log.info(u" 1.3 : création de la base complète") base = concat([indivi, enfants_a_naitre]) log.info(u"base contient {} lignes ".format(len(base.index))) base['noindiv'] = (100 * base.ident + base['noi']).astype(int) base['moins_de_15_ans_inclus'] = base.agepf < 16 base['jeune_non_eligible_rsa'] = (base.agepf >= 16) & (base.agepf < AGE_RSA) base['jeune_eligible_rsa'] = base.agepf >= AGE_RSA base['ztsai'].fillna(0, inplace=True) base['smic55'] = base['ztsai'] >= (smic * 12 * 0.55 ) # 55% du smic mensuel brut base['famille'] = 0 base['kid'] = False for series_name in [ 'kid', 'moins_de_15_ans_inclus', 'jeune_non_eligible_rsa', 'jeune_eligible_rsa', 'smic55' ]: assert_dtype(base[series_name], "bool") assert_dtype(base.famille, "int") # TODO: remove or clean from NA assert_dtype(base.ztsai, "int") log.info(u"Etape 2 : On cherche les enfants ayant père et/ou mère") personne_de_reference = base[['ident', 'noi']][base.lpr == 1].copy() personne_de_reference['noifam'] = ( 100 * personne_de_reference.ident + personne_de_reference['noi']).astype(int) personne_de_reference = personne_de_reference[['ident', 'noifam']].copy() log.info(u"length personne_de_reference : {}".format( len(personne_de_reference.index))) nof01 = base[(base.lpr.isin([1, 2])) | ((base.lpr == 3) & (base.moins_de_15_ans_inclus)) | ((base.lpr == 3) & (base.jeune_non_eligible_rsa) & (~base.smic55))].copy() log.info('longueur de nof01 avant merge : {}'.format(len(nof01.index))) nof01 = nof01.merge(personne_de_reference, on='ident', how='outer') nof01['famille'] = 10 nof01['kid'] = ((nof01.lpr == 3) & (nof01.moins_de_15_ans_inclus)) | ( (nof01.lpr == 3) & (nof01.jeune_non_eligible_rsa) & ~(nof01.smic55)) for series_name in ['famille', 'noifam']: assert_dtype(nof01[series_name], "int") assert_dtype(nof01.kid, "bool") famille = nof01.copy() del nof01 control_04(famille, base) log.info(u" 2.1 : identification des couples") # l'ID est le noi de l'homme hcouple = subset_base(base, famille) hcouple = hcouple[(hcouple.cohab == 1) & (hcouple.lpr >= 3) & (hcouple.sexe == 1)].copy() hcouple['noifam'] = (100 * hcouple.ident + hcouple.noi).astype(int) hcouple['famille'] = 21 for series_name in ['famille', 'noifam']: assert_dtype(hcouple[series_name], "int") log.info(u"longueur hcouple : ".format(len(hcouple.index))) log.info(u" 2.2 : attributing the noifam to the wives") fcouple = base[~(base.noindiv.isin(famille.noindiv.values))].copy() fcouple = fcouple[(fcouple.cohab == 1) & (fcouple.lpr >= 3) & (fcouple.sexe == 2)].copy() # l'identifiant de la famille est celui du conjoint de la personne de référence du ménage fcouple['noifam'] = (100 * fcouple.ident + fcouple.noicon).astype(int) fcouple['famille'] = 22 for series_name in ['famille', 'noifam']: assert_dtype(fcouple[series_name], "int") log.info(u"Il y a {} enfants avec parents en fcouple".format( len(fcouple.index))) famcom = fcouple.merge(hcouple, on='noifam', how='outer') log.info(u"longueur fancom après fusion : {}".format(len(famcom.index))) fcouple = fcouple.merge( famcom ) # TODO : check s'il ne faut pas faire un inner merge sinon présence de doublons log.info(u"longueur fcouple après fusion : {}".format(len(fcouple.index))) famille = concat([famille, hcouple, fcouple], join='inner') control_04(famille, base) log.info(u"Etape 3: Récupération des personnes seules") log.info(u" 3.1 : personnes seules de catégorie 1") seul1 = base[~(base.noindiv.isin(famille.noindiv.values))].copy() seul1 = seul1[(seul1.lpr.isin([3, 4])) & ((seul1.jeune_non_eligible_rsa & seul1.smic55) | seul1.jeune_eligible_rsa) & (seul1.cohab == 1) & (seul1.sexe == 2)].copy() if len(seul1.index) > 0: seul1['noifam'] = (100 * seul1.ident + seul1.noi).astype(int) seul1['famille'] = 31 for series_name in ['famille', 'noifam']: assert_dtype(seul1[series_name], "int") famille = concat([famille, seul1]) control_04(famille, base) log.info(u" 3.1 personnes seules de catégorie 2") seul2 = base[~(base.noindiv.isin(famille.noindiv.values))].copy() seul2 = seul2[(seul2.lpr.isin([3, 4])) & seul2.jeune_non_eligible_rsa & seul2.smic55 & (seul2.cohab != 1)].copy() seul2['noifam'] = (100 * seul2.ident + seul2.noi).astype(int) seul2['famille'] = 32 for series_name in ['famille', 'noifam']: assert_dtype(seul2[series_name], "int") famille = concat([famille, seul2]) control_04(famille, base) log.info(u" 3.3 personnes seules de catégorie 3") seul3 = subset_base(base, famille) seul3 = seul3[(seul3.lpr.isin([3, 4])) & seul3.jeune_eligible_rsa & (seul3.cohab != 1)].copy() # TODO: CHECK erreur dans le guide méthodologique ERF 2002 lpr 3,4 au lieu de 3 seulement seul3['noifam'] = (100 * seul3.ident + seul3.noi).astype(int) seul3['famille'] = 33 for series_name in ['famille', 'noifam']: assert_dtype(seul3[series_name], "int") famille = concat([famille, seul3]) control_04(famille, base) log.info(u" 3.4 : personnes seules de catégorie 4") seul4 = subset_base(base, famille) assert seul4.noimer.notnull().all() seul4 = seul4[(seul4.lpr == 4) & seul4.jeune_non_eligible_rsa & ~(seul4.smic55) & (seul4.noimer == 0) & (seul4.persfip == 'vous')].copy() if len(seul4.index) > 0: seul4['noifam'] = (100 * seul4.ident + seul4.noi).astype(int) seul4['famille'] = 34 famille = concat([famille, seul4]) for series_name in ['famille', 'noifam']: assert_dtype(seul4[series_name], "int") control_04(famille, base) log.info(u"Etape 4 : traitement des enfants") log.info(u" 4.1 : enfant avec mère") avec_mere = subset_base(base, famille) avec_mere = avec_mere[((avec_mere.lpr == 4) & ((avec_mere.jeune_non_eligible_rsa == 1) | (avec_mere.moins_de_15_ans_inclus == 1)) & (avec_mere.noimer > 0))].copy() avec_mere['noifam'] = (100 * avec_mere.ident + avec_mere.noimer).astype(int) avec_mere['famille'] = 41 avec_mere['kid'] = True for series_name in ['famille', 'noifam']: assert_dtype(avec_mere[series_name], "int") assert_dtype(avec_mere.kid, "bool") # On récupère les mères des enfants mereid = DataFrame(avec_mere['noifam'].copy( )) # Keep a DataFrame instead of a Series to deal with rename and merge # Ces mères peuvent avoir plusieurs enfants, or il faut unicité de l'identifiant mereid.rename(columns={'noifam': 'noindiv'}, inplace=True) mereid.drop_duplicates(inplace=True) mere = mereid.merge(base) mere['noifam'] = (100 * mere.ident + mere.noi).astype(int) mere['famille'] = 42 for series_name in ['famille', 'noifam']: assert_dtype(mere[series_name], "int") avec_mere = avec_mere[avec_mere.noifam.isin(mereid.noindiv.values)].copy() log.info(u"Contrôle de famille après ajout des pères") control_04(mere, base) famille = famille[~(famille.noindiv.isin(mere.noindiv.values))].copy() control_04(famille, base) # on retrouve les conjoints des mères assert mere.noicon.notnull().all() conj_mereid = mere.loc[mere.noicon > 0, ['ident', 'noicon', 'noifam']].copy() conj_mereid['noindiv'] = 100 * conj_mereid.ident + conj_mereid.noicon assert_dtype(conj_mereid[series_name], "int") conj_mereid = conj_mereid[['noindiv', 'noifam']].copy() conj_mereid = conj_mereid.merge(base) control_04(conj_mereid, base) conj_mere = conj_mereid.merge(base) conj_mere['famille'] = 43 for series_name in ['famille', 'noifam']: assert_dtype(conj_mereid[series_name], "int") famille = famille[~(famille.noindiv.isin(conj_mere.noindiv.values))].copy() famille = concat([famille, avec_mere, mere, conj_mere]) control_04(famille, base) del avec_mere, mere, conj_mere, mereid, conj_mereid log.info(u" 4.2 : enfants avec père") avec_pere = subset_base(base, famille) assert avec_pere.noiper.notnull().all() avec_pere = avec_pere[(avec_pere.lpr == 4) & ((avec_pere.jeune_non_eligible_rsa == 1) | (avec_pere.moins_de_15_ans_inclus == 1)) & (avec_pere.noiper > 0)] avec_pere['noifam'] = (100 * avec_pere.ident + avec_pere.noiper).astype(int) avec_pere['famille'] = 44 avec_pere['kid'] = True # TODO: hack to deal with the problem of presence of NaN in avec_pere # avec_pere.dropna(subset = ['noifam'], how = 'all', inplace = True) assert avec_pere['noifam'].notnull().all(), 'presence of NaN in avec_pere' for series_name in ['famille', 'noifam']: assert_dtype(avec_pere[series_name], "int") assert_dtype(avec_pere.kid, "bool") pereid = DataFrame( avec_pere['noifam'] ) # Keep a DataFrame instead of a Series to deal with rename and merge pereid.rename(columns={'noifam': 'noindiv'}, inplace=True) pereid.drop_duplicates(inplace=True) pere = pereid.merge(base) pere['noifam'] = (100 * pere.ident + pere.noi).astype(int) pere['famille'] = 45 famille = famille[~(famille.noindiv.isin(pere.noindiv.values))].copy() # On récupère les conjoints des pères assert pere.noicon.notnull().all() conj_pereid = pere.loc[pere.noicon > 0, ['ident', 'noicon', 'noifam']].copy() conj_pereid['noindiv'] = (100 * conj_pereid.ident + conj_pereid.noicon).astype(int) conj_pereid = conj_pereid[['noindiv', 'noifam']].copy() conj_pere = conj_pereid.merge(base) control_04(conj_pere, base) if len(conj_pere.index) > 0: conj_pere['famille'] = 46 for series_name in ['famille', 'noifam']: assert_dtype(conj_pere[series_name], "int") famille = famille[~(famille.noindiv.isin(conj_pere.noindiv.values))].copy() famille = concat([famille, avec_pere, pere, conj_pere]) log.info(u"Contrôle de famille après ajout des pères") control_04(famille, base) del avec_pere, pere, pereid, conj_pere, conj_pereid log.info(u" 4.3 : enfants avec déclarant") avec_dec = subset_base(base, famille) avec_dec = avec_dec[(avec_dec.persfip == "pac") & (avec_dec.lpr == 4) & ( (avec_dec.jeune_non_eligible_rsa & ~(avec_dec.smic55)) | (avec_dec.moins_de_15_ans_inclus == 1))] avec_dec['noifam'] = (100 * avec_dec.ident + avec_dec.noidec.astype('int')).astype('int') avec_dec['famille'] = 47 avec_dec['kid'] = True for series_name in ['famille', 'noifam']: assert_dtype(avec_dec[series_name], "int") assert_dtype(avec_dec.kid, "bool") control_04(avec_dec, base) # on récupère les déclarants pour leur attribuer une famille propre declarant_id = DataFrame( avec_dec['noifam'].copy()).rename(columns={'noifam': 'noindiv'}) declarant_id.drop_duplicates(inplace=True) dec = declarant_id.merge(base) dec['noifam'] = (100 * dec.ident + dec.noi).astype(int) dec['famille'] = 48 for series_name in ['famille', 'noifam']: assert_dtype(dec[series_name], "int") famille = famille[~(famille.noindiv.isin(dec.noindiv.values))].copy() famille = concat([famille, avec_dec, dec]) del dec, declarant_id, avec_dec control_04(famille, base) log.info(u"Etape 5 : Récupération des enfants fip") log.info(u" 5.1 : Création de la df fip") individual_variables_fip = [ 'acteu', 'actrec', 'agepf', 'agepr', 'cohab', 'contra', 'declar1', 'forter', 'ident', 'lien', 'lpr', 'mrec', 'naia', 'naim', 'noi', 'noicon', 'noidec', 'noimer', 'noindiv', 'noiper', 'persfip', 'quelfic', 'retrai', 'rga', 'rstg', 'sexe', 'stc', 'titc', 'year', 'ztsai', ] fip = temporary_store['fipDat_{}'.format( year)][individual_variables_fip].copy() # Variables auxilaires présentes dans base qu'il faut rajouter aux fip' # WARNING les noindiv des fip sont construits sur les ident des déclarants # pas d'orvelap possible avec les autres noindiv car on a des noi =99, 98, 97 ,...' fip['moins_de_15_ans_inclus'] = (fip.agepf < 16) fip['jeune_non_eligible_rsa'] = ((fip.agepf >= 16) & (fip.agepf <= 20)) fip['jeune_eligible_rsa'] = (fip.agepf >= 21) fip['smic55'] = (fip.ztsai >= smic * 12 * 0.55) fip['famille'] = 0 fip['kid'] = False for series_name in [ 'kid', 'moins_de_15_ans_inclus', 'jeune_non_eligible_rsa', 'jeune_eligible_rsa', 'smic55' ]: assert_dtype(fip[series_name], "bool") for series_name in ['famille']: assert_dtype(fip[series_name], "int") # # base <- rbind(base,fip) # # table(base$quelfic) # # enfant_fip <- base[(!base$noindiv %in% famille$noindiv),] # # enfant_fip <- subset(enfant_fip, (quelfic=="FIP") & (( (agepf %in% c(19,20)) & !smic55 ) | (naia==year & rga=='6')) ) # TODO check year ou year-1 ! # # enfant_fip <- within(enfant_fip,{ # # noifam=100*ident+noidec # # famille=50 # # kid=TRUE}) # # # ident=NA}) # TODO : je ne sais pas quoi mettre un NA fausse les manips suivantes # # famille <- rbind(famille,enfant_fip) # # # # # TODO: En 2006 on peut faire ce qui suit car tous les parents fip sont déjà dans une famille # # parent_fip <- famille[famille$noindiv %in% enfant_fip$noifam,] # # any(enfant_fip$noifam %in% parent_fip$noindiv) # # parent_fip <- within(parent_fip,{ # # noifam <- noindiv # # famille <- 51 # # kid <- FALSE}) # # famille[famille$noindiv %in% enfant_fip$noifam,] <- parent_fip # # # TODO quid du conjoint ? log.info(u" 5.2 : extension de base avec les fip") base_ = concat([base, fip]) enfant_fip = subset_base(base_, famille) enfant_fip = enfant_fip[(enfant_fip.quelfic == "FIP") & ( (enfant_fip.agepf.isin([19, 20]) & ~(enfant_fip.smic55)) | ( (enfant_fip.naia == enfant_fip.year - 1) & (enfant_fip.rga.astype('int') == 6)))].copy() enfant_fip['noifam'] = (100 * enfant_fip.ident + enfant_fip.noidec).astype(int) enfant_fip['famille'] = 50 enfant_fip['kid'] = True enfant_fip['ident'] = None # TODO: should we really do this ? assert_dtype(enfant_fip.kid, "bool") for series_name in ['famille', 'noifam']: assert_dtype(enfant_fip[series_name], "int") control_04(enfant_fip, base) famille = concat([famille, enfant_fip]) base = concat([base, enfant_fip]) parent_fip = famille[famille.noindiv.isin(enfant_fip.noifam.values)].copy() assert (enfant_fip.noifam.isin(parent_fip.noindiv.values)).any(), \ "{} doublons entre enfant_fip et parent fip !".format((enfant_fip.noifam.isin(parent_fip.noindiv.values)).sum()) parent_fip['noifam'] = parent_fip['noindiv'].values.copy() parent_fip['famille'] = 51 parent_fip['kid'] = False log.info(u"Contrôle de parent_fip") control_04(parent_fip, base) control_04(famille, base) famille = famille.merge(parent_fip, how='outer') del enfant_fip, fip, parent_fip gc.collect() # duplicated_individuals = famille.noindiv.duplicated() # TODO: How to prevent failing in the next assert and avoiding droppping duplicates ? # assert not duplicated_individuals.any(), "{} duplicated individuals in famille".format( # duplicated_individuals.sum()) famille = famille.drop_duplicates(subset='noindiv', take_last=True) control_04(famille, base) # # message('Etape 6 : non attribué') # # non_attribue1 <- base[(!base$noindiv %in% famille$noindiv),] # # non_attribue1 <- subset(non_attribue1, # # (quelfic!="FIP") & (moins_de_15_ans_inclus | (jeune_non_eligible_rsa&(lien %in% c(1,2,3,4) & agepr>=35))) # # ) # # # On rattache les moins de 15 ans avec la PR (on a déjà éliminé les enfants en nourrice) # # non_attribue1 <- merge(pr,non_attribue1) # # non_attribue1 <- within(non_attribue1,{ # # famille <- ifelse(moins_de_15_ans_inclus,61,62) # # kid <- TRUE }) # # # # rm(pr) # # famille <- rbind(famille,non_attribue1) # # dup <- duplicated(famille$noindiv) # # table(dup) # # rm(non_attribue1) # # table(famille$famille, useNA="ifany") # # # # non_attribue2 <- base[(!base$noindiv %in% famille$noindiv) & (base$quelfic!="FIP"),] # # non_attribue2 <- within(non_attribue2,{ # # noifam <- 100*ident+noi # l'identifiant est celui du jeune */ # # kid<-FALSE # # famille<-63}) # # # # famille <- rbind(famille,non_attribue2) log.info(u"Etape 6 : gestion des non attribués") log.info(u" 6.1 : non attribués type 1") non_attribue1 = subset_base(base, famille) non_attribue1 = non_attribue1[~(non_attribue1.quelfic != 'FIP') & (non_attribue1.moins_de_15_ans_inclus | (non_attribue1.jeune_non_eligible_rsa & (non_attribue1.lien.isin(range(1, 5))) & (non_attribue1.agepr >= 35)))].copy() # On rattache les moins de 15 ans avec la PR (on a déjà éliminé les enfants en nourrice) non_attribue1 = non_attribue1.merge(personne_de_reference) control_04(non_attribue1, base) non_attribue1[ 'famille'] = 61 * non_attribue1.moins_de_15_ans_inclus + 62 * ~( non_attribue1.moins_de_15_ans_inclus) non_attribue1['kid'] = True assert_dtype(non_attribue1.kid, "bool") assert_dtype(non_attribue1.famille, "int") famille = concat([famille, non_attribue1]) control_04(famille, base) del personne_de_reference, non_attribue1 log.info(u" 6.2 : non attribué type 2") non_attribue2 = base[(~(base.noindiv.isin(famille.noindiv.values)) & (base.quelfic != "FIP"))].copy() non_attribue2['noifam'] = (100 * non_attribue2.ident + non_attribue2.noi).astype(int) non_attribue2['kid'] = False non_attribue2['famille'] = 63 assert_dtype(non_attribue2.kid, "bool") for series_name in ['famille', 'noifam']: assert_dtype(non_attribue2[series_name], "int") famille = concat([famille, non_attribue2], join='inner') control_04(famille, base) del non_attribue2 # Sauvegarde de la table famille log.info(u"Etape 7 : Sauvegarde de la table famille") log.info(u" 7.1 : Mise en forme finale") # TODO: nettoyer les champs qui ne servent plus à rien # famille['idec'] = famille['declar1'].str[3:11] # famille['idec'].apply(lambda x: str(x)+'-') # famille['idec'] += famille['declar1'].str[0:2] famille['chef'] = (famille['noifam'] == (100 * famille.ident + famille.noi)) assert_dtype(famille.chef, "bool") famille.reset_index(inplace=True) control_04(famille, base) log.info(u" 7.2 : création de la colonne rang") famille['rang'] = famille.kid.astype('int') while any(famille[(famille.rang != 0)].duplicated( subset=['rang', 'noifam'])): famille.rang.loc[famille.rang != 0] += famille[famille.rang != 0].copy( ).duplicated(subset=["rang", 'noifam']).values log.info(u"nb de rangs différents : {}".format( len(set(famille.rang.values)))) log.info(u" 7.3 : création de la colonne quifam et troncature") log.info(u"value_counts chef : \n {}".format( famille['chef'].value_counts())) log.info(u"value_counts kid :' \n {}".format( famille['kid'].value_counts())) famille['quifam'] = -1 # famille['quifam'] = famille['quifam'].where(famille['chef'].values, 0) # ATTENTTION : ^ stands for XOR famille.quifam = (0 + ((~famille['chef']) & (~famille['kid'])).astype(int) + famille.kid * famille.rang).astype('int') # TODO: Test a groupby to improve the following this (should be placed ) # assert famille['chef'].sum() == len(famille.noifam.unique()), \ # 'The number of family chiefs {} is different from the number of families {}'.format( # famille['chef'].sum(), # len(famille.idfam.unique()) # ) # famille['noifam'] = famille['noifam'].astype('int') log.info(u"value_counts quifam : \n {}".format( famille['quifam'].value_counts())) famille = famille[['noindiv', 'quifam', 'noifam']].copy() famille.rename(columns={'noifam': 'idfam'}, inplace=True) log.info(u"Vérifications sur famille") # TODO: we drop duplicates if any duplicated_famillle_count = famille.duplicated( subset=['idfam', 'quifam']).sum() if duplicated_famillle_count > 0: log.info( u"There are {} duplicates of quifam inside famille, we drop them". format()) famille.drop_duplicates(subset=['idfam', 'quifam'], inplace=True) # assert not(famille.duplicated(subset = ['idfam', 'quifam']).any()), \ # 'There are {} duplicates of quifam inside famille'.format( # famille.duplicated(subset = ['idfam', 'quifam']).sum()) temporary_store["famc_{}".format(year)] = famille del indivi, enfants_a_naitre
def label_processing(self, user_df: pd.DataFrame, item_df: pd.DataFrame, user_his_df: pd.DataFrame, label_df: pd.DataFrame, u_id_col: str, i_id_col: str, y_label: str, start_index=0, end_index=-1, fill_na=0, **kwargs): """ 对原始数据进行预处理,输入为pandas dataframe,输出为直接入模的numpy array Args: user_df: 静态数据的Dataframe item_df: 时序数据的Dataframe user_his_df: 时序数据的Dataframe label_df: Y标签的Dataframe training为True时起作用 u_id_col: user_id的列名 i_id_col: item_id的列名 start_index: 本次批量的开始位置 end_index: 本次批量的结束位置 y_label: Y标签列名 training为True时起作用 fill_na: 缺失值填充 """ if type(user_df) != pd.DataFrame or type(item_df) != pd.DataFrame or type(user_his_df) != pd.DataFrame\ or type(label_df) != pd.DataFrame: raise ValueError( "Error: Input X data must be Pandas.DataFrame format.\n输入数据必须是Pandas DataFrame格式!" ) # 填充缺失值 user_df = user_df.fillna(fill_na) item_df = item_df.fillna(fill_na) user_his_df = user_his_df.fillna(fill_na) # 先对离散字段做编码 for col in self.i_discrete_cols: le = self.le_dict[col] tmp = item_df[col].map(lambda s: fill_na if s not in le.classes_ else s) item_df[col + str('_transformed')] = le.transform(tmp) if end_index == -1: end_index = len(user_df) u_ids = user_df.iloc[start_index:end_index][[u_id_col]] # 取出本次batch的数据 user_batch_df = user_df[user_df[u_id_col].isin(u_ids[u_id_col])] # 笛卡尔积之前先做编码 for col in self.u_discrete_cols: le = self.le_dict[col] tmp = user_batch_df[col].map(lambda s: fill_na if s not in le.classes_ else s) user_batch_df[col + str('_transformed')] = le.transform(tmp) user_batch_his_df = user_his_df[user_his_df[u_id_col].isin( u_ids[u_id_col])] # 先对离散字段做编码 for col in self.u_history_cols: item_col_name = self.u_history_col_names[self.u_history_cols.index( col)] le = self.le_dict[item_col_name] tmp = user_batch_his_df[col].map(lambda s: fill_na if s not in le.classes_ else s) user_batch_his_df[col + str('_transformed')] = le.transform(tmp) # 获取用户侧输入特征 label_df = label_df.drop_duplicates([u_id_col, i_id_col], keep='first') # 防止出现脏数据 print("label_df长度:" + str(len(label_df))) print("用户长度:" + str(len(user_batch_df))) basic_df = pd.merge(user_batch_df, label_df, how='inner', on=[u_id_col]) print("和用户拼接后长度:" + str(len(basic_df))) basic_df = pd.merge(basic_df, item_df, how='inner', on=[i_id_col]) print("和图片拼接后长度:" + str(len(basic_df))) basic_df['id'] = basic_df[u_id_col] + "@" + basic_df[i_id_col] user_batch_his_df = pd.merge(basic_df[['id', u_id_col]], user_batch_his_df, how='left', on=[u_id_col]) user_batch_his_df = user_batch_his_df.fillna(fill_na) # 对没有任何历史记录的用户填充 print("数据预处理完成") # 拼接输入特征 X_input = [ self.scalar_dict['scalar_user'].transform( basic_df[self.u_continue_cols]) ] for col in self.u_discrete_cols: X_input.append(basic_df[col + str('_transformed')]) print("用户基础特征处理完成") for col in self.u_history_cols: constant_fill_na = dict(zip(le.classes_, le.transform( le.classes_)))[fill_na] # 取出fill_na在编码后的值 user_batch_col_tmp = user_batch_his_df[['id', col+str('_transformed')]].groupby(['id'])\ .apply(lambda x: np.pad(x[col+str('_transformed')].values, (self.u_history_col_ts_step[col]-len(x[col+str('_transformed')].values), 0),'constant', constant_values=constant_fill_na) if len(x[col+str('_transformed')].values) < self.u_history_col_ts_step[col] else x[col+str('_transformed')].values) user_batch_col_tmp = np.stack(user_batch_col_tmp.values) X_input.append(user_batch_col_tmp) print("用户历史记录特征处理完成") # 拼接物品端输入特征 X_input.append(self.scalar_dict['scalar_item'].transform( basic_df[self.i_continue_cols])) for col in self.i_discrete_cols: X_input.append(basic_df[col + str('_transformed')]) print("物品特征处理完成") return X_input, basic_df[y_label].values, basic_df['id'].values
# -*- coding: utf-8 -* import pandas as pd import numpy as np from pandas import Series, DataFrame data = { 'chinese': [66, 95, 95, 90, 80, 80], 'english': [65, 85, 92, 88, 90, 90], 'math': [np.nan, 98, 96, 77, 90, 90] } df2 = DataFrame(data, index=[ 'zhangfei', 'guanyu', 'zhaoyun', 'huangzhong', 'dianwei', 'dianwei' ], columns=['chinese', 'english', 'math']) df2.drop_duplicates(inplace=True) print(df2) df2.rename(columns={ 'chinese': '语文', 'english': '英语', 'math': '数学' }, index={ 'zhangfei': '张飞', 'guanyu': '关羽', 'zhaoyun': '赵云', 'huangzhong': '黄忠', 'dianwei': '典韦' }, inplace=True) df2.isnull()