def handle_community(self, community, **options):
    # from mpl_toolkits.mplot3d import Axes3D
    canvas = pyplot.figure().gca(projection='3d')
    clothing_vectors = numpy.array([
        cast_elements_to_floats(individual["vectors"])
        for individual in community.kernel.individual_set.all()
    ])
    centroids, labels = kmeans2(clothing_vectors, 10, minit="points")
    clothing_frame = DataFrame()
    clothing_by_cluster = sorted(zip(labels, clothing_vectors), key=itemgetter(0))
    current_label = None
    for label, vector in clothing_by_cluster:
        if label != current_label:
            current_label = label
            clothing_frame = clothing_frame.append(Series(data=centroids[current_label]), ignore_index=True)
        clothing_frame = clothing_frame.append(Series(data=vector), ignore_index=True)
    # centroids_frame = DataFrame(centroids)
    # centroids_frame.T.plot()
    # centroids_frame.drop(range(20, 4096), axis=1, inplace=True)
    # print(centroids_frame.head())
    self.plot_data(canvas, clothing_frame, 'b')
    pyplot.show()
def getIndexChangeRate(startDate, endDate):
    df_result = DataFrame()
    # fetch the index history for each market and compute the daily high-low gap
    for mkt in ['sh', 'sz', 'zxb', 'cyb']:
        df = ts.get_hist_data(mkt, start=startDate, end=endDate).reset_index()
        df['gap'] = df['high'] - df['low']
        df['gap_rate'] = df['gap'] / df['close'] * 100
        df['mkt'] = mkt
        df_result = df_result.append(df)
    fileName = r'D:\stock\index_changeRate_' + startDate + '_' + endDate + '.csv'
    df_result = df_result.loc[:, ['date', 'mkt', 'close', 'volume', 'price_change', 'p_change', 'gap', 'gap_rate']]
    df_result = df_result.sort_values(by='date', ascending=False)
    df_result.to_csv(fileName, index=False)
class Record(object):
    def __init__(self):
        self.trade_history = DataFrame()
        self.position_history = DataFrame()
        self.portfolio_value_history = DataFrame()

    def update_trade(self, date, trade_type, symbol, amount, price):
        newtrade = DataFrame(
            {"Date": [date], "Trade_type": [trade_type], "Symbol": [symbol],
             "Amount": [amount], "Price": [price]}
        )
        self.trade_history = self.trade_history.append(newtrade, ignore_index=True)

    def update_position(self, date, p):
        newposition = DataFrame(
            {
                "Date": [date],
                "Symbol": [p.symbol],
                "Amount": [p.amount],
                "Avg_price": [p.avg_price],
                "Position_value": [p.position_value],
            }
        )
        self.position_history = self.position_history.append(newposition, ignore_index=True)

    def update_portfolio_value(self, date, port, pos, cash):
        newport = DataFrame({"Date": [date], "Portfolio_value": [port], "Position_value": [pos], "Cash": [cash]})
        self.portfolio_value_history = self.portfolio_value_history.append(newport, ignore_index=True)
def scrab_one_user(self, uid, num):
    # log in
    weiboLogin = WeiboLogin.WeiboLogin(self.username, self.pwd, self.header)
    weiboLogin.Login()
    # start fetching pages
    WBmsg = GetWeiboPage.getWeiboPage()
    WBmsg.body['uid'] = uid
    # build the storage structure for the Weibo data
    wb_detail = []
    wb_all = {}
    wb_all = wb_all.fromkeys(wb_detail, [])
    wb_frame = DataFrame(wb_all, index=[])
    for n in range(1, num):
        # build the page URL
        url = ('http://weibo.com/' + uid +
               '?is_search=0&visible=0&is_all=1&is_tag=0&profile_ftype=1&page=' + str(n))
        # print(WBmsg.get_firstpage(url, n))
        all_weibo = PagePlyr.parse_page(PagePlyr.get_json_content(WBmsg.get_firstpage(url, n)))
        wb_frame = wb_frame.append(all_weibo, ignore_index=True)
        all_weibo = PagePlyr.parse_page(PagePlyr.get_json_content(WBmsg.get_secondpage(url, n)))
        wb_frame = wb_frame.append(all_weibo, ignore_index=True)
        all_weibo = PagePlyr.parse_page(PagePlyr.get_json_content(WBmsg.get_thirdpage(url, n)))
        wb_frame = wb_frame.append(all_weibo, ignore_index=True)
        print(n)
    return wb_frame
class Append(object): goal_time = 0.2 def setup(self): self.df1 = DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) self.df2 = self.df1.copy() self.df2.index = np.arange(10000, 20000) self.mdf1 = self.df1.copy() self.mdf1['obj1'] = 'bar' self.mdf1['obj2'] = 'bar' self.mdf1['int1'] = 5 try: with warnings.catch_warnings(record=True): self.mdf1.consolidate(inplace=True) except: pass self.mdf2 = self.mdf1.copy() self.mdf2.index = self.df2.index def time_append_homogenous(self): self.df1.append(self.df2) def time_append_mixed(self): self.mdf1.append(self.mdf2)
def test_append_empty_dataframe(self): # Empty df append empty df df1 = DataFrame([]) df2 = DataFrame([]) result = df1.append(df2) expected = df1.copy() assert_frame_equal(result, expected) # Non-empty df append empty df df1 = DataFrame(np.random.randn(5, 2)) df2 = DataFrame() result = df1.append(df2) expected = df1.copy() assert_frame_equal(result, expected) # Empty df with columns append empty df df1 = DataFrame(columns=['bar', 'foo']) df2 = DataFrame() result = df1.append(df2) expected = df1.copy() assert_frame_equal(result, expected) # Non-Empty df with columns append empty df df1 = DataFrame(np.random.randn(5, 2), columns=['bar', 'foo']) df2 = DataFrame() result = df1.append(df2) expected = df1.copy() assert_frame_equal(result, expected)
def get_endpoint_timeframe(self): result_df = DataFrame() res_len = self._make_req(self.chunk_start, self.chunk_end) self._wait_for_rate_limit() # check to see if there are possibly more results to get if close to max_result # this will make additional requests until either the results are smaller than 9k or the timeframe is 1day if res_len / self.max_results > .90: delta = self.chunk_end - self.chunk_start step_size = math.floor(delta.days / 2) self.chunk_end = self.chunk_start + timedelta(days=step_size) # if step is greater than a day make request if self.chunk_start != self.chunk_end: self.get_endpoint_timeframe() # if no step save data and just increment another day else: self.chunk_start = self.chunk_end + timedelta(days=1) self.chunk_end = self.chunk_end + timedelta(days=1) self.get_endpoint_timeframe() # parse & append results to dataframe df = self._parse_json() result_df = result_df.append(df) # pick up where we left off from chunking elif self.chunk_end != self.end_dt: self.chunk_start = self.chunk_end self.chunk_end = self.end_dt self.get_endpoint_timeframe() # parse & append results to dataframe df = self._parse_json() result_df = result_df.append(df) return result_df
def get_sex_type():
    file_name = 'data/info_train.csv'
    y = pd.read_csv(file_name, header=None, index_col=0)
    male_id = y[y[1] < 7].index
    m = DataFrame([0] * male_id.size, index=male_id, columns=['sex'])
    female_id = y[y[1] > 6].index
    f = DataFrame([1] * female_id.size, index=female_id, columns=['sex'])
    m.append(f).to_csv('data/train_sex.csv')
def get_topwords(self, countries, thresh=10, tf_idf=False):
    tw = DataFrame()
    for r in range(len(self.df)):
        if self.df.loc[r, 'country_id'] in countries:
            if tf_idf:
                tw = tw.append(self.tf_idf.loc[r, :])
            else:
                tw = tw.append(self.df.loc[r, :])
    # Series.order() was removed from pandas; sort_values() is the replacement
    return tw.mean().sort_values(ascending=False)[:thresh]
def slide_21():
    import json
    db = json.load(open(FOODJSONPATH))
    print(len(db))
    print(db[0].keys())
    print(db[0]['nutrients'][0])

    nutrients = DataFrame(db[0]['nutrients'])
    print(nutrients[:7])

    info_keys = ['description', 'group', 'id', 'manufacturer']
    info = DataFrame(db, columns=info_keys)
    print(info[:5])
    print(pd.value_counts(info.group)[:10])

    print("Now process all of the nutrients")
    nutrients = []
    for rec in db:
        fnuts = DataFrame(rec['nutrients'])
        fnuts['id'] = rec['id']
        nutrients.append(fnuts)
    nutrients = pd.concat(nutrients, ignore_index=True)

    print("There are quite a few duplicates")
    print(nutrients.duplicated().sum())
    nutrients = nutrients.drop_duplicates()

    print("Both info and nutrients have 'description' and 'group' columns, so rename them")
    col_mapping = {'description': 'food', 'group': 'fgroup'}
    info = info.rename(columns=col_mapping, copy=False)
    col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}
    nutrients = nutrients.rename(columns=col_mapping, copy=False)

    ndata = pd.merge(nutrients, info, on='id', how='outer')
    print(ndata.iloc[30000])

    result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
    result['Zinc, Zn'].sort_values().plot(kind='barh')
    plt.show()

    by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])
    get_maximum = lambda x: x.xs(x.value.idxmax())
    get_minimum = lambda x: x.xs(x.value.idxmin())
    max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]
    max_foods.food = max_foods.food.str[:50]
    print(max_foods.loc['Amino Acids']['food'])
def test_append(self): begin_index = self.frame.index[:5] end_index = self.frame.index[5:] begin_frame = self.frame.reindex(begin_index) end_frame = self.frame.reindex(end_index) appended = begin_frame.append(end_frame) assert_almost_equal(appended['A'], self.frame['A']) del end_frame['A'] partial_appended = begin_frame.append(end_frame) self.assertIn('A', partial_appended) partial_appended = end_frame.append(begin_frame) self.assertIn('A', partial_appended) # mixed type handling appended = self.mixed_frame[:5].append(self.mixed_frame[5:]) assert_frame_equal(appended, self.mixed_frame) # what to test here mixed_appended = self.mixed_frame[:5].append(self.frame[5:]) mixed_appended2 = self.frame[:5].append(self.mixed_frame[5:]) # all equal except 'foo' column assert_frame_equal( mixed_appended.reindex(columns=['A', 'B', 'C', 'D']), mixed_appended2.reindex(columns=['A', 'B', 'C', 'D'])) # append empty empty = DataFrame({}) appended = self.frame.append(empty) assert_frame_equal(self.frame, appended) self.assertIsNot(appended, self.frame) appended = empty.append(self.frame) assert_frame_equal(self.frame, appended) self.assertIsNot(appended, self.frame) # overlap self.assertRaises(ValueError, self.frame.append, self.frame, verify_integrity=True) # new columns # GH 6129 df = DataFrame({'a': {'x': 1, 'y': 2}, 'b': {'x': 3, 'y': 4}}) row = Series([5, 6, 7], index=['a', 'b', 'c'], name='z') expected = DataFrame({'a': {'x': 1, 'y': 2, 'z': 5}, 'b': { 'x': 3, 'y': 4, 'z': 6}, 'c': {'z': 7}}) result = df.append(row) assert_frame_equal(result, expected)
def concatenate_years_data():
    '''This function combines the dataframes for each year into one dataframe
    consisting of the births data from all of the years.'''
    years = np.array(range(1880, 2011))  # These are all of the years for which we have data
    names_dir = 'C:\\Users\\Jormak\\PycharmProjects\\PANDAS_Book\\pydata-book\\ch02\\names'
    if os.getcwd() != names_dir:
        os.chdir(names_dir)
    all_years = []
    for year in years:
        # note that read_csv can read .txt files too
        one_year = pd.read_csv('yob' + str(year) + '.txt', names=['name', 'sex', 'births'])
        all_years.append(one_year)
    names = pd.concat(all_years, ignore_index=True)
    return names
def test_append_missing_cols(self): # GH22252 # exercise the conditional branch in append method where the data # to be appended is a list and does not contain all columns that are in # the target DataFrame df = DataFrame(np.random.randn(5, 4), columns=['foo', 'bar', 'baz', 'qux']) dicts = [{'foo': 9}, {'bar': 10}] with tm.assert_produces_warning(None): result = df.append(dicts, ignore_index=True, sort=True) expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) assert_frame_equal(result, expected)
def test_append_concat(self): rng = date_range('5/8/2012 1:45', periods=10, freq='5T') ts = Series(np.random.randn(len(rng)), rng) df = DataFrame(np.random.randn(len(rng), 4), index=rng) result = ts.append(ts) result_df = df.append(df) ex_index = DatetimeIndex(np.tile(rng.values, 2)) tm.assert_index_equal(result.index, ex_index) tm.assert_index_equal(result_df.index, ex_index) appended = rng.append(rng) tm.assert_index_equal(appended, ex_index) appended = rng.append([rng, rng]) ex_index = DatetimeIndex(np.tile(rng.values, 3)) tm.assert_index_equal(appended, ex_index) # different index names rng1 = rng.copy() rng2 = rng.copy() rng1.name = 'foo' rng2.name = 'bar' assert rng1.append(rng1).name == 'foo' assert rng1.append(rng2).name is None
def handleBi5(infile, fileDataFrame):
    if os.path.getsize(infile) == 0:
        return fileDataFrame
    array = infile.split('/')
    print(array)
    alen = len(array)
    dateWithoutHour = int(datetime(int(array[alen - 4]), int(array[alen - 3]),
                                   int(array[alen - 2])).strftime("%s"))
    dateWithoutMilisec = (dateWithoutHour + int(array[alen - 1].split('_')[0].split('h')[0]) * 3600) * 1000
    subprocess.call("xz -dkc --suffix=bi5 " + infile + ">tmp.bin", shell=True)
    hdfDir = "./hdf/" + infile.split('/')[2]
    if not os.path.exists(hdfDir):
        os.makedirs(hdfDir)
    cvsFileName = hdfDir + "/" + infile.split('/')[3]
    if fileDataFrame.empty:
        if os.path.exists(cvsFileName):
            fileDataFrame = read_csv(cvsFileName, index_col=0)
        else:
            fileDataFrame = DataFrame()
    fileDataFrame = fileDataFrame.append(processBinFile("tmp.bin", dateWithoutMilisec))
    print(fileDataFrame.iloc[0])
    return fileDataFrame
def read_data(features, feat_path='out'):
    frame = DataFrame()
    for data_path, data_ids in data_paths_and_ids:
        frame = frame.append(frame_for_id(features, feat_path, data_ids, data_path))
    return frame
def test_append_list_of_series_dicts(self): df = DataFrame(np.random.randn(5, 4), columns=['foo', 'bar', 'baz', 'qux']) dicts = [x.to_dict() for idx, x in df.iterrows()] result = df.append(dicts, ignore_index=True) expected = df.append(df, ignore_index=True) assert_frame_equal(result, expected) # different columns dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4}, {'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}] result = df.append(dicts, ignore_index=True, sort=True) expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) assert_frame_equal(result, expected)
class matchbox: def __init__(self, articlepaths): self.num_exports = 0 self.num_articles_total = len(articlepaths) self.num_articles_matched = 0 self.num_matches = 0 self.dataframe = DataFrame() self.init_time = time.strftime("%Y-%m-%d_%H-%M-%S_") def update(self, matches): self.dataframe = self.dataframe.append(matches, ignore_index=True) self.num_articles_matched += 1 self.num_matches += len(matches) print('Matched {} places in article {} of {} ({:.2%} complete). ' 'Total: {}.'.format(len(matches), self.num_articles_matched, self.num_articles_total, self.num_articles_matched / self.num_articles_total, self.num_matches)) def empty_into_csv(self): self.num_exports += 1 outname = outdir + self.init_time + 'pubs_aegypti_' + str(self.num_exports) + '.csv' self.dataframe.to_csv(outname, encoding='utf-8') print('Wrote matches from chunk {} to {}.'.format(self.num_exports, outname)) del self.dataframe self.dataframe = DataFrame()
def frame_for_id(features, feat_path='out', data_ids=sts.sts12.train_ids, data_dir='STS2012-train'): frame = DataFrame() for data_id in data_ids: data = {} for feat_id in features: data_id_dir = data_id[9:] if data_id.startswith("surprise.") else data_id feat_fn = os.path.join(feat_path, data_dir, data_id_dir, "%s.txt" % feat_id) data[feat_id] = series_from_feat(feat_fn) new_frame = DataFrame(data) new_frame['data_id'] = data_id gs_fn = os.path.join(repos_dir, 'data', data_dir, "STS.gs.%s.txt" % data_id) if os.path.exists(gs_fn): new_frame['gs'] = Series(loadtxt(gs_fn)) else: new_frame['gs'] = None frame = frame.append(new_frame) frame['data_set'] = data_dir return frame
def _extract_data(file_name, filters, fields=None, summary=None, classname='Table',
                  mode='walk', hash=''):
    '''
    Not meant for direct use.  This is broken out of :func:`extract_data` so we
    can wrap the code in a caching decorator to speed up loading of data from
    disk.  The hash is created by :func:`extract_data` to ensure that the cache
    is cleared if the last modified time changes.  Note that if you move the
    file to a different folder, this does not clear the cache.
    '''
    log.info('... No cached copy of data found, reloading data')
    with tables.openFile(file_name, 'r') as h:
        data = DataFrame()
        if mode == 'walk':
            iterator = walk_nodes(h.root, filters, classname)
        elif mode == 'pattern':
            iterator = p_iter_nodes(h.root, filters)
        else:
            raise ValueError('Unsupported mode {}'.format(mode))
        for node in iterator:
            log.info('... Found node %s', node._v_pathname)
            if type(node) == tables.Table:
                frame = extract_node_data(node, fields, summary)
                data = data.append(frame, ignore_index=True)
            else:
                raise NotImplementedError
    return data
def train_data_construct(bins, train_set, iteration, realtime=False):
    train_bins = defaultdict(tuple)
    print('start to construct the train data bins')
    if realtime:
        idx = 0
        for bin in bins:
            if len(bin) > 0:
                feature_bin = DataFrame()
                lable_bin = Series()
                for uid in bin:
                    tmp = train_set[train_set['product_uid'] == int(uid)]
                    if not tmp.empty:
                        feature_bin = feature_bin.append(tmp)  # should drop the relevance data here
                        lable_bin = lable_bin.append(tmp['relevance'])
                train_bins[idx] = (feature_bin, lable_bin)
                print(len(train_bins[idx][0]), ' entries in bin', idx)
                # if idx == 0:
                #     feature_bin.to_csv('feature_bin.csv')
                idx += 1
        f1 = open('../data/train_bins' + str(iteration) + '.pkl', 'wb')
        pk.dump(train_bins, f1)
    else:
        f1 = open('../data/train_bins' + str(iteration) + '.pkl', 'rb')
        train_bins = pk.load(f1)
    print('finish constructing training bins')
    return train_bins
def test_append_length0_frame(self): df = DataFrame(columns=['A', 'B', 'C']) df3 = DataFrame(index=[0, 1], columns=['A', 'B']) df5 = df.append(df3) expected = DataFrame(index=[0, 1], columns=['A', 'B', 'C']) assert_frame_equal(df5, expected)
def main(): logger = get_root_logger() get_header(logger, 'LOADING PROJECTIONS') client = APIClient() # grab dataframe shape from a trial run data = client.get_data('weekly-projections', 'json', 'QB') test_df = json_normalize(data['Projections']) # get DF structure from columns in test_df cols = test_df.columns df = DataFrame(columns=cols) # grab current week current_week = test_df.week.values[0] # loop through all weeks up to current week for wk in [str(x) for x in range(int(current_week))]: logger.info('Processing projections for week {0}'.format(int(wk) + 1)) # loop through all positions for pos in ['QB', 'RB', 'WR', 'TE', 'K', 'DEF']: tmp_data = client.get_data('weekly-projections', 'json', pos, wk) tmp_df = json_normalize(tmp_data['Projections']) df = df.append(tmp_df) # import this df directly to PG DB conn = DBClient() conn.load(df, 'projections', schema='raw', if_exists='replace')
def test_append_dtype_coerce(self): # GH 4993 # appending with datetime will incorrectly convert datetime64 import datetime as dt from pandas import NaT df1 = DataFrame(index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0), dt.datetime(2013, 1, 2, 0, 0)], columns=['start_time']) df2 = DataFrame(index=[4, 5], data=[[dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 3, 6, 10)], [dt.datetime(2013, 1, 4, 0, 0), dt.datetime(2013, 1, 4, 7, 10)]], columns=['start_time', 'end_time']) expected = concat([Series([NaT, NaT, dt.datetime(2013, 1, 3, 6, 10), dt.datetime(2013, 1, 4, 7, 10)], name='end_time'), Series([dt.datetime(2013, 1, 1, 0, 0), dt.datetime(2013, 1, 2, 0, 0), dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 4, 0, 0)], name='start_time')], axis=1) result = df1.append(df2, ignore_index=True) assert_frame_equal(result, expected)
def OnRtnTrade(self, Trade):
    """Trade (fill) report callback"""
    # print('OnRtnTrade:', Trade)
    print('OnRtnTrade:\n', Utils.code_transform(Trade))
    PyCTP_Trader_API.dfOnRtnTrade = DataFrame.append(PyCTP_Trader_API.dfOnRtnTrade,
                                                     other=Utils.code_transform(Trade),
                                                     ignore_index=True)
def test(dataset, overshoot_threshold):
    from numpy import where, zeros
    from sklearn.neighbors import KernelDensity
    folder = make_issue_specific_figure_folder('108 cluster after removing outliers', dataset)
    fit, par, gen, ids = IO.load_pickled_generation_dataframe(dataset)
    o = where(fit.overshoot > overshoot_threshold)[0]
    # not_o = where(fit.overshoot <= overshoot_threshold)[0]
    par = par.drop(o)
    fit = fit.drop(o)
    g1 = par.groupby('ssmm_nAgents').groups.keys()
    g2 = par.groupby('ssmm_latency_mu').groups.keys()
    # stdev_mean = zeros((len(g1), len(g2)))
    data = DataFrame(columns=['ssmm_nAgents', 'ssmm_latency_mu', 'stdev_mean'])
    for a, ssmm_nAgents in enumerate(g1):
        print(ssmm_nAgents)
        for l, ssmm_latency_mu in enumerate(g2):
            row = dict()
            try:
                row['stdev_mean'] = fit[(par['ssmm_latency_mu'] == ssmm_latency_mu) &
                                        (par['ssmm_nAgents'] == ssmm_nAgents)]['stdev'].mean()
                row['ssmm_nAgents'] = ssmm_nAgents
                row['ssmm_latency_mu'] = ssmm_latency_mu
                # print(row)
                data = data.append(row, ignore_index=True)
            except TypeError:
                print("ARGHS")
    X, Y = np.meshgrid(list(g1), list(g2))  # g1 and g2 already hold the group keys
    xy = np.vstack([Y.ravel(), X.ravel()]).T
    return data
def getFeatures(filename):
    csvfile = pd.read_csv(filename)   # Reading .csv files containing tweets.
    tweet_ids = csvfile["id_str"]     # Copying the 'id_str' attribute values.
    length = len(tweet_ids)           # Getting the length of 'tweet_ids'.
    d = {"links": ""}                 # Feature dict for one tweet; must exist before the DataFrame below.
    df = DataFrame(d, index=[0])      # Creating a DataFrame
    twitter = Twython(APP_KEY, APP_SECRET, oauth_version=2)
    ACCESS_TOKEN = twitter.obtain_access_token()
    twitter = Twython(APP_KEY, access_token=ACCESS_TOKEN)  # Generating Access Token
    for i in range(0, length):
        status = twitter.show_status(id=tweet_ids[i])
        d["id"] = status["id_str"].encode("utf-8")
        d["created_at"] = status["created_at"].encode("utf-8")
        d["from_user"] = status["user"]["screen_name"].encode("utf-8")
        d["followers_count"] = status["user"]["followers_count"]
        d["friends_count"] = status["user"]["friends_count"]
        d["statuses_count"] = status["user"]["statuses_count"]
        d["verified"] = status["user"]["verified"]
        d["location"] = 0 if (len(status["user"]["location"].encode("utf-8")) == 0) else 1
        d["text"] = status["text"].encode("utf-8")
        d["retweet_count"] = status["retweet_count"]
        d["favorite_count"] = status["favorite_count"]
        d["hashtag_count"] = len(status["entities"]["hashtags"])
        d["url_count"] = len(status["entities"]["urls"])
        d["mentions_count"] = len(status["entities"]["user_mentions"])
        if len(status["entities"]["urls"]) > 0:
            for x in range(0, len(status["entities"]["urls"])):
                d["links"] += status["entities"]["urls"][x]["expanded_url"].encode("utf-8") + " "
        df = df.append(d, ignore_index=True)
        df.to_csv("NSamples.csv")  # Saving file to disk
        d["links"] = ""
    print("\nAll Done!")
def prepare_modeling_results(model,all_data,expnoList,path,exptitle): results_table = DataFrame(columns=['h','X','X model','S', 'S model', 'P', 'P model', 'expno', 'XnOBS', 'XnPRED', 'SnOBS', 'SnPRED', 'PnOBS', 'PnPRED']) for dataset1,expno1 in zip(all_data,expnoList): results_table1 = model.simulation(dataset1, expno=expno1) # calculate normalized values # XnOBS, XnPRED = feature_scaling(results_table1['X'].values,results_table1['X model'].values) # SnOBS, SnPRED = feature_scaling(results_table1['S'].values,results_table1['S model'].values) # PnOBS, PnPRED = feature_scaling(results_table1['P'].values,results_table1['P model'].values) # standardization XnOBS, XnPRED = zero_mean_variance(results_table1['X'].values,results_table1['X model'].values) SnOBS, SnPRED = zero_mean_variance(results_table1['S'].values,results_table1['S model'].values) PnOBS, PnPRED = zero_mean_variance(results_table1['P'].values,results_table1['P model'].values) # and add them to the table as new columns results_table1['XnOBS'] = XnOBS results_table1['XnPRED'] = XnPRED results_table1['SnOBS'] = SnOBS results_table1['SnPRED'] = SnPRED results_table1['PnOBS'] = PnOBS results_table1['PnPRED'] = PnPRED # now add the current experiment to the big table of all experiments results_table = results_table.append(results_table1) results_table.to_html("{0}results_table_{1}.html".format(path,exptitle)) return results_table
def parse_page(html):
    # parse the HTML
    soup = BeautifulSoup(html, "lxml")
    # extract the post text
    text = soup.find_all(attrs={"node-type": "feed_list_content", "class": "WB_text W_f14"})
    # extract the repost counts
    forward = soup.find_all(attrs={"node-type": "forward_btn_text"})
    # extract the comment counts
    comment = soup.find_all(attrs={"node-type": "comment_btn_text"})
    # extract the dates
    date = soup.find_all(attrs={"node-type": "feed_list_item_date"})
    # extract the source platform
    source = soup.find_all(attrs={"action-type": "app_source"})
    # extract the like counts
    like = soup.select('li a[title="赞"]')
    # drop irrelevant entries
    for each in date:
        if each.has_attr('suda-data') == False:
            date.remove(each)
    for each in source:
        if each.has_attr('suda-uatrack') == False:
            source.remove(each)
    for each in like:
        if each.has_attr('suda-uatrack') == False:
            like.remove(each)
    # build the data dictionary
    wb_de = []
    wb_al = {}
    wb_al = wb_al.fromkeys(wb_de, [])
    wb_fr = DataFrame(wb_al, index=[])
    for i in range(len(text)):
        all_weibo = {"text": text_list(text)[i], "date": text_list(date)[i],
                     "source": text_list(source)[i], "forward": text_list(forward)[i],
                     "comment": text_list(comment)[i], "like": text_list(like)[i]}
        wb_fr = wb_fr.append(all_weibo, ignore_index=True)
    return wb_fr
def convertToPutJson(csv_file):
    df = cleanColumns(read_csv(csv_file))
    putColumns = ["method", "recordId", "body"]
    putDf = DataFrame(columns=putColumns)
    for recordId in df.index:
        print("Converting data for recordId {recordId}...".format(recordId=recordId))
        body = {}
        for col in df.columns:
            body[str(col).strip()] = [str(df[col][recordId]).strip()]
        putDfRow = DataFrame([["PUT", str(recordId), body]], columns=putColumns)
        putDf = putDf.append(putDfRow)
    json_file = sub("csv|txt", "json", csv_file)
    putDf.to_json(json_file, orient="records")
    with open(json_file, 'r') as target:
        putData = target.read()
    target = open(json_file, 'w')
    putData = putData.replace("},{", "}\n\n{")[1:-1]
    target.write(putData)
    target.close()
    print("Successfully created put data!")
    return json_file
df[df['target'] == 1].shape[0]  # imbalanced data!

# import pandas_ml as pdml
import random
from pandas import Series, DataFrame

d_class0 = df[df['target'] == 0]
d_class1 = df[df['target'] == 1]
numRows_class0 = len(d_class0.index)
numRows_class1 = len(d_class1.index)

# downsample class 0
d_class0_downsampled = d_class0.sample(n=numRows_class1, replace=False, random_state=42)

# new output data frame containing 1:1 class ratios
data_set = DataFrame()
data_set = data_set.append(d_class0_downsampled)
data_set = data_set.append(d_class1)

# shuffle the rows
numRows_data_set = len(data_set.index)
data_set = data_set.sample(n=numRows_data_set, replace=False)
cre_path = tkinter.filedialog.askopenfilenames()
root.destroy()
# for root, dirs, files in os.walk(r'' + cre_path + ''):
#     pass
files = []
for i in cre_path:
    files.append(i.split('/')[-1])
#####################################################
if cam_tye == '1':
    if cre_frmat == '1':
        # same + video
        # duplicate the rows
        new_data = DataFrame()
        for i in range(len(files)):
            new_data = new_data.append(raw_data)
        # assign Creative Type
        new_data['Creative Type'] = 'Video Page Post Ad'
        for file_index in range(len(files)):
            # creative name: every block of row_num rows corresponds to one file
            new_data['Video File Name'][row_num * file_index:row_num * (file_index + 1)] = files[file_index]
            # campaign name = old campaign name + creative name
            new_data['Campaign Name'][row_num * file_index:row_num * (file_index + 1)] = \
                new_data['Campaign Name'][row_num * file_index:row_num * (file_index + 1)] + '_' + files[file_index]
        # ad set name = old ad set name + creative name
        new_data['Ad Set Name'] = new_data['Ad Set Name'] + '_' + new_data['Video File Name']
        # ad name = creative name
def test_drop_duplicates(): pdf = DataFrame( { "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], "B": ["one", "one", "two", "two", "two", "two", "one", "two"], "C": [1, 1, 2, 2, 2, 2, 1, 2], "D": range(8), } ) gdf = cudf.DataFrame.from_pandas(pdf) # single column result = gdf.copy() result.drop_duplicates("AAA", inplace=True) expected = pdf.copy() expected.drop_duplicates("AAA", inplace=True) assert_df(result, expected) result = gdf.drop_duplicates("AAA", keep="last") expected = pdf.drop_duplicates("AAA", keep="last") assert_df(result, expected) result = gdf.drop_duplicates("AAA", keep=False) expected = pdf.drop_duplicates("AAA", keep=False) assert_df(result, expected) assert len(result) == 0 # multi column expected = pdf.loc[[0, 1, 2, 3]] result = gdf.drop_duplicates(np.array(["AAA", "B"])) assert_df(result, expected) result = pdf.drop_duplicates(np.array(["AAA", "B"])) assert_df(result, expected) result = gdf.drop_duplicates(("AAA", "B"), keep="last") expected = pdf.drop_duplicates(("AAA", "B"), keep="last") assert_df(result, expected) result = gdf.drop_duplicates(("AAA", "B"), keep=False) expected = pdf.drop_duplicates(("AAA", "B"), keep=False) assert_df(result, expected) # consider everything df2 = gdf.loc[:, ["AAA", "B", "C"]] result = df2.drop_duplicates() # in this case only expected = df2.drop_duplicates(["AAA", "B"]) assert_df(result, expected) result = df2.drop_duplicates(keep="last") expected = df2.drop_duplicates(["AAA", "B"], keep="last") assert_df(result, expected) result = df2.drop_duplicates(keep=False) expected = df2.drop_duplicates(["AAA", "B"], keep=False) assert_df(result, expected) # integers result = gdf.drop_duplicates("C") expected = pdf.drop_duplicates("C") assert_df(result, expected) result = gdf.drop_duplicates("C", keep="last") expected = pdf.drop_duplicates("C", keep="last") assert_df(result, expected) gdf["E"] = gdf["C"].astype("int8") result = gdf.drop_duplicates("E") pdf["E"] = pdf["C"].astype("int8") expected = pdf.drop_duplicates("E") assert_df(result, expected) result = gdf.drop_duplicates("E", keep="last") expected = pdf.drop_duplicates("E", keep="last") assert_df(result, expected) pdf = DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]}) gdf = cudf.DataFrame.from_pandas(pdf) assert_df(gdf.drop_duplicates(), pdf.drop_duplicates()) pdf = DataFrame([[1, 0], [0, 2]]) gdf = cudf.DataFrame.from_pandas(pdf) assert_df(gdf.drop_duplicates(), pdf.drop_duplicates()) pdf = DataFrame([[-2, 0], [0, -4]]) gdf = cudf.DataFrame.from_pandas(pdf) assert_df(gdf.drop_duplicates(), pdf.drop_duplicates()) x = np.iinfo(np.int64).max / 3 * 2 pdf = DataFrame([[-x, x], [0, x + 4]]) gdf = cudf.DataFrame.from_pandas(pdf) assert_df(gdf.drop_duplicates(), pdf.drop_duplicates()) pdf = DataFrame([[-x, x], [x, x + 4]]) gdf = cudf.DataFrame.from_pandas(pdf) assert_df(gdf.drop_duplicates(), pdf.drop_duplicates()) pdf = DataFrame([i] * 9 for i in range(16)) pdf = pdf.append([[1] + [0] * 8], ignore_index=True) gdf = cudf.DataFrame.from_pandas(pdf) assert_df(gdf.drop_duplicates(), pdf.drop_duplicates())
def test_drop_duplicates(): df = DataFrame( { "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], "B": ["one", "one", "two", "two", "two", "two", "one", "two"], "C": [1, 1, 2, 2, 2, 2, 1, 2], "D": range(8), } ) # single column result = df.drop_duplicates("AAA") expected = df[:2] tm.assert_frame_equal(result, expected) result = df.drop_duplicates("AAA", keep="last") expected = df.loc[[6, 7]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates("AAA", keep=False) expected = df.loc[[]] tm.assert_frame_equal(result, expected) assert len(result) == 0 # multi column expected = df.loc[[0, 1, 2, 3]] result = df.drop_duplicates(np.array(["AAA", "B"])) tm.assert_frame_equal(result, expected) result = df.drop_duplicates(["AAA", "B"]) tm.assert_frame_equal(result, expected) result = df.drop_duplicates(("AAA", "B"), keep="last") expected = df.loc[[0, 5, 6, 7]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates(("AAA", "B"), keep=False) expected = df.loc[[0]] tm.assert_frame_equal(result, expected) # consider everything df2 = df.loc[:, ["AAA", "B", "C"]] result = df2.drop_duplicates() # in this case only expected = df2.drop_duplicates(["AAA", "B"]) tm.assert_frame_equal(result, expected) result = df2.drop_duplicates(keep="last") expected = df2.drop_duplicates(["AAA", "B"], keep="last") tm.assert_frame_equal(result, expected) result = df2.drop_duplicates(keep=False) expected = df2.drop_duplicates(["AAA", "B"], keep=False) tm.assert_frame_equal(result, expected) # integers result = df.drop_duplicates("C") expected = df.iloc[[0, 2]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates("C", keep="last") expected = df.iloc[[-2, -1]] tm.assert_frame_equal(result, expected) df["E"] = df["C"].astype("int8") result = df.drop_duplicates("E") expected = df.iloc[[0, 2]] tm.assert_frame_equal(result, expected) result = df.drop_duplicates("E", keep="last") expected = df.iloc[[-2, -1]] tm.assert_frame_equal(result, expected) # GH 11376 df = DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]}) expected = df.loc[df.index != 3] tm.assert_frame_equal(df.drop_duplicates(), expected) df = DataFrame([[1, 0], [0, 2]]) tm.assert_frame_equal(df.drop_duplicates(), df) df = DataFrame([[-2, 0], [0, -4]]) tm.assert_frame_equal(df.drop_duplicates(), df) x = np.iinfo(np.int64).max / 3 * 2 df = DataFrame([[-x, x], [0, x + 4]]) tm.assert_frame_equal(df.drop_duplicates(), df) df = DataFrame([[-x, x], [x, x + 4]]) tm.assert_frame_equal(df.drop_duplicates(), df) # GH 11864 df = DataFrame([i] * 9 for i in range(16)) df = df.append([[1] + [0] * 8], ignore_index=True) for keep in ["first", "last", False]: assert df.duplicated(keep=keep).sum() == 0
def test_append_series_dict(self): df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) series = df.loc[4] msg = "Indexes have overlapping values" with pytest.raises(ValueError, match=msg): df.append(series, verify_integrity=True) series.name = None msg = "Can only append a Series if ignore_index=True" with pytest.raises(TypeError, match=msg): df.append(series, verify_integrity=True) result = df.append(series[::-1], ignore_index=True) expected = df.append(DataFrame({ 0: series[::-1] }, index=df.columns).T, ignore_index=True) tm.assert_frame_equal(result, expected) # dict result = df.append(series.to_dict(), ignore_index=True) tm.assert_frame_equal(result, expected) result = df.append(series[::-1][:3], ignore_index=True) expected = df.append(DataFrame({ 0: series[::-1][:3] }).T, ignore_index=True, sort=True) tm.assert_frame_equal(result, expected.loc[:, result.columns]) msg = "Can only append a dict if ignore_index=True" with pytest.raises(TypeError, match=msg): df.append(series.to_dict()) # can append when name set row = df.loc[4] row.name = 5 result = df.append(row) expected = df.append(df[-1:], ignore_index=True) tm.assert_frame_equal(result, expected)
fid = "stn_gpe_beta" dv = 'p' ivs = ["tau_e", "tau_p", "tau_ampa_r","tau_ampa_d", "tau_gabaa_r", "tau_gabaa_d", "tau_stn", "eta", "delta", "k", "eta_e", "eta_p", "k_pe", "k_ep", "k_pp"] # load data into frame df = DataFrame(data=np.zeros((1, len(ivs))), columns=ivs) df_dv = DataFrame(data=np.zeros((1, 1)), columns=["fitness"]) for d in directories: for fn in os.listdir(d): if fn.startswith(fid) and fn.endswith('.h5'): f = h5py.File(f"{d}/{fn}", 'r') index = int(fn.split('_')[-2]) if fn.endswith("params.h5"): df_tmp = DataFrame(data=np.asarray([[f[dv][key][()] for key in ivs]]), columns=ivs, index=[index]) df = df.append(df_tmp) elif fn.endswith("fitness.h5"): df_tmp = DataFrame(data=np.asarray([1/f["f"][()]]), columns=["fitness"], index=[index]) df_dv = df_dv.append(df_tmp) df = df.iloc[1:, :] df_dv = df_dv.iloc[1:, :] #df['fitness'] = df_dv.loc[:, "fitness"] # create feature matrix and target vector y = np.squeeze(df_dv.values) X = np.asarray([df.pop(iv) for iv in ivs]).T # perform dimensionality reduction on data # n_comps = 5 # dim_red = Isomap(n_components=n_comps, n_neighbors=10, p=2) # X_ld = dim_red.fit_transform(X, y)
len(db)
db[0].keys()
db[0]['nutrients'][0]
nutrients = DataFrame(db[0]['nutrients'])
nutrients[:7]
info_keys = ['description', 'group', 'id', 'manufacturer']
info = DataFrame(db, columns=info_keys)
info[:5]
info
info.group.value_counts()
nutrients = []
for rec in db:
    fnuts = DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)
nutrients = pd.concat(nutrients, ignore_index=True)
nutrients
nutrients.duplicated().sum()
nutrients = nutrients.drop_duplicates()
col_mapping = {'description': 'food', 'group': 'fgroup'}
info = info.rename(columns=col_mapping, copy=False)
info
ndata = pd.merge(nutrients, info, on='id', how='outer')
ndata
ndata.iloc[30000]  # .ix has been removed from pandas; use positional access instead
result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
def checkChangeTriggerResult(owner, repo): """检查PRChangeTrigger是否计算完整""" """在切换代理的时候,数据库连接会断开,导致comments信息查不到,会遗漏review comment的情况""" """这里检查一遍pr的change_trigger里是否有review_comment数据,如果没有,重新获取一次""" """PRTimeLine表头""" PR_CHANGE_TRIGGER_COLUMNS = ["pullrequest_node", "user_login", "comment_node", "comment_type", "change_trigger", "filepath"] """初始化目标文件""" target_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{configPraser.getRepo()}_data_pr_change_trigger.tsv' """1. 获取该仓库所有的pr_node""" # repo_fullname = configPraser.getOwner() + "/" + configPraser.getRepo() # pr_nodes = AsyncProjectAllDataFetcher.getPullRequestNodes(repo_fullname) # pr_nodes = list(pr_nodes) # pr_nodes = [node[0] for node in pr_nodes] """需要获取的prs改为有issue 额 review的timeline的pr""" timeline_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_prtimeline.tsv' timeline_df = pandasHelper.readTSVFile(fileName=timeline_filename, header=0) timeline_df = timeline_df.loc[(timeline_df['typename'] == 'IssueComment') \ | (timeline_df['typename'] == 'PullRequestReview')].copy(deep=True) pr_nodes = list(set(timeline_df['pullrequest_node'])) """2. 读取pr_change_trigger文件""" change_trigger_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_pr_change_trigger.tsv' change_trigger_df = pandasHelper.readTSVFile(fileName=change_trigger_filename, header=0) change_nodes = list(set(change_trigger_df['pullrequest_node'])) # """3. 读取pr_timeline文件""" # timeline_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{configPraser.getRepo()}_data_prtimeline.tsv' # timeline_df = pandasHelper.readTSVFile(fileName=timeline_filename, header=0) """4. 将change_trigger按照pull_request_node分组""" grouped_timeline = change_trigger_df.groupby((['pullrequest_node'])) """5. 分析pullrequest_node的change_trigger信息是否完整,整理出需要重新获取的pr信息""" re_analyze_prs = [x for x in pr_nodes if x not in change_nodes] # for pr, group in grouped_timeline: # if pr not in pr_nodes: # re_analyze_prs.append(pr) # else: # review_comment_trigger = group.loc[(group['comment_type'] == StringKeyUtils.STR_LABEL_REVIEW_COMMENT) & (group['change_trigger'] >= 0)] # if review_comment_trigger is None or review_comment_trigger.empty: # re_analyze_prs.append(pr) # Logger.logi("there are {0} prs need to re analyze".format(re_analyze_prs.__len__())) """读取PullRequestData,获取pr所对应的作者""" pr_data_filename = projectConfig.getPullRequestPath() + os.sep + f'ALL_{repo}_data_pullrequest.tsv' pr_data_df = pandasHelper.readTSVFile(fileName=pr_data_filename, header=pandasHelper.INT_READ_FILE_WITH_HEAD) """收集pr已经对应的作者 用于后面过滤属于作者评论""" pr_author_map = {} for index, row in pr_data_df.iterrows(): pr_author_map[row['node_id']] = row['user_login'] """设置fetch参数""" pos = 0 fetchLimit = 200 size = re_analyze_prs.__len__() while pos < size: Logger.logi("start: {0}, end: {1}, all: {2}".format(pos, pos + fetchLimit, size)) sub_re_analyze_prs = re_analyze_prs[pos:pos + fetchLimit] """6. 重新获取这些pr的timeline""" re_analyze_prs_timeline_df = timeline_df[timeline_df['pullrequest_node'].isin(sub_re_analyze_prs)] grouped_timeline = re_analyze_prs_timeline_df.groupby((['pullrequest_node'])) formated_data = [] for pr, group in grouped_timeline: formated_data.append(group.to_dict(orient='records')) """7. 开始分析""" pr_change_trigger_comments = AsyncProjectAllDataFetcher.analyzePullRequestReview(formated_data, pr_author_map) pr_change_trigger_comments = [x for y in pr_change_trigger_comments for x in y] """8. 
将分析结果去重并追加到change_trigger表中""" if pr_change_trigger_comments is not None and pr_change_trigger_comments.__len__() > 0: target_content = DataFrame() target_content = target_content.append(pr_change_trigger_comments, ignore_index=True) target_content = target_content[PR_CHANGE_TRIGGER_COLUMNS].copy(deep=True) target_content.drop_duplicates(subset=['pullrequest_node', 'comment_node'], inplace=True, keep='first') if not target_content.empty: pandasHelper.writeTSVFile(target_filename, target_content, pandasHelper.STR_WRITE_STYLE_APPEND_NEW, header=pandasHelper.INT_WRITE_WITHOUT_HEADER) Logger.logi("successfully analyzed {0} prs".format(re_analyze_prs.__len__())) pos += fetchLimit
    f.close()
    content = NEWLINE.join(lines)
    yield file_path, content

SAD = 'Sad'
HAPPY = 'Happy'
SOURCES = [(r'/Users/Rini/Documents/training_happy_sad/happy', HAPPY),
           (r'/Users/Rini/Documents/training_happy_sad/sad', SAD),
           (r'/Users/Rini/Documents/validation_happy_sad/happy', HAPPY),
           (r'/Users/Rini/Documents/validation_happy_sad/sad', SAD)]

data = DataFrame({'text': [], 'class': []})
for path, classification in SOURCES:
    data = data.append(build_data_frame(path, classification))
data = data.reindex(numpy.random.permutation(data.index))

count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(data['text'].values)
classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets)

pipeline = Pipeline([('count_vectorizer', CountVectorizer(ngram_range=(1, 2))),
                     ('tfidf_transformer', TfidfTransformer()),
                     ('classifier', MultinomialNB())])

# sklearn.cross_validation was removed; KFold now lives in sklearn.model_selection
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score
#Only keep US locations for merge and rename variables ihme_locations = ihme_locations[( ihme_locations['region_name'] == str('High-income North America'))] ihme_locations = ihme_locations.loc[:, ['location_name', 'ihme_loc_id' ]] #Only keep variables needed for merge ihme_locations = ihme_locations.rename(columns={'location_name': 'area_title'}) #Loop through years to produce individually cleaned datasets from BLS. Will append them all together at end for year in range(1990, 2015): QCEW = QCEW.append( pd.read_csv(open( r'C:/Users/strUser/Work/Data/QCEW/{date}.annual.singlefile.csv'. format(date=year)), usecols=[ "area_fips", "industry_code", "annual_avg_emplvl", "own_code", "year" ], dtype={'area_fips': np.str})) #Clean all datasets before merging ##Hold onto only relevant variables for GBD QCEW = QCEW[(QCEW.own_code == 0) | ( (QCEW.own_code == 5) & (QCEW.industry_code != '10') )] #Drops distinctions the QCEW study made concerning industry types QCEW = QCEW.drop('own_code', axis=1) #Add on human readable names QCEW = pd.merge(QCEW, industry_titles, on='industry_code')
    yield path, message

def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'message': message, 'class': classification})
        index.append(filename)
    return DataFrame(rows, index=index)

data = DataFrame({'message': [], 'class': []})
data = data.append(dataFrameFromDirectory('/emails/spam', 'spam'))
data = data.append(dataFrameFromDirectory('/emails/ham', 'ham'))

# Now we will use a CountVectorizer to split up each message into its list of words,
# and throw that into a MultinomialNB classifier. Call fit() and we've got a trained
# spam filter ready to go! It's just that easy.
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)
classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets)

# let's test it
examples = ['Free Chocolates now!!!', "Hi Bob, how about a game of golf tomorrow?"]
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
print(predictions)
            index=['a', 'b', 'c', 'd'])
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])

obj = Series([1, -2, 3, -4], index=[0, 2, 3, 5])
obj2 = obj.reindex(range(6), method='ffill')

df = DataFrame(np.arange(9).reshape(3, 3), index=['a', 'c', 'd'],
               columns=['name', 'id', 'se'])
df2 = df.reindex(['a', 'b', 'c', 'd'])
df3 = df.reindex(columns=['name', 'year', 'id'], fill_value=0)

data2 = {'name': ['张三', '李四', '王五', '小明'], 'grade': [68, 78, 63, 92]}
df = DataFrame(data2)
df2 = df.sort_values(by='grade')

new_data = {'city': '武汉', 'name': '小李', 'sex': 'male', 'year': 2002}
df = df.append(new_data, ignore_index=True)  # ignore the index values
'''
  name  grade city   sex    year
0   张三   68.0  NaN   NaN     NaN
1   李四   78.0  NaN   NaN     NaN
2   王五   63.0  NaN   NaN     NaN
3   小明   92.0  NaN   NaN     NaN
4   小李    NaN   武汉  male  2002.0
'''

# add a column
df['class'] = 2018
'''
  name  grade city   sex    year  class
0   张三   68.0  NaN   NaN     NaN   2018
1   李四   78.0  NaN   NaN     NaN   2018
def _forcedPhotometry( self, objects: pandas.DataFrame, latest_objects: pandas.DataFrame, dt: DateTime, visit_id: int) -> Tuple[pandas.DataFrame, pandas.DataFrame]: """Do forced photometry on latest_objects which are not in objects. Extends objects catalog with new DiaObjects. Parameters ---------- objects : `pandas.DataFrame` Catalog containing DiaObject records latest_objects : `pandas.DataFrame` Catalog containing DiaObject records dt : `DateTime` Visit time. visit_id : `int` Visit ID. """ midPointTai = dt.get(system=DateTime.MJD) if objects.empty: return pandas.DataFrame( columns=["diaObjectId", "ccdVisitId", "midPointTai", "flags" ]), objects # Ids of the detected objects ids = set(objects['diaObjectId']) # do forced photometry for all detected DiaObjects df1 = pandas.DataFrame({ "diaObjectId": objects["diaObjectId"], "ccdVisitId": visit_id, "midPointTai": midPointTai, "flags": 0, }) # do forced photometry for non-detected DiaObjects (newer than cutoff) o1 = cast(pandas.DataFrame, latest_objects[~latest_objects["diaObjectId"].isin(ids)]) # only do it for 30 days after last detection cutoff = dt.toPython() - timedelta(days=self.config.forced_cutoff_days) o1 = cast(pandas.DataFrame, o1[o1["lastNonForcedSource"] > cutoff]) if o1.empty: catalog = df1 else: df2 = pandas.DataFrame({ "diaObjectId": o1["diaObjectId"], "ccdVisitId": visit_id, "midPointTai": midPointTai, "flags": 0, }) # extend forced sources catalog = pandas.concat([df1, df2]) # also extend objects o2 = pandas.DataFrame({ "diaObjectId": o1["diaObjectId"], "ra": o1["ra"], "decl": o1["decl"], }) objects = objects.append(o2) return catalog, objects
def test_other_dtypes(self, data, dtype): df = DataFrame(data, dtype=dtype) result = df.append(df.iloc[0]).iloc[-1] expected = Series(data, name=0, dtype=dtype) tm.assert_series_equal(result, expected)
def compute_entrainment(self, ctfind=False): ''' This function computes the histogram and average w for use in our computation of the w'phi' calculation Here we compute the histogram that will allow us to average w over the whole bin for use in the RMS calculation bin frequency is set by h_freq usually 2Hz which corresponds to around _______ vertical change. ''' # These three structures will be our outputs # This one is for full diagnostics (Note Eflux includes only qt here) ent_data = DataFrame(columns=['leg', 'eflux', 'dphi', 'wprime']) # This one holds the ent flux for each leg ent_flux = {p: [] for p in self.phi_raw.keys()} # This one is for the average flux, used in budgeting avg_flux = {p: None for p in self.phi_raw.keys()} # hist. frequency dt = self.const['hist_width'] # hist. dimensions # averaged v_velocity data structure flight_legs = {leg: None for leg in self.ein_b.keys()} # this structure will hold the (w', zbar) mapping for each leg of the flight. it's primary use is for visuals wp_mapping = {} # Here we'll determine the cloud-top buffer zone and qt flux jump for each of the flight legs # Load the ctop_fname = "/home/mrmisanthropy/Projects/fase/fase/flights/{}/CloudtopHeights_{}.json".format( self.date, self.date) try: cloud_top = read_json(ctop_fname, orient='index') except FileNotFoundError: print("Ct file not found, run ct determination?") raise FileNotFoundError # Here's a quick way check of our ct determination # for leg in flight_legs.keys(): # leg_data = self.en_cbfile[self.ein_b[leg]:self.ein_e[leg]] # ct_entry = cloud_top.loc[leg, :] # self.ct_check(leg_data, ct_entry, leg, save=True, show=False) for leg in flight_legs.keys(): # Take a slice of the data for the leg leg_data = self.en_cbfile[self.ein_b[leg]:self.ein_e[leg]] # Create histogram of arrival times for averaging ts_data = leg_data['time'] dim = {'dn': dt} time_series = create_histogram(dim, ts_data) leg_map = [] for i in range(time_series['dim']['N']): ind_list = time_series['index'][i] events = leg_data[leg_data.index.isin(ind_list)] wbar = np.average(events['w']) zbar = np.average(events['alt']) w_prime_sq = np.average((events['w'] - wbar)**2) wprime = np.sqrt(w_prime_sq) leg_map.append((wprime, zbar)) leg_map = DataFrame(leg_map, columns=["w_prime", "alt"]) wp_mapping[leg] = leg_map # Now using our estimation of cloudtop, compute wprime at the altitude of the center of the buffer layer leg_ctop = cloud_top.loc[leg, :] z_top = leg_ctop['cld_bin'][1] wp = leg_map.iloc[(leg_map['alt'] - z_top).abs().argsort()[0]] # This bit computes only for qt, its used for diagnostics leg_eflux = {} leg_eflux['leg'] = leg leg_eflux['eflux'] = wp['w_prime'] * leg_ctop['Dphi']['qt'] leg_eflux['dphi'] = leg_ctop['Dphi']['qt'] leg_eflux['wprime'] = wp['w_prime'] ent_data = ent_data.append(Series(leg_eflux), ignore_index=True) # Here we append the ent flux value for each phi variable for ph in self.phi_raw.keys(): ent_flux[ph].append(wp['w_prime'] * leg_ctop['Dphi'][ph]) # Here we average each ent flux for ph in self.phi_raw.keys(): avg_flux[ph] = np.average(ent_flux[ph]) return avg_flux, ent_flux, ent_data
# # X_important_train = sfm.transform(X_train) # X_important_test = sfm.transform(x_test) # # clf_important = RandomForestClassifier(n_estimators=5000, max_depth=5, min_samples_leaf=10) # # # Train the new classifier on the new dataset containing the most important features # clf_important.fit(X_important_train, y_train) predictions = model.predict(x_test) report = classification_report(y_test, predictions, output_dict=True) acc = report['accuracy'] f1sc = report['weighted avg']['f1-score'] # calculate and write the mean and std_dev of the average & f1-score df_all = df_all.append( { 'channel': channel, 'segment': 'spontaneous&stimulus', 'accuracy': acc, 'f1-score': f1sc }, ignore_index=True) # print('debug') df_all.to_csv(csv_file, mode='a', header=False) df_all = df_all.iloc[0:0] write_file.close()
return df if __name__ == "__main__": CMC_URL = "https://coinmarketcap.com/" CSV_PATH = "/CoinMarketCapData.csv" response = requests.get(CMC_URL + "historical/") soup = BeautifulSoup(response.content, "lxml") df = DataFrame() for partial_historical_link in get_historical_links(soup): historical_link = CMC_URL + partial_historical_link try: response = requests.get(historical_link) soup = BeautifulSoup(response.content, "lxml") new_df = parse_tables(soup) date = parse_date(soup) new_df['date'] = date df = df.append(new_df) print("Downloaded: " + date) except: print("ERROR: Unable to parse " + historical_link) df.to_csv(CSV_PATH, index=False)
def getCorrelation(input_image, angles, dist): """ Loops through a list of given angles, finding the correlation value at each angle. :param input_image: a single grayscale image to analyze :param angles: a list of angles in radians (usually from 0 to pi) to measure correlation at :param dist: a single distance parameter to measure correlation at :return: An array called "all_corr" containn the correlation values at every angle. """ final_ds = DataFrame(columns=[ 'img_id', 'mean', 'std', 'kurtosis', 'skew', 'entropy', 'contrast', 'dissimilarity', 'energy', 'ASM', 'homogeneity', 'correlation' ]) img_id = 0 image = input_image all_corr = [] for angle in angles: glcm = greycomatrix(image=image, distances=[dist], angles=[angle], levels=256, symmetric=True, normed=True) t = { 'img_id': [img_id], 'mean': [numpy.average(image)], 'std': [numpy.std(image)], 'kurtosis': [kurtosis(image.flatten())], 'skew': [skew(image.flatten())], 'entropy': [shannon_entropy(glcm, base=numpy.e)], 'contrast': [greycoprops(glcm, 'contrast')[0, 0]], 'dissimilarity': [greycoprops(glcm, 'dissimilarity')[0, 0]], 'energy': [greycoprops(glcm, 'energy')[0, 0]], 'ASM': [greycoprops(glcm, 'ASM')[0, 0]], 'homogeneity': [greycoprops(glcm, 'homogeneity')[0, 0]], 'correlation': [greycoprops(glcm, 'correlation')[0, 0]] } t = DataFrame(data=t, columns=[ 'img_id', 'mean', 'std', 'kurtosis', 'skew', 'entropy', 'contrast', 'dissimilarity', 'energy', 'ASM', 'homogeneity', 'correlation' ]) img_id += 1 final_ds = final_ds.append(t) corr = greycoprops(glcm, 'correlation')[0, 0] all_corr.append(corr) #Printing a bunch of data from the glcm print(final_ds) # printing the angle & corresponding correlation: print("Angle = ", angle, "Correlation = ", greycoprops(glcm, 'correlation')[0, 0]) return all_corr
student.loc['제임스', '영어'] = 98   # update (.ix has been removed from pandas)
student.loc['제임스', '영어']
student
'''
student = student.set_value('제임스', '영어', 90)   # update
student
'''

# append: row-bind one DataFrame onto another DataFrame, method 1
student_new = DataFrame([[60, 80, 70], [50, 75, 85], [90, 80, 85]],
                        index=['윤건', '김건모', '이문세'],
                        columns=['영어', '수학', '국어'])
student_new
student = student.append(student_new)   # append student_new to student (R: rbind())
student

# pd.concat: row-bind one DataFrame onto another DataFrame, method 2
student1 = DataFrame([[60, 80, 70], [50, 75, 85], [90, 80, 85]],
                     index=['싸이', '나얼', '윤상'],
                     columns=['영어', '수학', '국어'])
student1
student = pd.concat([student, student1])   # append student1 to student (R: rbind())
student

## add/delete rows and columns
"time": Series([ 3.14, 4.12, 5.44, 7.69, 10.57, 13.81, 2.49, 3.48, 4.73, 7.09, 9.95, 13.17, 2.48, 3.42, 4.8, 6.95, 9.82, 13.06 ], dtype="float32"), "operation": Series([op for op in ["INSERT", "DELETE", "LOOKUP"] for _ in range(6)], dtype="category") }) all_bench_df = bst_unsafe_running_times_df.append( avl_unsafe_running_times_df, ignore_index=True).append( bst_fullextern_compilation_times_df, ignore_index=True).append( avl_fullextern_compilation_times_df, ignore_index=True).append( bst_extern_compilation_times_df, ignore_index=True).append( avl_extern_compilation_times_df, ignore_index=True).append( bst_intern_compilation_times_df, ignore_index=True).append( avl_intern_compilation_times_df, ignore_index=True) all_bench_df = all_bench_df.astype( { "bench_name": "category", "bench_type": "category", "N": "int8", "size": "int32", "time": "float32", "operation": "category" }, copy=True)
forest = forest.fit(train_independent_vars, train_dependent_vars) # Take the same decision trees and run it on the test data output = forest.predict(train_imputed[[ 'Ticket_length', 'Title', 'NameLength', 'Pclass', 'Female', 'Age', 'withfamily', 'Ticket_group', 'Fare', 'Embarked', 'Cabin_first_ltr', 'spaces_in_name' ]]) ### combine the passengerid with the prediction output_df = pd.DataFrame(test_imputed.PassengerId).join(pd.DataFrame(output)) output_df.columns = ['PassengerId', 'Survived'] #### create the final output dataframe final_output = DataFrame(columns=['PassengerId', 'Survived']) final_output = final_output.append(output_df[['PassengerId', 'Survived']]) #### convert to csv final_output.to_csv('output.csv', index=False, header=['PassengerId', 'Survived']) # In[ ]: # importances = forest.feature_importances_ indices = np.argsort(importances)[::-1] # Print the feature ranking print("Feature ranking:") for f in indices:
def runtdm(infile_): print "running the code for ", infile_ tag_str = infile_.split("_13TeV")[0].split("Merged_")[1] cross_section_weight = xsweight(tag_str) print infile_, tag_str, cross_section_weight ### dataframe for output df_out = DataFrame(columns=[ 'run', 'lumi', 'event', 'MET', 'MT', 'Njets_PassID', 'Nbjets_PassID', 'ElePt', 'EleEta', 'ElePhi', 'Jet1Pt', 'Jet1Eta', 'Jet1Phi', 'Jet2Pt', 'Jet2Eta', 'Jet2Phi', 'Jet3Pt', 'Jet3Eta', 'Jet3Phi', 'Jet1Idx', 'Jet2Idx', 'Jet3Idx', 'weight' ]) df_out_wmunu_cr = DataFrame(columns=[ 'run', 'lumi', 'event', 'MET', 'MT', 'Njets_PassID', 'Nbjets_PassID', 'Jet1Pt', 'Jet1Eta', 'Jet1Phi', 'Jet2Pt', 'Jet2Eta', 'Jet2Phi', 'Jet3Pt', 'Jet3Eta', 'Jet3Phi', 'Jet1Idx', 'Jet2Idx', 'Jet3Idx', 'MuPt', 'MuEta', 'MuPhi', 'weight' ]) recoil_den = TH1F("recoil_den", "recoil_den", 100, 0.0, 1000.) recoil_num = TH1F("recoil_num", "recoil_num", 100, 0.0, 1000.) jetvariables = [ 'st_THINnJet', 'st_THINjetPx', 'st_THINjetPy', 'st_THINjetPz', 'st_THINjetEnergy', 'st_THINjetCISVV2', 'st_THINjetHadronFlavor', 'st_THINjetNHadEF', 'st_THINjetCHadEF', 'st_THINjetCEmEF', 'st_THINjetPhoEF', 'st_THINjetEleEF', 'st_THINjetMuoEF', 'st_THINjetCorrUnc', 'st_runId', 'st_lumiSection', 'st_eventId', 'st_pfMetCorrPt', 'st_pfMetCorrPhi', 'st_pfMetUncJetResUp', 'st_pfMetUncJetResDown', 'st_pfMetUncJetEnUp', 'st_pfMetUncJetEnDown', 'st_isData', 'st_HLT_IsoMu24_v', 'st_HLT_IsoTkMu24_v', 'st_HLT_IsoMu27_v', 'st_HLT_IsoTkMu27_v', 'st_HLT_Ele27_WPTight_Gsf_v', 'st_HLT_Ele27_WPLoose_Gsf_v', 'st_HLT_Ele105_CaloIdVT_GsfTrkIdT_v', 'st_HLT_Ele115_CaloIdVT_GsfTrkIdT_v', 'st_HLT_Ele32_WPTight_Gsf_v', 'st_HLT_Ele32_eta2p1_WPTight_Gsf_v', 'st_HLT_Ele27_eta2p1_WPTight_Gsf_v', 'st_THINnJet', 'st_THINjetPx', 'st_THINjetPy', 'st_THINjetPz', 'st_THINjetEnergy', 'st_THINjetCISVV2', 'st_THINjetHadronFlavor', 'st_THINjetNHadEF', 'st_THINjetCHadEF', 'st_THINjetCEmEF', 'st_THINjetPhoEF', 'st_THINjetEleEF', 'st_THINjetMuoEF', 'st_THINjetCorrUnc', 'st_nEle', 'st_elePx', 'st_elePy', 'st_elePz', 'st_eleEnergy', 'st_eleIsPassLoose', 'st_eleIsPassTight', 'st_nMu', 'st_muPx', 'st_muPy', 'st_muPz', 'st_muEnergy', 'st_isTightMuon', 'st_muIso', 'st_HPSTau_n' ] filename = infile_ ''' global variables, mainly to be stored in the new root tree for quick analysis or histo saving ''' icount = 0 df_new = DataFrame() df_all = DataFrame() ieve = 0 jetptseries = [] jetetaseries = [] jetphiseries = [] jet_pt30 = [] jet_pt50 = [] jet_eta4p5 = [] jet_IDtightVeto = [] jet_eta2p4 = [] jet_NJpt30 = [] jet_NJpt30_Eta4p5 = [] jet_csvmedium = [] jet_N_bmedium_eta2p4_pt30 = [] hlt_ele = [] met_250_ = [] for df in read_root(filename, columns=jetvariables, chunksize=125000): icount = icount + 1 ''' all the operations which should be applied to each event must be done under this loop, otherwise effect will be reflected on the last chunk only. Each chunck can be considered as a small rootfile. An example of how to add new variable and copy the new dataframe into a bigger dataframe is shown below. This is the by far fastest method I manage to find, uproot awkward arrays are even faster but difficult to use on lxplus. and may be on condor. 
''' for nak4jet_, ak4px_, ak4py_, ak4pz_, ak4e_, ak4csv, ak4flavor, ak4NHEF, ak4CHEF, ak4CEmEF, ak4PhEF, ek4EleEF, ak4MuEF, ak4JEC, hlt_ele27, hlt_ele105, hlt_ele115, hlt_ele32, hlt_ele32_eta2p1, hlt_ele27_eta2p1, nele_, elepx_, elepy_, elepz_, elee_, elelooseid_, eletightid_, nmu_, mupx_, mupy_, mupz_, mue_, mutightid_, muIso_, met_, metphi_, run, lumi, event, nTau in zip( df.st_THINnJet, df.st_THINjetPx, df.st_THINjetPy, df.st_THINjetPz, df.st_THINjetEnergy, df.st_THINjetCISVV2, df.st_THINjetHadronFlavor, df.st_THINjetNHadEF, df.st_THINjetCHadEF, df.st_THINjetCEmEF, df.st_THINjetPhoEF, df.st_THINjetEleEF, df.st_THINjetMuoEF, df.st_THINjetCorrUnc, df.st_HLT_Ele27_WPLoose_Gsf_v, df.st_HLT_Ele105_CaloIdVT_GsfTrkIdT_v, df.st_HLT_Ele115_CaloIdVT_GsfTrkIdT_v, df.st_HLT_Ele32_WPTight_Gsf_v, df.st_HLT_Ele32_eta2p1_WPTight_Gsf_v, df.st_HLT_Ele27_eta2p1_WPTight_Gsf_v, df.st_nEle, df.st_elePx, df.st_elePy, df.st_elePz, df.st_eleEnergy, df.st_eleIsPassLoose, df.st_eleIsPassTight, df.st_nMu, df.st_muPx, df.st_muPy, df.st_muPz, df.st_muEnergy, df.st_isTightMuon, df.st_muIso, df.st_pfMetCorrPt, df.st_pfMetCorrPhi, df.st_runId, df.st_lumiSection, df.st_eventId, df.st_HPSTau_n): print "ievent = ", ieve ieve = ieve + 1 if debug_: print nak4jet_, ak4px_, ak4py_, ak4pz_, ak4e_ ''' ******* ***** ******* * * * * **** * * * * *** ***** * ''' ''' This small block compute the pt of the jet and add them back into the original dataframe as a next step for further usage. ''' ak4pt = [getPt(ak4px_[ij], ak4py_[ij]) for ij in range(nak4jet_)] jetptseries.append(ak4pt) ''' Jet Loose ID is already applied in the preselection ''' ''' eta and phi of the ak4 jets ''' ak4eta = [ getEta(ak4px_[ij], ak4py_[ij], ak4pz_[ij]) for ij in range(nak4jet_) ] ak4phi = [getPhi(ak4px_[ij], ak4py_[ij]) for ij in range(nak4jet_)] jetetaseries.append(ak4eta) jetphiseries.append(ak4phi) ''' pT>30 GeV, |eta|<4.5 is already applied in the tuples ''' ''' jets with pt > 30 GeV ''' ak4_pt30 = [(ak4pt[ij] > 30.) for ij in range(nak4jet_)] jet_pt30.append(ak4_pt30) ''' jets with pt > 50 GeV ''' ak4_pt50 = [(ak4pt[ij] > 50.) 
for ij in range(nak4jet_)] jet_pt50.append(ak4_pt50) ''' jet |eta| < 4.5 ''' ak4_eta4p5 = [(abs(ak4eta[ij]) < 4.5) for ij in range(nak4jet_)] jet_eta4p5.append(ak4_eta4p5) ''' jet |eta| < 2.4 ''' ak4_eta2p4 = [(abs(ak4eta[ij]) < 2.4) for ij in range(nak4jet_)] jet_eta2p4.append(ak4_eta2p4) ''' jet tightLeptonVeto ID to reject fake jets coming from the leptons, Veto ID should be applied only for jets within the detector abs(eta) < 2.4 Following the syntax of if else in list comprehension [f(x) if condition else g(x) for x in sequence] ''' ak4_IDtightVeto = [ ((ak4NHEF[ij] < 0.90) and (ak4PhEF[ij] < 0.90) and (ak4MuEF[ij] < 0.8) and (ak4CEmEF[ij] < 0.90) and abs(ak4eta[ij]) < 2.4) or ((ak4NHEF[ij] < 0.90) and (ak4PhEF[ij] < 0.90) and (ak4MuEF[ij] < 0.8) and abs(ak4eta[ij]) < 2.7 and abs(ak4eta[ij]) > 2.4) if (abs(ak4eta[ij]) < 2.7) else True for ij in range(nak4jet_) ] jet_IDtightVeto.append(ak4_IDtightVeto) if debug_: print "ak4_IDtightVeto", ak4_IDtightVeto ''' njets passing jet pt > 30 and eta < 4.5 and Loose Jet ID ''' jet_NJpt30.append(ak4_pt30.count(True)) jet_NJpt30_Eta4p5.append( len([ ij for ij in range(nak4jet_) if (ak4_eta4p5[ij] and ak4_pt50[ij]) ])) ak4_csvmedium = [(ak4csv[ij] > 0.8484) for ij in range(nak4jet_)] jet_csvmedium.append(ak4_csvmedium) jet_N_bmedium_eta2p4_pt30.append( len([ ij for ij in range(nak4jet_) if ((ak4_eta2p4[ij]) and ( ak4_pt30[ij]) and (ak4_csvmedium[ij])) ])) ''' **** * **** * * * *** * *** * * * **** **** **** the selection for the electron is done here, later the new branches are added to the dataframe. ''' ''' electron triggers ''' hlt_ele.append( logical_OR([ hlt_ele27, hlt_ele105, hlt_ele115, hlt_ele32, hlt_ele32_eta2p1, hlt_ele27_eta2p1 ])) if debug_: print "event ------", event ''' get pt, eta, phi of electrons ''' elept = [getPt(elepx_[ie], elepy_[ie]) for ie in range(nele_)] eleeta = [ getEta(elepx_[ie], elepy_[ie], elepz_[ie]) for ie in range(nele_) ] elephi = [getPhi(elepx_[ie], elepy_[ie]) for ie in range(nele_)] ''' electron pt and eta cut, tuples already have electron pT > 10 GeV and |eta|<2.5 Veto electron ID is also applied on the electron at preselection level ''' ele_pt10 = [(elept[ie] > 10) for ie in range(nele_)] ele_pt30 = [(elept[ie] > 30) for ie in range(nele_)] ele_IDLoose = [(elelooseid_[ie]) for ie in range(nele_)] ele_IDTight = [(eletightid_[ie]) for ie in range(nele_)] ele_eta2p1 = [(abs(eleeta[ie]) < 2.1) for ie in range(nele_)] ele_eta2p5 = [(abs(eleeta[ie]) < 2.5) for ie in range(nele_)] ele_pt10_eta2p5_vetoID = [] if len(ele_pt10) > 0: ele_pt10_eta2p5_vetoID = logical_AND_List2( ele_pt10, ele_eta2p5) if debug_: print "ele info" if debug_: print "pt, id eta =", ele_pt30, ele_IDTight, ele_eta2p1 if debug_: for ie in range(nele_): print elept[ie], eleeta[ie], eletightid_[ie], elepx_[ ie], elepy_[ie], elepz_[ie] ''' ** * * * * * * * * * * * * * * * * * * * * ***** the selection for the muon is done here, later the new columns are added to the dataframe for each of them. 
''' ''' muon triggers ''' ''' muon pt threshold and eta threshold, tuples already have muon pt > 10 and |eta| < 2.4 ''' mupt = [getPt(mupx_[imu], mupy_[imu]) for imu in range(nmu_)] mueta = [ getEta(mupx_[imu], mupy_[imu], mupz_[imu]) for imu in range(nmu_) ] muphi = [getPhi(mupx_[imu], mupy_[imu]) for imu in range(nmu_)] ''' For vetoing in the electron region only Looose Mu ID and ISo with pt > 10 GeV is needed and is already applied in the skimmer ''' mu_pt10 = [(mupt[imu] > 10.0) for imu in range(nmu_)] mu_pt30 = [(mupt[imu] > 30.0) for imu in range(nmu_)] mu_eta2p4 = [(abs(mueta[imu]) < 2.4) for imu in range(nmu_)] mu_IDTight = [mutightid_[imu] for imu in range(nmu_)] mu_IsoTight = [(muIso_[imu] < 0.15) for imu in range(nmu_)] mu_pt10_eta2p4_looseID = [] if len(mu_pt10): mu_pt10_eta2p4_looseID = logical_AND_List2(mu_pt10, mu_eta2p4) ''' MET SELECTION ''' met_250_.append(met_ > 250.0) ''' MT Calculation for electrons ''' mt_ele = [ getMT(elept[ie], met_, elephi[ie], metphi_) for ie in range(nele_) ] #mt_ele_.append(mt_ele) ''' MT Calculation for muons ''' mt_mu = [ getMT(mupt[imu], met_, muphi[imu], metphi_) for imu in range(nmu_) ] #mt_mu_.append(mt_mu) ''' Event selection to count the number of events. In simple terms, index() method finds the given element in a list and returns its position. However, if the same element is present more than once, index() method returns its smallest/first position. And this is what I generally need for this code. But this fails when there is no element or no true in the list This is complicated in first look but more usable. And it will be faster once I know how to flatten the dataset. first elecment of output return by where is the location whre true is present, still not known how where actually work but this is the fastest method e.g. #if (len(ele_eta2p1)>0): ele_passlist = numpy.where(ele_eta2p1)[0] #print ele_passlist ''' ''' take AND of all the electron cuts (just take the lists) ''' ele_eta2p1_idT_pt30 = [] if (len(ele_eta2p1) > 0): ele_eta2p1_idT_pt30 = logical_AND_List3( ele_eta2p1, ele_IDTight, ele_pt30) ''' > 0 means >= 1. The selection in the function is implemented like >= not >. Therefore pay attention when using this function. The function also take care of the fact that the operation will be performed only when size of the list is >= N, where N is by default 0 and has to be provided ''' pass_ele_index = WhereIsTrue(ele_eta2p1_idT_pt30, 1) pass_veto_id_ele_index = WhereIsTrue(ele_pt10_eta2p5_vetoID, 1) mu_eta2p4_idT_pt30 = [] if (len(mu_pt30) > 0): mu_eta2p4_idT_pt30 = logical_AND_List3(mu_pt30, mu_IDTight, mu_IsoTight) pass_mu_index = WhereIsTrue(mu_eta2p4_idT_pt30, 1) ak4_pt30_eta4p5_IDL = [] if len(ak4_pt30) > 0: ak4_pt30_eta4p5_IDL = logical_AND_List2(ak4_pt30, ak4_eta4p5) ''' we need at least 3 jets passing id, so we must ensure presene of 3 jets atleast ''' pass_jet_index = WhereIsTrue(ak4_pt30_eta4p5_IDL, 3) ''' All the object selection is done before this, region specific cuts are here. 
''' jetCleanAgainstEle = [] for ijet in range(len(ak4_pt30_eta4p5_IDL)): pass_ijet_iele_ = [] for iele in range(len(ele_pt10_eta2p5_vetoID)): pass_ijet_iele_.append( ak4_pt30_eta4p5_IDL[ijet] & ele_pt10_eta2p5_vetoID[iele] & (Delta_R(ak4eta[ijet], eleeta[iele], ak4phi[ijet], elephi[iele]) > 0.4)) print "pass_ijet_iele_ = ", pass_ijet_iele_ ## if the number of true is equal to length of vector then it is ok to keep this jet, otherwise this is not cleaned jetCleanAgainstEle.append( len(WhereIsTrue(pass_ijet_iele_)) == len(pass_ijet_iele_)) print "jetCleanAgainstEle = ", jetCleanAgainstEle jetCleanAgainstMu = [] for ijet in range(len(ak4_pt30_eta4p5_IDL)): pass_ijet_imu_ = [] for imu in range(len(mu_pt10_eta2p4_looseID)): pass_ijet_imu_.append( ak4_pt30_eta4p5_IDL[ijet] & mu_pt10_eta2p4_looseID[imu] & (Delta_R(ak4eta[ijet], mueta[imu], ak4phi[ijet], muphi[imu]) > 0.4)) ## if the number of true is equal to length of vector then it is ok to keep this jet, otherwise this is not cleaned print "pass_ijet_imu_ = ", pass_ijet_imu_ jetCleanAgainstMu.append( len(WhereIsTrue(pass_ijet_imu_)) == len(pass_ijet_imu_)) print "jetCleanAgainstMu = ", jetCleanAgainstMu jetCleaned = logical_AND_List2(jetCleanAgainstEle, jetCleanAgainstMu) print "jetCleaned = ", jetCleaned print "nele, nmu = ", ele_pt10_eta2p5_vetoID, mu_pt10_eta2p4_looseID pass_jet_index_cleaned = [] pass_jet_index_cleaned = WhereIsTrue(jetCleaned, 3) print "pass_jet_index_cleaned = ", pass_jet_index_cleaned ak4_bjetM_eta2p4 = [] if len(ak4_csvmedium) > 0: ak4_bjetM_eta2p4 = logical_AND_List3(ak4_csvmedium, ak4_eta2p4, jetCleaned) pass_bjetM_eta2p4_index = WhereIsTrue(ak4_bjetM_eta2p4, 1) j1idx = -1 j2idx = -1 j3idx = -1 wenu_cr = False if len(pass_ele_index) > 0: eleidx = pass_ele_index[0] wenu_cr = logical_AND([ len(ele_pt10_eta2p5_vetoID) == 1, len(pass_ele_index) == 1, nmu_ == 0, met_ > 250.0, len(pass_jet_index_cleaned) >= 3, len(pass_bjetM_eta2p4_index) == 0, mt_ele[pass_ele_index[0]] < 160., (nTau == 0) ]) if len(pass_jet_index_cleaned) >= 3: j1idx = pass_jet_index_cleaned[0] j2idx = pass_jet_index_cleaned[1] j3idx = pass_jet_index_cleaned[2] if wenu_cr: df_out = df_out.append( { 'run': run, 'lumi': lumi, 'event': event, 'MET': met_, 'MT': mt_ele[pass_ele_index[0]], 'Njets_PassID': len(pass_jet_index_cleaned), 'Nbjets_PassID': len(pass_bjetM_eta2p4_index), 'ElePt': elept[eleidx], 'EleEta': eleeta[eleidx], 'ElePhi': elephi[eleidx], 'Jet1Pt': ak4pt[j1idx], 'Jet1Eta': ak4eta[j1idx], 'Jet1Phi': ak4phi[j1idx], 'Jet2Pt': ak4pt[j2idx], 'Jet2Eta': ak4eta[j2idx], 'Jet2Phi': ak4phi[j2idx], 'Jet3Pt': ak4pt[j3idx], 'Jet3Eta': ak4eta[j3idx], 'Jet3Phi': ak4phi[j3idx], 'Jet1Idx': j1idx, 'Jet2Idx': j2idx, 'Jet3Idx': j3idx, 'weight': cross_section_weight }, ignore_index=True) if debug_: print "object info", wenu_cr, run, lumi, event, eleidx, elept[ eleidx], eleeta[eleidx], elephi[eleidx], j1idx, ak4pt[ j1idx], ak4eta[j1idx], ak4phi[j1idx], j2idx, ak4pt[ j2idx], ak4eta[j2idx], ak4phi[ j2idx], j3idx, ak4pt[j3idx], ak4eta[ j3idx], ak4phi[j3idx], met_, mt_ele[ pass_ele_index[0]], [ len(pass_ele_index) == 1, nmu_ == 0, met_ > 250.0, len(pass_jet_index) >= 3, len(pass_bjetM_eta2p4_index) == 0, mt_ele[pass_ele_index[0]] < 160. 
] ''' W mu nu CR ''' j1idx = -1 j2idx = -1 j3idx = -1 wmunu_cr = False if len(pass_mu_index) > 0: muidx = pass_mu_index[0] wmunu_cr = logical_AND([ len(mu_pt10_eta2p4_looseID) == 1, len(pass_mu_index) == 1, nele_ == 0, met_ > 250.0, len(pass_jet_index_cleaned) >= 3, len(pass_bjetM_eta2p4_index) == 0, mt_mu[muidx] < 160., (nTau == 0) ]) if len(pass_jet_index_cleaned) >= 3: j1idx = pass_jet_index_cleaned[0] j2idx = pass_jet_index_cleaned[1] j3idx = pass_jet_index_cleaned[2] print "object info mu ", wmunu_cr, run, lumi, event, mupt[ muidx], mueta[muidx], muphi[muidx], j1idx, ak4pt[ j1idx], ak4eta[j1idx], ak4phi[j1idx], j2idx, ak4pt[ j2idx], ak4eta[j2idx], ak4phi[j2idx], j3idx, ak4pt[ j3idx], ak4eta[j3idx], ak4phi[ j3idx], met_, mt_mu[pass_mu_index[0]], [ len(pass_mu_index) == 1, nmu_ == 1, met_ > 250.0, len(pass_jet_index) >= 3, len(pass_bjetM_eta2p4_index) == 0, mt_mu[pass_mu_index[0]] < 160. ] if wmunu_cr: df_out_wmunu_cr = df_out_wmunu_cr.append( { 'run': run, 'lumi': lumi, 'event': event, 'MET': met_, 'MT': mt_mu[muidx], 'Njets_PassID': len(pass_jet_index_cleaned), 'Nbjets_PassID': len(pass_bjetM_eta2p4_index), 'Jet1Pt': ak4pt[j1idx], 'Jet1Eta': ak4eta[j1idx], 'Jet1Phi': ak4phi[j1idx], 'Jet2Pt': ak4pt[j2idx], 'Jet2Eta': ak4eta[j2idx], 'Jet2Phi': ak4phi[j2idx], 'Jet3Pt': ak4pt[j3idx], 'Jet3Eta': ak4eta[j3idx], 'Jet3Phi': ak4phi[j3idx], 'Jet1Idx': j1idx, 'Jet2Idx': j2idx, 'Jet3Idx': j3idx, 'MuPt': mupt[muidx], 'MuEta': mueta[muidx], 'MuPhi': muphi[muidx], 'weight': cross_section_weight }, ignore_index=True) df_all = concat([df_all, df]) if debug_: print df_out outputfilename = args.outputfile df_out.to_root(outputfilename, key='t_dm_wenucr') df_out_wmunu_cr.to_root(outputfilename, key='t_dm_wmunucr', mode='a') end = time.clock() print "%.4gs" % (end - start)
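# runtdm leans on helpers (getPt, getEta, getPhi, getMT, Delta_R, WhereIsTrue and
# the logical_* utilities) whose definitions are not part of this excerpt. The
# sketch below shows what they presumably compute, using standard collider
# kinematics; treat the exact semantics (especially WhereIsTrue's minimum-length
# argument) as assumptions, not the project's actual implementations.
import numpy as np

def getPt(px, py):
    # transverse momentum
    return np.hypot(px, py)

def getPhi(px, py):
    # azimuthal angle
    return np.arctan2(py, px)

def getEta(px, py, pz):
    # pseudorapidity, eta = asinh(pz / pT)
    return np.arcsinh(pz / np.hypot(px, py))

def getMT(pt_lep, met, phi_lep, phi_met):
    # transverse mass of the lepton + MET system
    return np.sqrt(2.0 * pt_lep * met * (1.0 - np.cos(phi_lep - phi_met)))

def Delta_R(eta1, eta2, phi1, phi2):
    # angular separation, with delta-phi wrapped into [-pi, pi]
    dphi = (phi1 - phi2 + np.pi) % (2.0 * np.pi) - np.pi
    return np.sqrt((eta1 - eta2) ** 2 + dphi ** 2)

def WhereIsTrue(flags, n=0):
    # positions of True entries; per the comment in the original, the operation
    # is only meaningful when the list has at least n entries (assumed semantics)
    return [i for i, f in enumerate(flags) if f] if len(flags) >= n else []

def logical_AND_List2(a, b):
    return [x and y for x, y in zip(a, b)]

def logical_AND_List3(a, b, c):
    return [x and y and z for x, y, z in zip(a, b, c)]

def logical_OR(flags):
    return any(flags)

def logical_AND(flags):
    return all(flags)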
precisiondf = DataFrame() f1df= DataFrame() recalldf = DataFrame() accuracydf= DataFrame() for i in range(100): np.random.seed(i) random_label = np.random.randint(0,4,size=len(test_set_true_label)) precision = precision_score(test_set_true_label, random_label,labels=[0,1,2,3], average=None) recall = recall_score(test_set_true_label, random_label, labels=[0,1,2,3],average=None) f1 = f1_score(test_set_true_label, random_label, labels=[0,1,2,3],average=None) random_confusion_matrix = confusion_matrix(test_set_true_label, random_label, labels=[0, 1, 2, 3]) random_confusion_matrix = random_confusion_matrix.astype('float') / random_confusion_matrix.sum(axis=1)[:, np.newaxis] accuracy = random_confusion_matrix.diagonal() precisiondf = precisiondf.append(pd.Series(precision), ignore_index=True) recalldf = recalldf.append(pd.Series(recall), ignore_index=True) f1df = f1df.append(pd.Series(f1), ignore_index=True) accuracydf = accuracydf.append(pd.Series(accuracy), ignore_index=True) print("Accuracy Score for Class 0: {}, Class 1: {}, Class 2: {}, Class 3: {}".format( mean(list(accuracydf[0])), mean(list(accuracydf[1])), mean(list(accuracydf[2])), mean(list(accuracydf[3])) )) print("Precision Score for Class 0: {}, Class 1: {}, Class 2: {}, Class 3: {}".format( mean(list(precisiondf[0])), mean(list(precisiondf[1])), mean(list(precisiondf[2])), mean(list(precisiondf[3])) )) print("Recall Score for Class 0: {}, Class 1: {}, Class 2: {}, Class 3: {}".format( mean(list(recalldf[0])), mean(list(recalldf[1])), mean(list(recalldf[2])), mean(list(recalldf[3])) )) print("F1 Score for Class 0: {}, Class 1: {}, Class 2: {}, Class 3: {}".format( mean(list(f1df[0])), mean(list(f1df[1])), mean(list(f1df[2])), mean(list(f1df[3])) )) print() print("Accuracy macro average: {}".format((mean(list(accuracydf[0]))+ mean(list(accuracydf[1]))+ mean(list(accuracydf[2]))+mean(list(accuracydf[3])))/4)) print("Precision macro average: {}".format((mean(list(precisiondf[0]))+ mean(list(precisiondf[1]))+ mean(list(precisiondf[2]))+mean(list(precisiondf[3])))/4)) print("Recall macro average: {}".format((mean(list(recalldf[0]))+ mean(list(recalldf[1]))+ mean(list(recalldf[2]))+mean(list(recalldf[3])))/4)) print("F1 macro average: {}".format((mean(list(f1df[0]))+ mean(list(f1df[1]))+ mean(list(f1df[2]))+mean(list(f1df[3])))/4)) test_frequency_percentage = test_set.groupby(test_set['Label']).size()/len(test_set)*100 print('Test Set Relative Label Frequency (%)')
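# The macro averages above are assembled by hand from the per-class means.
# scikit-learn can also return them directly with average='macro', which is a
# handy cross-check (computed per random draw rather than averaged over the
# 100 seeds), reusing test_set_true_label and random_label from above:
from sklearn.metrics import precision_score, recall_score, f1_score

print("Precision macro:", precision_score(test_set_true_label, random_label,
                                           labels=[0, 1, 2, 3], average='macro'))
print("Recall macro:   ", recall_score(test_set_true_label, random_label,
                                        labels=[0, 1, 2, 3], average='macro'))
print("F1 macro:       ", f1_score(test_set_true_label, random_label,
                                    labels=[0, 1, 2, 3], average='macro'))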
# Once everything has been cleaned, sort each store's row in descending order
# and take only the top 6 related entries (i.e. 6 users will be recommended).
second = first[index].sort_values(ascending=False).head(6)
sql = "select email from tripmall_db.user where id=" + str((index + 1))
cursor.execute(sql)
result = cursor.fetchall()
selected = result[0].get('email')
# Iterate over the 6 recommendations (the original looped over first[0].size,
# which would run past the 6-entry series returned by head(6)).
for index2 in range(0, second.size):
    # If you print inside this loop, the keys()/values usage becomes clear.
    sql = "select email from tripmall_db.user where id=" + str(
        (second.keys()[index2] + 1))
    cursor.execute(sql)
    result = cursor.fetchall()
    recommended = result[0].get('email')
    if selected != recommended:
        new_row = {
            'selected_user': selected,
            'recommended_user': recommended,
            'similarity': second.values[index2]
        }
        recommendAuto = recommendAuto.append(new_row, ignore_index=True)
# print(recommendAuto)

engine = create_engine("mysql+mysqldb://root:" + "adminssafy" + "@localhost:3306/tripmall_db",
                       encoding='utf-8')
conn = engine.connect()
recommendAuto.to_sql(name='usersimilaritybased', con=engine, if_exists='append')
print("finish")
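# The similarity frame `first` and the result frame `recommendAuto` are created
# before this excerpt. The sketch below is a purely hypothetical reconstruction
# of how they might be set up (a user-by-user cosine-similarity matrix over a
# toy rating table); the project's actual setup is not shown here.
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

ratings = pd.DataFrame(
    [[5, 0, 3], [4, 0, 4], [0, 5, 1]],   # toy users-by-items rating data
    index=[0, 1, 2],
)
first = pd.DataFrame(cosine_similarity(ratings))   # first[i] ~ similarities to user i
recommendAuto = pd.DataFrame(columns=['selected_user', 'recommended_user', 'similarity'])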
"title", "url", "points", "num_comments", "author", "created_at_i", "objectID" ] i = 0 while True: try: url = 'https://hn.algolia.com/api/v1/search_by_date?tags=story&hitsPerPage=%s&numericFilters=created_at_i<%s' % ( hitsPerPage, ts) req = urllib2.Request(url) response = urllib2.urlopen(req) data = json.loads(response.read()) last = data["nbHits"] < hitsPerPage data = DataFrame(data["hits"])[requested_keys] df = df.append(data, ignore_index=True) ts = data.created_at_i.min() print i if (last): break time.sleep(3.6) i += 1 except Exception, e: print e df["title"] = df["title"].map(lambda x: x.translate( dict.fromkeys([ 0x201c, 0x201d, 0x2011, 0x2013, 0x2014, 0x2018, 0x2019, 0x2026, 0x2032 ])).encode('utf-8').replace(',', '')) df["created_at"] = df["created_at_i"].map(
def main(): """ Main function of the program. This function makes calls to other functions. :return: None """ files = [ 'data/enron_with_categories/1', 'data/enron_with_categories/2', 'data/enron_with_categories/3', 'data/enron_with_categories/4', 'data/enron_with_categories/5', 'data/enron_with_categories/6', 'data/enron_with_categories/7', 'data/enron_with_categories/8', ] data = DataFrame({'text': [], 'class': []}) # create data frame for path in files: data = data.append(build_data_frame(path)) # The classes as mentioned in the categories.txt file, # we ignore the third parameter in the cats file classes = ("1,1", "1,2", "1,3", "1,4", "1,5", "1,6", "1,7", "1,8", "2,1", "2,2", "2,3", "2,4", "2,5", "2,6", "2,7", "2,8", "2,9", "2,10", "2,11", "2,12", "2,13", "3,1", "3,2", "3,3", "3,4", "3,5", "3,6", "3,7", "3,8", "3,9", "3,10", "3,11", "3,12", "3,13", "4,1", "4,2", "4,3", "4,4", "4,5", "4,6", "4,7", "4,8", "4,9", "4,10", "4,11", "4,12", "4,13", "4,14", "4,15", "4,16", "4,17", "4,18", "4,19") # Create pipelines for each combination of text_extraction and classifier pipeline = Pipeline([('text_extraction', CountVectorizer(ngram_range=(2, 2))), ('classifier', OneVsRestClassifier(MultinomialNB()))]) print("text_extraction: ", "CountVectorizer", "classifier:", "MultinomialNB") classify(pipeline, classes, data) pipeline = Pipeline([('text_extraction', CountVectorizer(ngram_range=(1, 2))), ('classifier', OneVsRestClassifier(LinearSVC()))]) print("text_extraction: ", "CountVectorizer", "classifier:", "LinearSVC") classify(pipeline, classes, data) pipeline = Pipeline([ ('text_extraction', CountVectorizer(ngram_range=(1, 2))), ('classifier', OneVsRestClassifier(KNeighborsClassifier())) ]) print("text_extraction: ", "CountVectorizer", "classifier:", "KNeighborsClassifier") classify(pipeline, classes, data) pipeline = Pipeline([('text_extraction', TfidfVectorizer(ngram_range=(1, 2))), ('classifier', OneVsRestClassifier(MultinomialNB()))]) print("text_extraction: ", "TfidfVectorizer", "classifier:", "MultinomialNB") classify(pipeline, classes, data) pipeline = Pipeline([('text_extraction', TfidfVectorizer(ngram_range=(1, 2))), ('classifier', OneVsRestClassifier(LinearSVC()))]) print("text_extraction: ", "TfidfVectorizer", "classifier:", "LinearSVC") classify(pipeline, classes, data) pipeline = Pipeline([ ('text_extraction', TfidfVectorizer(ngram_range=(1, 2))), ('classifier', OneVsRestClassifier(KNeighborsClassifier())) ]) print("text_extraction: ", "TfidfVectorizer", "classifier:", "KNeighborsClassifier") classify(pipeline, classes, data)
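# The six pipeline/classify calls above differ only in the vectorizer and
# classifier, so the same sweep can be written as a loop. This sketch reuses
# classify, classes and data from main() and normalizes every vectorizer to
# ngram_range=(1, 2), whereas the first pipeline above used (2, 2).
from itertools import product

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

vectorizers = [("CountVectorizer", CountVectorizer(ngram_range=(1, 2))),
               ("TfidfVectorizer", TfidfVectorizer(ngram_range=(1, 2)))]
classifiers = [("MultinomialNB", MultinomialNB()),
               ("LinearSVC", LinearSVC()),
               ("KNeighborsClassifier", KNeighborsClassifier())]

for (vec_name, vec), (clf_name, clf) in product(vectorizers, classifiers):
    pipeline = Pipeline([('text_extraction', vec),
                         ('classifier', OneVsRestClassifier(clf))])
    print("text_extraction:", vec_name, "classifier:", clf_name)
    classify(pipeline, classes, data)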
def test_append_series_dict(self): df = DataFrame(np.random.randn(5, 4), columns=['foo', 'bar', 'baz', 'qux']) series = df.loc[4] with tm.assert_raises_regex(ValueError, 'Indexes have overlapping values'): df.append(series, verify_integrity=True) series.name = None with tm.assert_raises_regex( TypeError, 'Can only append a Series if ' 'ignore_index=True'): df.append(series, verify_integrity=True) result = df.append(series[::-1], ignore_index=True) expected = df.append(DataFrame({ 0: series[::-1] }, index=df.columns).T, ignore_index=True) assert_frame_equal(result, expected) # dict result = df.append(series.to_dict(), ignore_index=True) assert_frame_equal(result, expected) result = df.append(series[::-1][:3], ignore_index=True) expected = df.append(DataFrame({ 0: series[::-1][:3] }).T, ignore_index=True, sort=True) assert_frame_equal(result, expected.loc[:, result.columns]) # can append when name set row = df.loc[4] row.name = 5 result = df.append(row) expected = df.append(df[-1:], ignore_index=True) assert_frame_equal(result, expected)
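# The test above exercises DataFrame.append with a Series; on pandas 2.x, where
# append no longer exists, the same row-append is spelled with pd.concat. A
# small sketch of the equivalent operation:
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(5, 4), columns=['foo', 'bar', 'baz', 'qux'])
row = df.loc[4]
# Transpose the one-column frame so the Series values line up with df's columns.
result = pd.concat([df, row.to_frame().T], ignore_index=True)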
set_index(): set a column as the index. DataFrame.set_index(keys, drop=True, append=False, inplace=False) — append=True keeps the old index and adds the new one; drop=False keeps the column that was turned into the index; inplace=True modifies the original frame.
reset_index(): restore the default integer index. df.reset_index(level=None, drop=False, inplace=False, col_level=0, col_fill='')
Combining two frames with the same structure: pandas.concat([dataFrame1, dataFrame2, ...]); df.append(df2, ignore_index=True)  # append df2 to df; pandas.concat([df1, df2], ignore_index=True)  # ignore_index lets the index simply continue.
Combining different columns within one frame: X = x1 + x2 + ...
Matching/merging frames with different structures on a key, e.g. table 1: name, student ID; table 2: student ID, advisor — merge(x, y, left_on, right_on)  # the last two arguments name the matching columns in the two tables.
Min-max (deviation) normalization: X* = (x - min)/(max - min)
Z-score standardization: X* = (x - μ)/σ
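# A short runnable illustration of the two scalings noted above, on a made-up
# column name 'x' (not from the original notes):
import pandas as pd

df = pd.DataFrame({'x': [10.0, 20.0, 30.0, 40.0]})

# Min-max normalization: X* = (x - min) / (max - min)
df['x_minmax'] = (df['x'] - df['x'].min()) / (df['x'].max() - df['x'].min())

# Z-score standardization: X* = (x - mean) / std
df['x_zscore'] = (df['x'] - df['x'].mean()) / df['x'].std()

print(df)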
def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'message': message, 'class': classification})
        index.append(filename)
    return DataFrame(rows, index=index)


data = DataFrame({'message': [], 'class': []})
data = data.append(dataFrameFromDirectory(
    'C:/Users/Barath Tirumala/Desktop/DSC/DataScience/DataScience-Python3/emails/spam',
    'spam'), sort=True)
data = data.append(dataFrameFromDirectory(
    'C:/Users/Barath Tirumala/Desktop/DSC/DataScience/DataScience-Python3/emails/ham',
    'ham'), sort=True)

vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)
classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets)

examples = ['Free Viagra now!!!', "Hi Bob, how about a game of golf tomorrow?"]
example_counts = vectorizer.transform(examples)
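# The snippet stops after vectorizing the two example messages; presumably the
# next step is to score them with the fitted classifier, along these lines:
predictions = classifier.predict(example_counts)
print(predictions)  # expected to be something like ['spam' 'ham'], depending on the training data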
def getPRChangeTriggerData(owner, repo):
    """ Derive PR change_trigger data from ALL_{repo}_data_prtimeline.tsv """
    AsyncApiHelper.setRepo(owner, repo)
    """PRTimeLine header columns"""
    PR_CHANGE_TRIGGER_COLUMNS = ["pullrequest_node", "user_login", "comment_node", "comment_type",
                                 "change_trigger", "filepath"]
    """Initialize the target file"""
    target_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_pr_change_trigger.tsv'
    target_content = DataFrame(columns=PR_CHANGE_TRIGGER_COLUMNS)
    # pandasHelper.writeTSVFile(target_filename, target_content, pandasHelper.STR_WRITE_STYLE_APPEND_NEW,
    #                           header=pandasHelper.INT_WRITE_WITH_HEADER)

    """Read the PRTimeline file to get the list of PRs whose change_trigger needs analyzing"""
    pr_timeline_filename = projectConfig.getPRTimeLineDataPath() + os.sep + f'ALL_{repo}_data_prtimeline.tsv'
    pr_timeline_df = pandasHelper.readTSVFile(fileName=pr_timeline_filename,
                                              header=pandasHelper.INT_READ_FILE_WITH_HEAD)
    """Read the PullRequestData file to get the author of each PR"""
    pr_data_filename = projectConfig.getPullRequestPath() + os.sep + f'ALL_{repo}_data_pullrequest.tsv'
    pr_data_df = pandasHelper.readTSVFile(fileName=pr_data_filename,
                                          header=pandasHelper.INT_READ_FILE_WITH_HEAD)

    """Collect each PR's author, used later to filter out comments made by the author"""
    pr_author_map = {}
    for index, row in pr_data_df.iterrows():
        pr_author_map[row['node_id']] = row['user_login']

    pr_nodes = list(set(list(pr_timeline_df['pullrequest_node'])))
    pr_nodes.sort()
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MjE5MjEzOTc5']  # reopened 3 times
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MjA0MTk5ODkw']
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0NDQwOTAxMzk0']
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MzE1OTU0NDgw']  # review outside the PR
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MTQ3NDczNTIx']  # ordinary case
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0NDM4NjAzMjk2']  # very many reviews
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0Mjg1NzExNTIx']
    # pr_nodes = ['MDExOlB1bGxSZXF1ZXN0MTAxNTUwMTcw']

    """Set fetch parameters"""
    pos = 0
    fetchLimit = 400
    size = pr_nodes.__len__()
    Logger.logi("there are {0} prs to analyze".format(pr_nodes.__len__()))
    t1 = datetime.now()
    while pos < size:
        print("now:", pos, ' total:', size, 'cost time:', datetime.now() - t1)
        Logger.logi("start: {0}, end: {1}, all: {2}".format(pos, pos + fetchLimit, size))
        """Take a subset of PRs according to the fetch limit"""
        sub_prs = pr_nodes[pos:pos + fetchLimit]
        pr_timeline_items = pr_timeline_df[pr_timeline_df['pullrequest_node'].isin(sub_prs)]
        """Group the subset by pullrequest_node"""
        grouped_timeline = pr_timeline_items.groupby((['pullrequest_node']))
        """Store the grouped result as a dictionary {pr -> pr_timeline_items}"""
        formated_data = []
        for pr, group in grouped_timeline:
            record = group.to_dict(orient='records')
            record = sorted(record, key=lambda x: int(x.get(StringKeyUtils.STR_KEY_POSITION)))
            formated_data.append(record)
        """Analyze the timelines of these PRs"""
        pr_change_trigger_comments = AsyncProjectAllDataFetcher.analyzePullRequestReview(formated_data,
                                                                                         pr_author_map)
        pr_change_trigger_comments = [x for y in pr_change_trigger_comments for x in y]
        """De-duplicate the analysis results and append them to the change_trigger table"""
        if pr_change_trigger_comments.__len__() > 0:
            target_content = DataFrame()
            target_content = target_content.append(pr_change_trigger_comments, ignore_index=True)
            target_content = target_content[PR_CHANGE_TRIGGER_COLUMNS].copy(deep=True)
            target_content.drop_duplicates(subset=['pullrequest_node', 'comment_node'], inplace=True, keep='first')
            if not target_content.empty:
                pandasHelper.writeTSVFile(target_filename, target_content,
                                          pandasHelper.STR_WRITE_STYLE_APPEND_NEW,
                                          header=pandasHelper.INT_WRITE_WITHOUT_HEADER)
        Logger.logi("successfully analyzed {0} prs".format(pos))
        pos += fetchLimit
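# Since pr_change_trigger_comments is already a flat list of dicts, the
# target_content frame can also be built in one step by passing it straight to
# the DataFrame constructor, which sidesteps the append call. A sketch reusing
# pr_change_trigger_comments and PR_CHANGE_TRIGGER_COLUMNS from the function above:
from pandas import DataFrame

target_content = DataFrame(pr_change_trigger_comments)
target_content = target_content[PR_CHANGE_TRIGGER_COLUMNS].copy(deep=True)
target_content.drop_duplicates(subset=['pullrequest_node', 'comment_node'],
                               inplace=True, keep='first')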