def get_link(url):
    link_exr = re.compile(r'<a.*?\s*href=\"(.*?)\".*?>(.*?)</a>')
    links = []
    # open web content
    f = urllib2.urlopen(url)
    content = f.read()
    # HTML-tag version: find all URLs and save them to links
    # soup = BeautifulSoup(content, "lxml")
    # for a in soup.find_all('a', href=True):
    #     if "detik.com" in a['href']:
    #         if "http:" not in a['href']:
    #             a['href'] = "http:" + a['href']
    #         print "Found the URL:", a['href']
    #         links.append(a['href'])
    # regex version: find all URLs and save them to links
    for link in link_exr.findall(content):
        if "detik.com" in link[0]:
            link_detik = link[0]
            if "http:" not in link_detik:
                link_detik = "http:" + link_detik
            links.append(link_detik)
    # save to DataFrame
    df = DataFrame(links, columns=['detik url'])
    df = df.drop_duplicates()  # drop_duplicates returns a new frame; assign it
    print df.head()  # head(0) would print an empty frame
    # create and save to a MySQL database
    detik_db = create_engine("mysql://*****:*****@localhost/data_detik")
    df.to_sql('url_detik', detik_db, if_exists='replace')
def post(self):
    post = json.loads(self.request.body)

    MyClient = riak.RiakClient(protocol=RIAK_PROTOCOL,
                               http_port=RIAK_HTTP_PORT,
                               host=RIAK_HOST)
    MyAdminBucket = MyClient.bucket(ADMIN_BUCKET_NAME)

    connection = None
    for c in MyAdminBucket.get('connection').data:
        if c['slug'] == post.get('connection', None):
            connection = c['connection']

    sql = """SELECT * FROM ({}) AS CUBE LIMIT 10;""".format(
        post.get('sql', None))

    e = create_engine(connection)
    connection = e.connect()
    try:
        resoverall = connection.execute(text(sql))
    except Exception:
        self.write({'sql': '', 'msg': 'Error!'})
        self.finish()
        return  # without this, execution fell through to the DataFrame code

    df = DataFrame(resoverall.fetchall())
    if df.empty:
        self.finish()
        return  # stop here; there is nothing to serialise
    df.columns = resoverall.keys()
    df.head()

    self.write({'sql': df.to_json(orient='records'), 'msg': 'Success!'})
    self.finish()
def run(cube_slug=None):
    mc = memcache.Client(['127.0.0.1:11211'], debug=0)
    for cube in MyAdminBucket.get('cube').data:
        try:
            slug = cube['slug']

            if cube_slug and cube_slug != slug:
                continue

            sql = """SELECT * FROM ({}) AS CUBE;""".format(cube['sql'])
            for c in MyAdminBucket.get('connection').data:
                if c['slug'] == cube['connection']:
                    connection = c['connection']

            print "\n# CLEAN MEMCACHE/RIAK: {}".format(slug)
            mc.delete(str(slug))
            mc.delete(str('{}-columns'.format(slug)))

            MyBucket.new(slug, data='').store()
            MyBucket.new(u'{}-columns'.format(slug), data='').store()
            MyBucket.new(u'{}-connect'.format(slug), data='').store()
            MyBucket.new(u'{}-sql'.format(slug), data='').store()

            print "# CONNECT IN RELATION DATA BASE: {}".format(slug)
            e = create_engine(connection)
            connection = e.connect()
            resoverall = connection.execute(text(sql))

            print "# LOAD DATA ON DATAWAREHOUSE: {}".format(slug)
            df = DataFrame(resoverall.fetchall())
            if df.empty:
                print '[warning] Empty cube: {}!!'.format(cube)
                return
            df.columns = resoverall.keys()
            df.head()

            pdict = map(fix_render, df.to_dict(outtype='records'))

            print "# SAVE DATA (JSON) ON RIAK: {}".format(slug)
            MyBucket.new(slug, data=pdict).store()

            print "# SAVE COLUMNS ON RIAK: {}".format(slug)
            MyBucket.new(u'{}-columns'.format(slug),
                         data=json.dumps([c for c in df.columns])).store()

            print "# SAVE CONNECT ON RIAK: {}".format(slug)
            MyBucket.new(u'{}-connect'.format(slug), data=c).store()

            print "# SAVE SQL ON RIAK: {}".format(slug)
            MyBucket.new(u'{}-sql'.format(slug), data=sql).store()

            print "# CLEAN MEMORY: {}\n".format(slug)
            del pdict, df
            gc.collect()
        except:
            pass

    print "## FINISH"
    return True
def test_setitem_chained_setfault(self):

    # GH6026
    # setfaults under numpy 1.7.1 (ok on 1.8)

    data = ['right', 'left', 'left', 'left', 'right', 'left', 'timeout']
    mdata = ['right', 'left', 'left', 'left', 'right', 'left', 'none']

    df = DataFrame({'response': np.array(data)})
    mask = df.response == 'timeout'
    df.response[mask] = 'none'
    tm.assert_frame_equal(df, DataFrame({'response': mdata}))

    recarray = np.rec.fromarrays([data], names=['response'])
    df = DataFrame(recarray)
    mask = df.response == 'timeout'
    df.response[mask] = 'none'
    tm.assert_frame_equal(df, DataFrame({'response': mdata}))

    df = DataFrame({'response': data, 'response1': data})
    mask = df.response == 'timeout'
    df.response[mask] = 'none'
    tm.assert_frame_equal(df, DataFrame({'response': mdata,
                                         'response1': data}))

    # GH 6056
    expected = DataFrame(dict(A=[np.nan, 'bar', 'bah', 'foo', 'bar']))
    df = DataFrame(dict(A=np.array(['foo', 'bar', 'bah', 'foo', 'bar'])))
    df['A'].iloc[0] = np.nan
    result = df.head()
    tm.assert_frame_equal(result, expected)

    df = DataFrame(dict(A=np.array(['foo', 'bar', 'bah', 'foo', 'bar'])))
    df.A.iloc[0] = np.nan
    result = df.head()
    tm.assert_frame_equal(result, expected)
class MetrilyxAnalyticsSerie(MetrilyxSerie, BasicAnalyticsSerie):

    def __init__(self, serie, graphType="line", dataCallback=None):
        super(MetrilyxAnalyticsSerie, self).__init__(serie, dataCallback)
        self.graphType = graphType
        self._istruct = None
        if not self.error:
            self._istruct = self.__getInternalStruct()
            self.__applyTransform()

    def __getInternalStruct(self):
        out = []
        for d in self._serie['data']:
            out.append((d['uuid'],
                        Series([d['dps'][k] for k in sorted(d['dps'].keys())],
                               index=to_datetime([int(ts) for ts in sorted(d['dps'].keys())],
                                                 unit='s'))))
        return DataFrame(dict(out))

    def __getSerieMetadata(self, serie):
        return dict([(k, v) for k, v in serie.items() if k != 'dps'])

    def data(self, ts_unit='ms'):
        if self.error:
            return {'error': self.error}
        out = []
        for s in self._serie['data']:
            md = self.__getSerieMetadata(s)
            logger.error("HERE %s" % (s['uuid']))
            datapoints = self._getDataSerieDps(self._serie['query']['aggregator'],
                                               self._istruct[s['uuid']], ts_unit)
            error = self._dataHasErrors(datapoints)
            if not error:
                md['dps'] = datapoints
                md['uuid'] = SerieUUID(s).uuid
            else:
                #s = {'error': error}
                logger.warning("Error assembling data: %s" % (str(error)))  # was str(e): undefined name
            out.append(md)
        return out

    def __applyTransform(self):
        if self._serie['yTransform'] != "":
            try:
                self._istruct = eval("%s" % (self._serie['yTransform']))(self._istruct)
                if isinstance(self._istruct, Series):
                    logger.error("Need to handle 'Series'")
                    logger.error(self._serie['alias'])
                    self._istruct = DataFrame({self._serie['alias']: self._istruct})
                    print self._istruct.head()
            except Exception, e:
                logger.warn("Could not apply yTransform: %s" % (str(e)))
def main():
    loc_char_count = create_data_array()
    # print loc_char_count
    count_sum = loc_char_count.sum(axis=0)
    # print count_sum
    count_sum[count_sum == 0] = 1
    # print count_sum
    test = loc_char_count / count_sum
    df = DataFrame(test)
    df.to_csv('./data/char_loc_freq.csv')
    print df.head(4)
    print df.ix[3][2]  # .ix is an indexer: use brackets, not a call
def getPostData(fbGraph, entry):
    global CHART_LIMIT
    retrieved = False
    i = 0
    while not retrieved:
        i += 1
        try:
            posts = fbGraph.get_object(entry['page'] + '/posts',
                                       limit=CHART_LIMIT * 15)['data']
            retrieved = True
        except facebook.GraphAPIError:
            print "Failed retrieving Graph object from facebook, retrying..."
        if i > 14:
            print "Giving up"
            return None

    frame = DataFrame(posts)  # later, maybe output this frame for further study
    postData = DataFrame(columns=('Date', 'Likes', 'Shares'))
    postData['Shares'] = frame['shares'].map(fmtShares)
    postData['Likes'] = frame['id'].map(fmtLikes)
    postData['Date'] = frame['created_time'].map(fmtDate)
    postData = postData.groupby(by='Date', sort=False).mean()
    postData = postData.head(n=CHART_LIMIT)
    postData = postData.fillna(value=0)  # fillna returns a copy; assign it
    return postData
def gonzales(data, k):
    # transform the data numpy array to a data frame, using the id as index
    points_list = DataFrame(data[:, 1:], index=data[:, 0])
    # add two columns to the points data frame for saving the centers and distances
    points_list["distance"] = np.nan
    points_list["center"] = np.nan
    distance_column_index = points_list.columns.get_loc("distance")
    # choosing a random point as the first center
    # center0 = points_list.sample(n=1, random_state=randint(0, 100), axis=0)
    center0 = points_list.head(1)
    centers_list = DataFrame(center0.drop(['distance', 'center'], axis=1))
    centers_list['color'] = 'r'
    colors = "bgcmykw"
    #===========================================================================
    # print(centers_list)
    # print("==============Initialization finished===========")
    #===========================================================================
    # loop k times to pick the next centers (the extra last one is dropped below)
    for k_cycle in range(1, k + 1):
        # variables to save the next center to be chosen, based on the maximum
        # distance a point has to its cluster
        max_distance = 0
        next_cluster = np.nan
        # loop over all the points to assign them to their closest center
        for indexp, p in points_list.iterrows():
            # variables to save the closest center
            min_cluster_distance = math.inf
            closest_cluster = None
            for indexc, center in centers_list.iterrows():
                dis = spatial.distance.euclidean(center.as_matrix(columns=[0, 1]),
                                                 p.as_matrix(columns=[0, 1]))
                if dis < min_cluster_distance:
                    min_cluster_distance = dis
                    closest_cluster = indexc
            p["distance"] = min_cluster_distance
            p["center"] = closest_cluster
            if min_cluster_distance > max_distance:
                max_distance = min_cluster_distance
                next_cluster = indexp
        centers_list = centers_list.append(
            points_list.ix[[next_cluster], :distance_column_index])
        centers_list.set_value(next_cluster, 'color', colors[k_cycle])
        #=======================================================================
        # print(centers_list)
        # print("==============Cycle finished===========")
        #=======================================================================
    centers_list.drop(centers_list.tail(1).index, inplace=True)
    centers_list.drop(['color'], axis=1, inplace=True)
    #===========================================================================
    # centers_list.plot(kind='scatter', x=0, y=1, c='r')
    # points_list.plot(kind='scatter', x=0, y=1, c='center',
    #                  s=points_list['center'] * 2)
    # plt.show()
    #===========================================================================
    # print(points_list)
    return centers_list.as_matrix(columns=[0, 1])
def setup(self, axis):
    N = 1000
    s = Series(N, index=tm.makeStringIndex(N))
    self.series = [s[i:-i] for i in range(1, 10)] * 50
    self.small_frames = [DataFrame(np.random.randn(5, 4))] * 1000
    df = DataFrame({'A': range(N)},
                   index=date_range('20130101', periods=N, freq='s'))
    self.empty_left = [DataFrame(), df]
    self.empty_right = [df, DataFrame()]
    self.mixed_ndims = [df, df.head(N // 2)]
def select_best(clstruct,
                scorenames=['sensitivity', 'mmr', 'aupr', 'cliqueness_3_20',
                            'nonov_iter', 'n_proteins', 'n_complexes_3_20'],
                rfunc=operator.add, use_norm=False, dispn=15,
                score_factors=None, use_ranks=True, output_ranks=False,
                print_ranks=False, require_scores=None):
    cxstructs, stats = clstruct.cxstructs, clstruct.stats
    clusts = [cxstr.cxs for cxstr in cxstructs]
    scorenames = scorenames or list(stats.dtype.names)
    stats = stats[scorenames]
    ranks = rank_columns(stats)
    if use_ranks:
        stats = ranks
    else:
        if use_norm:
            stats = norm_columns(stats)
        if score_factors:
            stats = rescale_columns(stats, score_factors)
    inds = np.argsort(reduce(rfunc, [stats[n] for n in scorenames]))[::-1]
    if require_scores is not None:
        for req_name, thresh in require_scores:
            thresh = (np.median(clstruct.stats[req_name])
                      if thresh is None else thresh)
            inds = [i for i in inds if clstruct.stats[req_name][i] > thresh]
    nstats = len(stats)

    def filt_params(s):
        return " ".join([p[:2] + p.split('=')[1] for p in s.split(',')])

    show_columns = (scorenames if require_scores is None
                    else scorenames + ut.i0(require_scores))
    d = DataFrame(clstruct.stats[inds[:dispn]][show_columns],
                  index=["#%s: %sc %si %s" % (i, len(clusts[i]),
                                              len(cxstructs[i].cxppis),
                                              filt_params(cxstructs[i].params))
                         for i in inds[:dispn]])
    print d.head(dispn)
    for i in inds[:dispn]:
        #print (i, ["%0.4f " % s for s in clstruct.stats[i]], len(clusts[i]),
        #       len(cxstructs[i].cxppis), cxstructs[i].params)
        if print_ranks:
            print i, [nstats - s for s in ranks[i]]
    if output_ranks:
        return inds
    else:
        return clusts[inds[0]], cxstructs[inds[0]].cxppis, inds[0]
def download_data(order_set, save_set):
    import numpy as np
    for i in range(len(table_name_set)):
        cursor.execute(order_set[i])
        rows = cursor.fetchall()
        # print(len(rows))
        # print(cursor.description)
        rows = np.array(rows).reshape(len(rows), len(rows[0]))
        df = DataFrame(rows, columns=[i[0] for i in cursor.description])
        # print([i[0] for i in cursor.description])
        df.to_csv(save_set[i], index=None, encoding='GB2312')
        print(df.head())
    cnxn.commit()
class TestPerfectData(unittest.TestCase):
    """what happens with nice data"""

    def setUp(self):
        index = date_range('1/1/2015', periods=365)
        self.df = DataFrame(list(range(len(index))),
                            index=index, columns=['value'])
        self.dataset = Dataset(self.df, 60 * 60 * 24, cumulative=False)

    def test_validates(self):
        self.assertTrue(self.dataset.validate())

    def test_partial_validates(self):
        """cut the data up and it still works"""
        d = Dataset(self.df.head(100), 60 * 60 * 24, cumulative=False)
        self.assertTrue(d.validate())

    def test_short_raises(self):
        """single value datasets raise an error"""
        d = Dataset(self.df.head(1), 60 * 60 * 24, cumulative=False)
        self.assertRaises(ShortDatasetError, d.validate)

    def test_interpolate_skipped(self):
        d2 = self.dataset.interpolate()
        self.assertEqual(self.dataset, d2)
def make_submission(path, params, threshold_ratio):
    X_train, w_train, y_train = load_training_data()
    indexes_test, X_test = load_test_data()
    y_out = fit_predict(X_train, w_train, y_train, X_test, params)
    y_pred, rank = get_y_pred_rank(y_out, threshold_ratio)
    submission = DataFrame({'EventId': indexes_test,
                            'RankOrder': rank,
                            'Class': y_pred},
                           columns=['EventId', 'RankOrder', 'Class'])
    submission['Class'] = submission['Class'].apply(lambda x: 's' if x else 'b')
    submission.to_csv(path, index=False)
    print('--------------------- Submission')
    print(submission.head())
    print(path)
    return submission
def process(self, start_time: datetime, end_time: datetime, input: DataFrame):
    if (self._args is not None and len(self._args) > 2) or \
       (len(self._args) != 0 and not isinstance(self._args[0], QueryFunction)):
        raise ValueError('Invalid argument to absolute value function')

    # get the data
    data = input if len(self._args) == 0 else \
        self._args[0].process(start_time, end_time, input)

    ret = None

    # go through each column, take its absolute value, and add it to the result
    for col in data.columns:
        abs_col = data[col].abs()  # absolute value of each value in the column
        abs_col.name = 'abs ' + col  # update the name

        if ret is None:
            ret = DataFrame(abs_col)
        else:
            ret = ret.combine_first(DataFrame(abs_col))  # add it to our return value

    print(ret.head())
    return ret
def process(self, start_time: datetime, end_time: datetime, input: DataFrame):
    if str(self.name) not in '+-*/':
        raise ValueError("Unknown math function: " + str(self.name))

    ret = DataFrame()

    # two args means we're doing A <op> B
    if len(self._args) == 2:
        left = self._args[0].process(start_time, end_time, input) \
            if isinstance(self._args[0], QueryFunction) else self._args[0]
        right = self._args[1].process(start_time, end_time, input) \
            if isinstance(self._args[1], QueryFunction) else self._args[1]

        for l_col in left.columns:
            for r_col in right.columns:
                if self.name == '+':
                    t = left[l_col] + right[r_col]
                elif self.name == '-':
                    t = left[l_col] - right[r_col]
                elif self.name == '*':
                    t = left[l_col] * right[r_col]
                elif self.name == '/':
                    t = left[l_col] / right[r_col]
                else:
                    raise ValueError("Unknown operator: " + str(self.name))

                t = DataFrame(t)
                t.columns = [l_col + self.name + r_col]

                print(left.head())
                print(right.head())
                print(t.head())

                ret = ret.combine_first(t)
    else:  # everything is in the input DataFrame
        ret = DataFrame(input.sum(axis=0))
        ret.columns = [' + '.join(input.columns)]

    return ret
# In[11]:

# Calculating moving averages for Gold
ma = DataFrame(values['Date'], columns=['Date'])
ma['Date'] = pd.to_datetime(ma['Date'], format='%Y-%b-%d')
ma['15SMA'] = (values['Gold'] / (values['Gold'].rolling(window=15).mean())) - 1
ma['30SMA'] = (values['Gold'] / (values['Gold'].rolling(window=30).mean())) - 1
ma['60SMA'] = (values['Gold'] / (values['Gold'].rolling(window=60).mean())) - 1
ma['90SMA'] = (values['Gold'] / (values['Gold'].rolling(window=90).mean())) - 1
ma['180SMA'] = (values['Gold'] / (values['Gold'].rolling(window=180).mean())) - 1
ma['90EMA'] = (values['Gold'] / (values['Gold'].ewm(span=90, adjust=True,
                                                    ignore_na=True).mean())) - 1
ma['180EMA'] = (values['Gold'] / (values['Gold'].ewm(span=180, adjust=True,
                                                     ignore_na=True).mean())) - 1
ma = ma.dropna(axis=0)
print(ma.shape)
ma.head()

# In[12]:

# Merging the moving-average values into the feature space
print(data.shape)
data['Date'] = pd.to_datetime(data['Date'], format='%Y-%b-%d')
data = pd.merge(left=data, right=ma, how='left', on='Date')
print(data.shape)
data.isna().sum()

# That was all about features. Now we need to create targets, i.e. what we
# want to predict. Since we are predicting returns, we need to pick a horizon
# over which to predict them. I have chosen 14-day and 22-day horizons because
# shorter horizons tend to be very volatile and lack predictive power. One
# can, however, experiment with other horizons as well.
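# A minimal, hypothetical sketch (not part of the original notebook) of how
# such forward-return targets could be built for the 14-day and 22-day
# horizons mentioned above. It assumes `values['Gold']` is the price series
# used in the cells before; the `target_14d`/`target_22d` column names are
# made up for illustration.
for horizon in (14, 22):
    # future price divided by today's price, minus 1 = forward return
    values['target_{}d'.format(horizon)] = \
        (values['Gold'].shift(-horizon) / values['Gold']) - 1
# the final `horizon` rows have no future price, so their targets are NaN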
def file_load(self, datatype):
    '''
    sales_dict and ads_dict map each country to the directories of its
    advertising data and sales data.
    datatype=True opens the ads data; False opens the sales data.
    start and end carry the time range and may be None; for now they are
    only used when reading sales data.
    Reading sales data works as follows: use os.listdir to find the month
    folder of the data (e.g. 2017.03), match files against the time range,
    and read the data that falls inside it.
    The function returns a DataFrame.
    '''
    ads_dict = {
        'SXDE': '/data/SX/EU/Ads/DE/ads report/',
        'SXES': '/data/SX/EU/Ads/ES/ads report/',
        'SXFR': '/data/SX/EU/Ads/FR/ads report/',
        'SXIT': '/data/SX/EU/Ads/IT/ads report/',
        'SXUK': '/data/SX/EU/Ads/UK/ads report/',
        'SXJP': '/data/SX/Japan/Ads/',
        'SXCA': '/data/SX/North America/Ads/CA/ads report/',
        'SXUS': '/data/SX/North America/Ads/USA/ads report/',
        'HYYDE': '/data/HYY/EU/ads/DE/',
        'HYYES': '/data/HYY/EU/ads/ES/',
        'HYYFR': '/data/HYY/EU/ads/FR/',
        'HYYIT': '/data/HYY/EU/ads/IT/',
        'HYYUK': '/data/HYY/EU/ads/UK/',
        'HYYJP': '/data/HYY/Japan/Ads/',
        'HYYUS': '/data/HYY/North America/ads/USA/ads report/',
        'TXHLDE': '/data/TXHL/EU/ads/DE/',
        'TXHLES': '/data/TXHL/EU/ads/ES/',
        'TXHLFR': '/data/TXHL/EU/ads/FR/',
        'TXHLIT': '/data/TXHL/EU/ads/IT/',
        'TXHLUK': '/data/TXHL/EU/ads/UK/',
        'TXHLJP': '/data/TXHL/Japan/ads/',
        'TXHLCA': '',
        'TXHLUS': '',
    }
    sales_dict = {
        'SXDE': '/data/SX/EU/business report/DE/',
        'SXES': '/data/SX/EU/business report/ES/',
        'SXFR': '/data/SX/EU/business report/FR/',
        'SXIT': '/data/SX/EU/business report/IT/',
        'SXUK': '/data/SX/EU/business report/UK/',
        'SXJP': '/data/SX/Japan/business report/',
        'SXCA': '/data/SX/North America/business report/CA/',
        'SXUS': '/data/SX/North America/business report/USA/',
        'HYYDE': '/data/HYY/EU/business report/DE/',
        'HYYES': '/data/HYY/EU/business report/ES/',
        'HYYFR': '/data/HYY/EU/business report/FR/',
        'HYYIT': '/data/HYY/EU/business report/IT/',
        'HYYUK': '/data/HYY/EU/business report/UK/',
        'HYYJP': '/data/HYY/Japan/business report/',
        'HYYCA': '/data/HYY/North America/business report/CA/',
        'HYYUS': '/data/HYY/North America/business report/USA/',
        'TXHLDE': '/data/TXHL/EU/business report/DE/',
        'TXHLES': '/data/TXHL/EU/business report/ES/',
        'TXHLFR': '/data/TXHL/EU/business report/FR/',
        'TXHLIT': '/data/TXHL/EU/business report/IT/',
        'TXHLUK': '/data/TXHL/EU/business report/UK/',
        'TXHLJP': '/data/TXHL/Japan/business report/',
        'TXHLCA': '',
        'TXHLUS': '',
    }
    if datatype:
        ad_campaign = DataFrame()
        path = 'F:/PycharmFile' + ads_dict[self.store + self.country]  # path to the ads data files
        file_fold = self.end.strftime('%Y') + '.' + self.end.strftime('%m')
        # TODO: build the folder and file names directly; if the file exists,
        # open it, otherwise search for it
        if os.path.isdir(path + file_fold):  # found the month folder
            file_name = "ADs_" + self.store + self.country + "_" + str(self.end.year) + "-" \
                        + str(self.end.month) + "-" + str(self.end.day) + ".txt"
            if self.country == "JP":
                ad_campaign = pd.read_table(path + file_fold + "/" + file_name,
                                            sep='\t', encoding='Shift-JIS')
            else:
                ad_campaign = pd.read_table(path + file_fold + "/" + file_name,
                                            sep='\t', encoding='utf-8')
        return ad_campaign
    else:
        sales_df = DataFrame()
        path = 'F:/PycharmFile' + sales_dict[self.store + self.country]  # path to the sales data files
        delta = (self.end - self.start).days
        for i in range(delta + 1):
            date = (self.start + timedelta(days=i))
            file_name = self.store + self.country + '-' + date.strftime('%y') + '-' + str(date.month) \
                        + '-' + str(date.day) + '.csv'
            for root, subdirs, files in os.walk(path):
                for name in files:
                    if name == file_name:
                        print name
                        file_path = root + '/' + name
                        df = pd.read_csv(file_path, encoding='utf8')
                        sales_df = pd.concat([sales_df, df])
        sales_df = sales_df[[u'(子)ASIN', u'商品名称', u'买家访问次数', u'买家访问次数百分比',
                             u'页面浏览次数', u'页面浏览次数百分比', u'购买按钮页面浏览率',
                             u'已订购商品数量', u'订单商品数量转化率', u'已订购商品销售额',
                             u'订单商品种类数']]
        print sales_df.head()
        return sales_df
def test_dups_fancy_indexing(self):

    # GH 3455
    from pandas.util.testing import makeCustomDataframe as mkdf
    df = mkdf(10, 3)
    df.columns = ['a', 'a', 'b']
    result = df[['b', 'a']].columns
    expected = Index(['b', 'a', 'a'])
    tm.assert_index_equal(result, expected)

    # across dtypes
    df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
                   columns=list('aaaaaaa'))
    df.head()
    str(df)
    result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']])
    result.columns = list('aaaaaaa')

    # TODO(wesm): unused?
    df_v = df.iloc[:, 4]  # noqa
    res_v = result.iloc[:, 4]  # noqa

    tm.assert_frame_equal(df, result)

    # GH 3561, dups not in selected order
    df = DataFrame(
        {
            'test': [5, 7, 9, 11],
            'test1': [4., 5, 6, 7],
            'other': list('abcd')
        }, index=['A', 'A', 'B', 'C'])
    rows = ['C', 'B']
    expected = DataFrame(
        {
            'test': [11, 9],
            'test1': [7., 6],
            'other': ['d', 'c']
        }, index=rows)
    result = df.loc[rows]
    tm.assert_frame_equal(result, expected)

    result = df.loc[Index(rows)]
    tm.assert_frame_equal(result, expected)

    rows = ['C', 'B', 'E']
    expected = DataFrame(
        {
            'test': [11, 9, np.nan],
            'test1': [7., 6, np.nan],
            'other': ['d', 'c', np.nan]
        }, index=rows)

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = df.loc[rows]
    tm.assert_frame_equal(result, expected)

    # see GH5553, make sure we use the right indexer
    rows = ['F', 'G', 'H', 'C', 'B', 'E']
    expected = DataFrame(
        {
            'test': [np.nan, np.nan, np.nan, 11, 9, np.nan],
            'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan],
            'other': [np.nan, np.nan, np.nan, 'd', 'c', np.nan]
        }, index=rows)

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = df.loc[rows]
    tm.assert_frame_equal(result, expected)

    # List containing only missing label
    dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD'))
    with pytest.raises(KeyError):
        dfnu.loc[['E']]

    # ToDo: check_index_type can be True after GH 11497

    # GH 4619; duplicate indexer with missing label
    df = DataFrame({"A": [0, 1, 2]})
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = df.loc[[0, 8, 0]]
    expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
    tm.assert_frame_equal(result, expected, check_index_type=False)

    df = DataFrame({"A": list('abc')})
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = df.loc[[0, 8, 0]]
    expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0])
    tm.assert_frame_equal(result, expected, check_index_type=False)

    # non unique with non unique selector
    df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C'])
    expected = DataFrame({'test': [5, 7, 5, 7, np.nan]},
                         index=['A', 'A', 'A', 'A', 'E'])
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = df.loc[['A', 'A', 'E']]
    tm.assert_frame_equal(result, expected)
def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    i = 0
    for feat in features:
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
        i = i + 1
    outfile.close()

create_feature_map(feature_name)

import operator
xgb_importance = xgb_model.get_fscore(fmap='xgb.fmap')
xgb_importance = sorted(xgb_importance.items(), key=operator.itemgetter(1))
xgb_importance = DF(xgb_importance, columns=['name', 'fscore'])
print(xgb_importance)

online_xgb_set = xgb.DMatrix(online_train[feature_name],
                             label=online_train['label'])
online_xgb_model = xgb.train(params, online_xgb_set,
                             num_boost_round=xgb_model.best_iteration)
ans_xgb = online_xgb_model.predict(xgb.DMatrix(online_data[feature_name]))

submit_xgb = DF()
submit_xgb['id'] = online_data['user_id']

from sklearn.preprocessing import MinMaxScaler
st = MinMaxScaler()
submit_xgb['score'] = st.fit_transform(ans_xgb.reshape(-1, 1))  # rank
# submit_xgb['score'] = ans_xgb  # binary

print(submit_xgb.head(10))
print(submit_xgb['score'].describe())
submit_xgb.to_csv('Submit XGB.txt', index=False, header=False)  # was `submit`, an undefined name
re.findall(r'\W+', test_phrase3)  # sequence of non-alphanumeric characters

# Except for control characters, (+ ? . * ^ $ ( ) [ ] { } | \), all characters
# match themselves. You can escape a control character by preceding it with a
# backslash, in which case you should also use raw strings, otherwise you need
# a double backslash (a short example follows after the McKinney section below).

#==============================================================================
# Categoricals
#==============================================================================
s = pd.Series(['a', 'b', 'c', 'a'], dtype="category")

# this is a great pattern for auto-creation of e.g. age ranges
df = pd.DataFrame({'value': np.random.randint(0, 100, 20)})
labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)]
df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels)
df.head(10)

#==============================================================================
# McKinney's C2 - intro examples
#==============================================================================
path = '/Users/stevegoodman/Documents/Dev/pydata-book-master'
import json
with open(path + '/ch02/usagov_bitly_data2012-03-16-1331923249.txt', 'rb') as f:
    records = [json.loads(line) for line in f]  # records is a list of dicts
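# A tiny illustration of the escaping note above (hypothetical strings, not
# from the original notes): to match a literal dot, escape it; with a raw
# string one backslash suffices, otherwise it must be doubled.
import re
re.findall(r'\.', 'a.b.c')    # raw string: returns ['.', '.']
re.findall('\\.', 'a.b.c')    # same result without a raw string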
def activity_forecast(activity_daily_stats: DataFrame) -> DataFrame:
    return activity_daily_stats.head(100)
rcParams['figure.figsize'] = 10, 8
sb.set_style('whitegrid')

# Load the NN data from the .mat file into a dict.
mat = scipy.io.loadmat(r'/home/dl2020/Python/NeuralNetwork/NNData.mat')
print(mat.keys())

# returns the NNData value from the dict
NNData = mat["NNData"]
print(NNData.shape)

# convert the array to a pandas DataFrame
df = DataFrame(NNData)
df.columns = ['dev._stage', 'dimple_ang.', 'radii_ratio',
              'orientation_ang.', 'area', 'force']
print(df.head(10))

# checking for missing values
print(df.isnull().sum())

# print data information
print(df.info())

# converting categorical variables to dummy indicators
stage = pd.get_dummies(df['dev._stage'], drop_first=False)
print(stage.head(10))
df.drop(['dev._stage'], axis=1, inplace=True)
df = pd.concat([df, stage], axis=1)
print(df.head())

# rename new columns
review_data.loc[review_data["Title"] == "0", "Content"] = "0" print(review_data[review_data["Content"] == "0"].shape) # In[500]: print(review_data[review_data["Content"] != "0"].shape) # In[501]: review_data_except_purchase = DataFrame(review_data[review_data["Content"] != "0"], columns=["Source","Date","Name","Title","Content","Rating"]) print(review_data_except_purchase.shape) review_data_except_purchase.head() # In[502]: def clean_exchange_text(content): content = str(content) word1 = "환불" if word1 in content : return 0 else : return content # In[503]:
def main():
    print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
    print('  Plotting the classification based on Quartiles of File 04.01')
    print('')
    print('------------------------------------------------------------')
    print('  IMPORTANT MEMO: Dependencies:')
    print('  Have you run:')
    print('  1) the file 01.01 to get the csv files?')
    print('  2) the file 04.01 to get the classification?')
    print('')
    print('------------------------------------------------------------')
    print('  Reading the start date, end date and the period:')
    input_file = '00.00.PARAMETERS.txt'
    f_in = open(input_file, 'r')
    # reads starting from the second line and stores each line in a string
    lines = f_in.readlines()[1:]
    print(lines)
    # print(lines[0].split('=', 1)[0])
    start_date = lines[0].split(' ', 1)[0]
    stop_date = lines[1].split(' ', 1)[0]
    period = int(lines[2].split(' ', 1)[0])
    print('  Using the following:')
    print('  start_date (closest to us)       = ' + start_date)
    print('  stop_date (the most in the past) = ' + stop_date)
    print('  period (days)                    = ' + str(period))
    print('')
    if period != 30 and period != 7 and period != 1:
        print('  ERROR: period must be 1, 7 or 30')
        quit()
    f_in.close()

    start = datetime.strptime(start_date, "%Y-%m-%d")
    stop = datetime.strptime(stop_date, "%Y-%m-%d")
    day = start.strftime('%Y-%m-%d')

    print('')
    print('------------------------------------------------------------')
    print('  Folder of the Outputs :')
    folder_RES = '04.02.RES-Quartiles-Plots-e-Graphs'
    print(folder_RES)
    # creating the folder if it does not exist
    if not os.path.exists(folder_RES):
        os.makedirs(folder_RES)
    #quit()

    print('')
    print('  ---------------------------------')
    print('  Where the csv file is taken from')
    folder_input = '04.01.RES-Quartiles'
    print(folder_input)
    csv_file_type = folder_input

    print('')
    print('  ---------------------------------')
    print('  Saving the columns x = log10 audience; y = log10 revenues, color = reach')
    # names of x, y, color to take from var_df
    List_name_col = {'x': 'log_audience',
                     'y': 'log_sum_revenue_euro',
                     'color': 'reach'}
    print(List_name_col)

    while start > stop:
        # print(day)  # start.strftime('%Y-%m-%d')
        # tbl = 'Data.' + day  # name of the table to be written
        print('')
        print('/////////////////////////////////////////////////////////////////////////////////////////////////////')
        #MYFILE = csv_file_type + day + '.csv'  # 'TABLE_v02.02_No-duplicate-funnel.csv'
        #print('Table now (MYFILE) = ' + MYFILE)
        print('  +++ Date now = ' + day)

        name_df = folder_input + '/RES.Date-' + day + '.period-' + str(
            period) + '.clients_id-idxColorReach-idxYRev.01-Unsorted.csv'
        print('  name_df = ' + name_df)

        print('')
        print('  -------------------------------------')
        print('  Reading the database of 04.01 and changing names of columns')
        df_input = pd.read_csv(name_df, na_values=['None'], skiprows=3,
                               skip_blank_lines=True, thousands=',',
                               index_col=False)
        print(df_input.head())
        #quit()

        print('')
        print('  -------------------------------------')
        print('  Changing the names of the columns for practical purposes')
        df_input.rename(columns={
            'index_' + List_name_col['color']: 'index_quartile_reach'
        }, inplace=True)
        df_input.rename(columns={
            'index_' + List_name_col['y']: 'index_quartile_log_rev_eur'
        }, inplace=True)
        print(df_input.head())
        #quit()

        # ---------------------------------------------------------------------
        print('')
        print('-------------------------------------')
        print('  Selecting the clients that are Tier1 according to Criteo')
        # save all coordinates for plots of all points
        x_all = df_input['log_audience']
        y_all = df_input['log_sum_revenue_euro']
        x_tier_1 = []
        y_tier_1 = []
        l_print_tier1 = False
        for index, row in df_input.iterrows():
            if l_print_tier1:
                print(index),
            if df_input['is_tier_1'][index] == 1:
                if l_print_tier1:
                    print(df_input['is_tier_1'][index]),
                # here we are ok
                x_tier_1.append(df_input['log_audience'][index])
                y_tier_1.append(df_input['log_sum_revenue_euro'][index])
        if l_print_tier1:
            print('')
        # labels were swapped in the original: x_all is the full set,
        # x_tier_1 is the Tier 1 subset
        print('Total number of clients now = ' + str(len(x_all)))
        print('Out of which are Tier 1     = ' + str(len(x_tier_1)))

        print('')
        print('-------------------------------------')
        print('  Making a figure with all the points, without Tier 1 highlighted')
        # plot 2d:
        # 5: Audience vs CPC & color = reach
        fig1 = plt.figure()
        plt.title('Clients Segmentation: Date ' + day + ', Period ' + str(period) +
                  '\n number of total clients = ' + str(len(x_all)))
        #+ ' (' + str(int(len(x_tier_1) * 100 / len(x_all))) + '%)')
        ax1 = fig1.add_subplot(1, 1, 1)
        plt.xlabel('log_audience')
        plt.ylabel('log_sum_revenue_euro')
        plt.xlim([2, 8])
        plt.ylim([-2, 6.5])
        # select the x and y -> x2, y2 -> made above: ATTENTION: for this example only
        # multiple series
        # all points
        sc1 = ax1.scatter(x_all, y_all, c=df_input['reach'], cmap='rainbow',
                          vmin=0.0, vmax=1.0, marker="o", label="All Points")
        # Axes objects have no .colorbar(); pass the mappable to pyplot
        plt.colorbar(sc1)
        # points that are tier 1
        #ax1.scatter(x_tier_1, y_tier_1, c='black', marker="1", label="Tier1", s=100)
        plt.legend(loc='upper left')
        plt.grid(True)
        subfolder_now = '/01.01.graph-All-vs-Tier1'
        # creating the folder if it does not exist
        directory = folder_RES + subfolder_now
        if not os.path.exists(directory):
            os.makedirs(directory)
        name_fig1 = folder_RES + subfolder_now + '/Fig.Date-' + day + '.period-' + str(
            period) + '.All.png'
        plt.savefig(name_fig1, format='png')
        print('  ==> Figure now = ' + name_fig1)
        # plt.show()  # caution: it stops the flow of the program
        plt.draw()    # draws without stopping the flow of the program
        plt.clf()     # clear figure
        plt.close()   # close the figure window and continue

        print('')
        print('-------------------------------------')
        print('  Making a figure to compare the Tier 1 vs non-Tier 1')
        # plot 2d:
        # 5: Audience vs CPC & color = reach
        fig1 = plt.figure()
        plt.title('Clients Segmentation: Date ' + day + ', Period ' + str(period) +
                  '\n number of total clients = ' + str(len(x_all)) +
                  ', of which tier 1 = ' + str(len(x_tier_1)) + ' (' +
                  str(int(len(x_tier_1) * 100 / len(x_all))) + '%)')
        ax1 = fig1.add_subplot(1, 1, 1)
        plt.xlabel('log_audience')
        plt.ylabel('log_sum_revenue_euro')
        plt.xlim([2, 8])
        plt.ylim([-2, 6.5])
        # select the x and y -> x2, y2 -> made above: ATTENTION: for this example only
        # multiple series
        # all points
        ax1.scatter(x_all, y_all, c=df_input['reach'], cmap='rainbow',
                    vmin=0.0, vmax=1.0, marker="o", label="All Points")
        # points that are tier 1
        ax1.scatter(x_tier_1, y_tier_1, c='black', marker="1", label="Tier1", s=100)
        plt.legend(loc='upper left')
        plt.grid(True)
        subfolder_now = '/01.01.graph-All-vs-Tier1'
        # creating the folder if it does not exist
        directory = folder_RES + subfolder_now
        if not os.path.exists(directory):
            os.makedirs(directory)
        name_fig1 = folder_RES + subfolder_now + '/Fig.Date-' + day + '.period-' + str(
            period) + '.All-vs-Tier-1.png'
        plt.savefig(name_fig1, format='png')
        print('  ==> Figure now = ' + name_fig1)
        # plt.show()  # caution: it stops the flow of the program
        plt.draw()    # draws without stopping the flow of the program
        plt.clf()     # clear figure
        plt.close()   # close the figure window and continue

        # ---------------------------------------------------------------------
        print('')
        print('-------------------------------------')
        print('  Plotting the Clients highlighted on the graph for each sector')
        # it works for 4 groups

        # for test purposes
        #coord_chosen = [df_input['log_audience'][0], df_input['log_sum_revenue_euro'][0]]
        #print(coord_chosen)
        #x_chosen = coord_chosen[0]
        #x2 = x_chosen
        #y_chosen = coord_chosen[1]
        #y2 = y_chosen

        # cycle on the points
        for i_reach_now in range(0, 4):
            # add 1 because the index starts from 1
            index_reach_now = i_reach_now + 1
            for i_rev_now in range(0, 4):
                index_rev_now = i_rev_now + 1
                #print('++++ reach, rev = ' + str(index_reach_now) + ', ' + str(index_rev_now))

                # initialize the vectors
                # declaration of the coordinates of the points in the group
                x_chosen = []
                y_chosen = []
                l_print = False
                #print('---')
                #print('  Analyzing line by line for idx_reach = ' + str(index_reach_now) +
                #      ', index_log_rev = ' + str(index_rev_now))
                # searching in the lines
                for index, row in df_input.iterrows():
                    if l_print:
                        print(index),
                    if df_input['index_quartile_reach'][index] == index_reach_now:
                        if l_print:
                            print(df_input['index_quartile_reach'][index]),
                        if df_input['index_quartile_log_rev_eur'][index] == index_rev_now:
                            if l_print:
                                print(df_input['index_quartile_log_rev_eur'][index])
                            # here we are ok
                            x_chosen.append(df_input['log_audience'][index])
                            y_chosen.append(df_input['log_sum_revenue_euro'][index])
                if l_print:
                    print('')

                #print('---')
                #print('  Plotting the Clients highlighted on the graph')
                # check
                #print(x_chosen)
                #print(y_chosen)
                x2 = x_chosen
                y2 = y_chosen
                number_clients = len(x_chosen)

                # plot 2d:
                # 5: Audience vs CPC & color = reach
                fig10 = plt.figure()
                plt.title('Clients Segmentation: Date ' + day + ', Period ' + str(period) +
                          '\nIdxReach' + str(index_reach_now) + '-IdxRev' +
                          str(index_rev_now) + '; num clients = ' + str(number_clients))
                ax10 = fig10.add_subplot(1, 1, 1)
                plt.xlabel('log_audience')
                # plt.xlim([-2, 2])
                # ax5.set_xscale('log')  # log scale
                # y = clean_clust_df[POI[4]]*100
                # plt.ylabel(POI[4] + '*100')  # log => no x 100
                plt.ylabel('log_sum_revenue_euro')
                plt.xlim([2, 8])
                plt.ylim([-2, 6.5])
                # select the x and y -> x2, y2 -> made above: ATTENTION: for this example only
                # multiple series
                # all points
                ax10.scatter(x_all, y_all, c=df_input['reach'], cmap='rainbow',
                             vmin=0.0, vmax=1.0, marker="o", label="All Points")
                # points in the group
                #ax10.scatter(x2, y2, c='black', marker="s", label="In the sector", s=100)
                ax10.scatter(x2, y2, facecolors='none', edgecolors='black',
                             marker="s", label="In the sector", s=100)
                # points that are tier 1
                ax10.scatter(x_tier_1, y_tier_1, c='black', marker="1",
                             label="Tier1", s=100)
                plt.legend(loc='upper left')
                # plt.ylim([-2, 40])
                # plt.ylim([1, 9])
                # ax5.set_yscale('log')  # log scale
                # plt.colorbar(ax.imshow(image, interpolation='nearest'))
                # plt.scatter(x, y, c=var_df[POI_here[2]], cmap='rainbow', vmin=0.0, vmax=1.0)
                # ax10.colorbar()
                plt.grid(True)
                # cbar = plt.colorbar()
                # cbar.set_label('Reach', rotation=270)
                # plt.scatter(x, y, c=clean_clust_df[POI[2]], cmap=plt.cm.bwr_r)
                # cmap = sns.diverging_palette(5, 250, as_cmap=True)
                subfolder_now = '/02.01.graph-All-vs-Groups'
                # creating the folder if it does not exist
                directory = folder_RES + subfolder_now
                if not os.path.exists(directory):
                    os.makedirs(directory)
                name_fig10 = folder_RES + subfolder_now + '/Fig.Date-' + day + \
                    '.period-' + str(period) + '.IdxReach' + str(index_reach_now) + \
                    '-IdxRev' + str(index_rev_now) + '.png'
                plt.savefig(name_fig10, format='png')
                print('  ==> Figure now = ' + name_fig10)
                # plt.show()  # caution: it stops the flow of the program
                plt.draw()    # draws without stopping the flow of the program
                plt.clf()     # clear figure
                plt.close()   # close the figure window and continue
                del x_chosen
                del y_chosen
            #print('  End of cycle on the rows')

        print('')
        print('----------------------------------------------------------------------------')
        print('  Initialization for counting how many elements are in each of the groups'
              ' and how many of these are tier 1')
        num_sectors_reach = 4    # x
        num_sectors_log_rev = 4  # y
        # number of clients in each sector: first index = x = reach,
        # second index = y = log_rev
        num_client_in_group = [[0] * num_sectors_log_rev
                               for x in xrange(num_sectors_reach)]
        print('  Check: must be 4x4 null: '),
        print(num_client_in_group)
        # number of clients in each sector that are tier_1: first index = x = reach,
        # second index = y = log_rev
        num_client_in_group_tier1 = [[0] * num_sectors_log_rev
                                     for x in xrange(num_sectors_reach)]
        print('  Check: must be 4x4 null: '),
        print(num_client_in_group_tier1)

        print('')
        print('------------------------------------------')
        print('  Reading the elements in each group from RES-04.01 *03-Count')
        #print('  +++ Date now = ' + day)
        name_df = folder_input + '/RES.Date-' + day + '.period-' + str(
            period) + '.clients_id-idxColorReach-idxYRev.03-Count.csv'
        print('  name_df = ' + name_df)
        df_input_3 = pd.read_csv(name_df, na_values=['None'], skiprows=6,
                                 skip_blank_lines=True, thousands=',',
                                 index_col=False)
        print(df_input_3.head())
        #quit()

        for index, row in df_input_3.iterrows():
            index_of_reach_in_vector = df_input_3['index_reach'][index] - 1
            index_of_logrev_in_vector = df_input_3['index_revenues'][index] - 1
            num_client_in_group[index_of_reach_in_vector][index_of_logrev_in_vector] = \
                df_input_3['number_clients_in_sector'][index]
            num_client_in_group_tier1[index_of_reach_in_vector][index_of_logrev_in_vector] = \
                df_input_3['number_clients_tier1'][index]

        # for index, row in df_input.iterrows():
        #     # counting number of elements in each group and how many are tier 1
        #     # -> plot of heatmap
        #     # add 1
        #     index_of_reach_in_vector = df_input['index_quartile_reach'][index] - 1
        #     index_of_logrev_in_vector = df_input['index_quartile_log_rev_eur'][index] - 1
        #     num_client_in_group[index_of_reach_in_vector][index_of_logrev_in_vector] += 1
        #     # check
        #     if df_input['is_tier_1'][index] == 1:
        #         num_client_in_group_tier1[index_of_reach_in_vector][index_of_logrev_in_vector] += 1
        #     # print('print (index, index_of_reach_in_vector, index_of_logrev_in_vector,'
        #     #       ' df_input[is_tier_1][index]) = '),
        #     # print(index, index_of_reach_in_vector, index_of_logrev_in_vector,
        #     #       df_input['is_tier_1'][index])
        #     # print('num_client_in_group : '),
        #     # print(num_client_in_group)
        #     # print('num_client_in_group_tier1 : '),
        #     # print(num_client_in_group_tier1)
        #     # print('----')

        print('Final count:')
        print('num_client_in_group       : '),
        print(num_client_in_group)
        print('num_client_in_group_tier1 : '),
        print(num_client_in_group_tier1)
        del df_input_3

        print('')
        print('------------------------------------------')
        print('  Plotting the Heatmap of the number of clients')
        # create a dataframe to plot
        # https://stackoverflow.com/questions/12286607/python-making-heatmap-from-dataframe
        dfplot_index = [1, 2, 3, 4]
        dfplot_cols = [1, 2, 3, 4]
        dfplot = DataFrame(num_client_in_group, index=dfplot_index,
                           columns=dfplot_cols)
        print(dfplot.head())
        ax3 = sns.heatmap(dfplot, annot=True, fmt='g')
        # for t in ax3.texts: t.set_text(t.get_text() + " AllClients")  # annotate
        plt.title('Clients: Date ' + day + ', Period ' + str(period) +
                  '\nHow many clients in each sector?')
        plt.xlabel('Index of Log Revenue Euro')
        plt.yticks(rotation=0)
        plt.ylabel('Index of Reach')
        plt.yticks(rotation=0)
        plt.tight_layout()
        subfolder_now = '/03.01.heatmap-groups-all'
        # creating the folder if it does not exist
        directory = folder_RES + subfolder_now
        if not os.path.exists(directory):
            os.makedirs(directory)
        fig_name = folder_RES + subfolder_now + '/Heatmap-01-AllClients.Date-' + day + \
            '.period-' + str(period) + '.png'
        print('  ==> Figure = ' + fig_name)
        plt.savefig(fig_name)
        plt.clf()
        del dfplot  # clean dataframe per plot

        print('')
        print('----------------------------------------------------------------------------')
        print('  Plotting Graph with numbers of clients that are classified as tier1')
        # create a dataframe to plot
        # https://stackoverflow.com/questions/12286607/python-making-heatmap-from-dataframe
        dfplot2_index = [1, 2, 3, 4]
        dfplot2_cols = [1, 2, 3, 4]
        dfplot2 = DataFrame(num_client_in_group_tier1, index=dfplot2_index,
                            columns=dfplot2_cols)
        print(dfplot2.head())
        ax2 = sns.heatmap(dfplot2, annot=True, fmt='g')
        # for t in ax2.texts: t.set_text(t.get_text() + " Tier1")  # annotate
        # NOTE: x and y are reversed here wrt the graphs
        plt.title('Clients: Date ' + day + ', Period ' + str(period) +
                  '\nHow many clients in each sector are classified as Tier1 by Criteo?')
        plt.xlabel('Index of Log Revenue Euro')
        plt.yticks(rotation=0)
        plt.ylabel('Index of Reach')
        plt.yticks(rotation=0)
        plt.tight_layout()
        subfolder_now = '/04.01.heatmap-groups-tier-1-absolute'
        # creating the folder if it does not exist
        directory = folder_RES + subfolder_now
        if not os.path.exists(directory):
            os.makedirs(directory)
        fig_name2 = folder_RES + subfolder_now + '/Heatmap-02-Tier1Abs.Date-' + day + \
            '.period-' + str(period) + '.png'
        print('  ==> Figure = ' + fig_name2)
        plt.savefig(fig_name2)
        plt.clf()
        del dfplot2

        print('')
        print('----------------------------------------------------------------------------')
        print('  Plotting Graph with % of clients that are tier 1 in each sector')
        # create a dataframe to plot
        # https://stackoverflow.com/questions/12286607/python-making-heatmap-from-dataframe
        dfplot3_index = [1, 2, 3, 4]
        dfplot3_cols = [1, 2, 3, 4]
        # compute the percentage
        perc_clients_tier_1 = [[0] * num_sectors_log_rev
                               for x in xrange(num_sectors_reach)]
        #print(perc_clients_tier_1)
        for i_reach in range(0, 4):
            for j_rev in range(0, 4):
                if num_client_in_group[i_reach][j_rev] == 0:
                    perc_clients_tier_1[i_reach][j_rev] = 0.0
                else:
                    perc_clients_tier_1[i_reach][j_rev] = \
                        float(num_client_in_group_tier1[i_reach][j_rev]) * 100.0 / \
                        float(num_client_in_group[i_reach][j_rev])
        #print(perc_clients_tier_1)
        # quit()
        dfplot3 = DataFrame(perc_clients_tier_1, index=dfplot3_index,
                            columns=dfplot3_cols)
        print(dfplot3.head())
        ax3 = sns.heatmap(dfplot3, annot=True, fmt='.0f', vmin=0, vmax=100)  # fmt='g')
        for t in ax3.texts:
            t.set_text(t.get_text() + " %")  # add percentage in notation
        # NOTE: x and y are reversed here wrt the graphs
        plt.title('Clients: Date ' + day + ', Period ' + str(period) +
                  '\nWhat percentage of clients in each sector is classified'
                  ' as Tier1 by Criteo?')
        plt.xlabel('Index of Log Revenue Euro')
        plt.yticks(rotation=0)
        plt.ylabel('Index of Reach')
        plt.yticks(rotation=0)
        plt.tight_layout()
        subfolder_now = '/05.01.heatmap-groups-tier-1-percentage'
        # creating the folder if it does not exist
        directory = folder_RES + subfolder_now
        if not os.path.exists(directory):
            os.makedirs(directory)
        fig_name3 = folder_RES + subfolder_now + '/Heatmap-03-Tier1Perc.Date-' + day + \
            '.period-' + str(period) + '.png'
        print('  ==> Figure = ' + fig_name3)
        plt.savefig(fig_name3)
        plt.clf()
        del dfplot3

        print('')
        print('----------------------------------------')
        print('  Cleaning the dataframe for this period')
        del df_input
        #quit()  # 1 date only

        # cycles
        start = start - timedelta(days=period)
        day = start.strftime('%Y-%m-%d')

    # clean the df
    #del clean_clust_df
    quit()
from pandas import DataFrame

def add_state_names(my_df):
    ##breakpoint()
    new_df = my_df.copy()
    names_map = {"CA": "Cali", "CO": "Colo", "CT": "Conn"}
    new_df["name"] = new_df["abbrev"].map(names_map)
    return new_df

if __name__ == "__main__":
    df = DataFrame({"abbrev": ["CA", "CO", "CT", "DC", "TX"]})
    #breakpoint()
    print(df.columns)  # property
    print(df.head())   # method
    df2 = add_state_names(df)
    print(df2.head())
    #df3 = DataFrame({"a": [1, 2, 3, 4]})
    #print(df3.head())
print_df(df.isnull().sum())

# add a column with the average score
df['평균'] = df.mean(axis=1)

conditions = [
    (df['평균'] >= 90),
    (df['평균'] >= 80),
    (df['평균'] >= 70),
    (df['평균'] < 70),
]
grade = ['A', 'B', 'C', 'F']
df['학점'] = numpy.select(conditions, grade)
print_df(df.head(5))

# save the result as csv
NT = dt.datetime.now().strftime("%y%m%d_%H%M%S")
filename = "grade" + NT + ".csv"
# index label: Name; header: Korean, English, Math, Science, Average, Grade
df.to_csv(filename, encoding='euc-kr', na_rep='NaN', index_label='이름',
          header=['국', '영', '수', '과', '평균', '학점'])

# data visualization
cnt = df['학점'].value_counts()
result_df = DataFrame(cnt)
print_df(result_df)
print BabyDataSet[:10]

df = DataFrame(data=BabyDataSet, columns=["Names", "Births"])
print df[:10]

df.to_csv('births1880.txt', index=False, header=False)

Location = r'births1880.txt'
df = read_csv(Location)
print df
print df.head()

df = read_csv(Location, header=None)
print df
print df.tail()

df = read_csv(Location, names=['Names', 'Births'])
print df.head()

import os
os.remove(Location)
def editor(interrogation, operation=None, denominator=False, sort_by=False,
           keep_stats=False, keep_top=False, just_totals=False,
           threshold='medium', just_entries=False, skip_entries=False,
           span_entries=False, merge_entries=False, just_subcorpora=False,
           skip_subcorpora=False, span_subcorpora=False,
           merge_subcorpora=False, replace_names=False,
           replace_subcorpus_names=False, projection=False,
           remove_above_p=False, p=0.05, print_info=False, spelling=False,
           selfdrop=True, calc_all=True, keyword_measure='ll', **kwargs):
    """
    See corpkit.interrogation.Interrogation.edit() for docstring
    """
    # grab arguments, in case we get dict input and have to iterate
    locs = locals()

    import corpkit
    import re
    import collections
    import pandas as pd
    import numpy as np
    from pandas import DataFrame, Series
    from time import localtime, strftime

    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        have_ipython = False
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass
    # new ipython error
    except AttributeError:
        have_ipython = False
        pass

    # to use if we also need to worry about concordance lines
    return_conc = False

    from corpkit.interrogation import Interrodict, Interrogation, Concordance
    if interrogation.__class__ == Interrodict:
        locs.pop('interrogation', None)
        from collections import OrderedDict
        outdict = OrderedDict()
        for i, (k, v) in enumerate(interrogation.items()):
            # only print the first time around
            if i != 0:
                locs['print_info'] = False

            if isinstance(denominator, STRINGTYPE) and denominator.lower() == 'self':
                denominator = interrogation

            # if df2 is also a dict, get the relevant entry
            if isinstance(denominator, (dict, Interrodict)):
                #if sorted(set([i.lower() for i in list(dataframe1.keys())])) == \
                #   sorted(set([i.lower() for i in list(denominator.keys())])):
                #    locs['denominator'] = denominator[k]

                # fix: this repeats itself for every key, when it doesn't need to
                # denominator_sum:
                if kwargs.get('denominator_sum'):
                    locs['denominator'] = denominator.collapse(axis='key')

                if kwargs.get('denominator_totals'):
                    locs['denominator'] = denominator[k].totals
                else:
                    locs['denominator'] = denominator[k].results

            outdict[k] = v.results.edit(**locs)
        if print_info:
            thetime = strftime("%H:%M:%S", localtime())
            print("\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n"
                  % (thetime, "'\n         '".join(sorted(outdict.keys()))))
        return Interrodict(outdict)

    elif isinstance(interrogation, (DataFrame, Series)):
        dataframe1 = interrogation
    elif isinstance(interrogation, Interrogation):
        #if interrogation.__dict__.get('concordance', None) is not None:
        #    concordances = interrogation.concordance
        branch = kwargs.pop('branch', 'results')
        if branch.lower().startswith('r'):
            dataframe1 = interrogation.results
        elif branch.lower().startswith('t'):
            dataframe1 = interrogation.totals
        elif branch.lower().startswith('c'):
            dataframe1 = interrogation.concordance
            return_conc = True
        else:
            dataframe1 = interrogation.results
    elif isinstance(interrogation, Concordance) or \
            all(x in list(dataframe1.columns) for x in ['l', 'm', 'r']):
        return_conc = True
        print('heree')
        dataframe1 = interrogation
    # hope for the best
    else:
        dataframe1 = interrogation

    the_time_started = strftime("%Y-%m-%d %H:%M:%S")

    pd.options.mode.chained_assignment = None

    try:
        from process import checkstack
    except ImportError:
        from corpkit.process import checkstack

    if checkstack('pythontex'):
        print_info = False

    def combiney(df, df2, operation='%', threshold='medium', prinf=True):
        """
        Mash df and df2 together in appropriate way
        """
        totals = False
        # delete under threshold
        if just_totals:
            if using_totals:
                if not single_totals:
                    to_drop = list(df2[df2['Combined total'] < threshold].index)
                    df = df.drop([e for e in to_drop if e in list(df.index)])
                    if prinf:
                        to_show = []
                        [to_show.append(w) for w in to_drop[:5]]
                        if len(to_drop) > 10:
                            to_show.append('...')
                            [to_show.append(w) for w in to_drop[-5:]]
                        if len(to_drop) > 0:
                            print('Removing %d entries below threshold:\n    %s'
                                  % (len(to_drop), '\n    '.join(to_show)))
                            if len(to_drop) > 10:
                                print('... and %d more ... \n'
                                      % (len(to_drop) - len(to_show) + 1))
                        else:
                            print('')
            else:
                denom = df2
        else:
            denom = list(df2)

        if single_totals:
            if operation == '%':
                totals = df.sum() * 100.0 / float(df.sum().sum())
                df = df * 100.0
                try:
                    df = df.div(denom, axis=0)
                except ValueError:
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)
            elif operation == '+':
                try:
                    df = df.add(denom, axis=0)
                except ValueError:
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)
            elif operation == '-':
                try:
                    df = df.sub(denom, axis=0)
                except ValueError:
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)
            elif operation == '*':
                totals = df.sum() * float(df.sum().sum())
                try:
                    df = df.mul(denom, axis=0)
                except ValueError:
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)
            elif operation == '/':
                try:
                    totals = df.sum() / float(df.sum().sum())
                    df = df.div(denom, axis=0)
                except ValueError:
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)
            elif operation == 'a':
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis=1) / df2
            elif operation.startswith('c'):
                import warnings
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    df = pd.concat([df, df2], axis=1)  # was pandas.concat: undefined name
            return df, totals

        elif not single_totals:
            if not operation.startswith('a'):
                # generate totals
                if operation == '%':
                    totals = df.sum() * 100.0 / float(df2.sum().sum())
                if operation == '*':
                    totals = df.sum() * float(df2.sum().sum())
                if operation == '/':
                    totals = df.sum() / float(df2.sum().sum())
                if operation.startswith('c'):
                    # add here the info that merging will not work
                    # with identical colnames
                    import warnings
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        d = pd.concat([df.T, df2.T])
                        # make index nums
                        d = d.reset_index()
                        # sum and remove duplicates
                        d = d.groupby('index').sum()
                        dx = d.reset_index('index')
                        dx.index = list(dx['index'])
                        df = dx.drop('index', axis=1).T

                def editf(datum):
                    meth = {'%': datum.div,
                            '*': datum.mul,
                            '/': datum.div,
                            '+': datum.add,
                            '-': datum.sub}

                    if datum.name in list(df2.columns):
                        method = meth[operation]
                        mathed = method(df2[datum.name], fill_value=0.0)
                        if operation == '%':
                            return mathed * 100.0
                        else:
                            return mathed
                    else:
                        return datum * 0.0

                df = df.apply(editf)
            else:
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis=1) / df2.T.sum()

        return df, totals

    def skip_keep_merge_span(df):
        """
        Do all skipping, keeping, merging and spanning
        """
        from corpkit.dictionaries.process_types import Wordlist
        if skip_entries:
            if isinstance(skip_entries, (list, Wordlist)):
                df = df.drop(list(skip_entries), axis=1, errors='ignore')
            else:
                df = df.loc[:, ~df.columns.str.contains(skip_entries)]
        if just_entries:
            if isinstance(just_entries, (list, Wordlist)):
                je = [i for i in list(just_entries) if i in list(df.columns)]
                df = df[je]
            else:
                df = df.loc[:, df.columns.str.contains(just_entries)]
        if merge_entries:
            for newname, crit in merge_entries.items():
                if isinstance(crit, (list, Wordlist)):
                    crit = [i for i in list(crit) if i in list(df.columns)]
                    cr = [i for i in list(crit) if i in list(df.columns)]
                    summed = df[cr].sum(axis=1)
                    df = df.drop(list(cr), axis=1, errors='ignore')
                else:
                    summed = df.loc[:, df.columns.str.contains(crit)].sum(axis=1)
                    df = df.loc[:, ~df.columns.str.contains(crit)]
                df.insert(0, newname, summed, allow_duplicates=True)
        if span_entries:
            df = df.iloc[:, span_entries[0]:span_entries[1]]
        if skip_subcorpora:
            if isinstance(skip_subcorpora, (list, Wordlist)):
                df = df.drop(list(skip_subcorpora), axis=0, errors='ignore')
            else:
                df = df[~df.index.str.contains(skip_subcorpora)]
        if just_subcorpora:
            if isinstance(just_subcorpora, (list, Wordlist)):
                js = [i for i in list(just_subcorpora) if i in list(df.index)]
                df = df.loc[js]
            else:
                df = df[df.index.str.contains(just_subcorpora)]
        if merge_subcorpora:
            df = df.T
            for newname, crit in merge_subcorpora.items():
                if isinstance(crit, (list, Wordlist)):
                    crit = [i for i in list(crit) if i in list(df.columns)]
                    summed = df[list(crit)].sum(axis=1)
                    df = df.drop(list(crit), axis=1, errors='ignore')
                else:
                    summed = df.loc[:, df.columns.str.contains(crit)].sum(axis=1)
                    df = df.loc[:, ~df.columns.str.contains(crit)]
                df.insert(0, newname, summed, allow_duplicates=True)
            df = df.T
        if span_subcorpora:
            df = df.iloc[span_subcorpora[0]:span_subcorpora[1], :]
        return df

    def parse_input(df, the_input):
        """turn whatever has been passed in into a list of words that can
        be used as pandas indices---maybe a bad way to go about it"""
        parsed_input = False
        import re
        if the_input == 'all':
            the_input = r'.*'
        if isinstance(the_input, int):
            try:
                the_input = str(the_input)
            except:
                pass
            the_input = [the_input]
        elif isinstance(the_input, STRINGTYPE):
            regex = re.compile(the_input)
            parsed_input = [w for w in list(df) if re.search(regex, w)]
            return parsed_input
        from corpkit.dictionaries.process_types import Wordlist
        if isinstance(the_input, Wordlist) or the_input.__class__ == Wordlist:
            the_input = list(the_input)
        if isinstance(the_input, list):
            if isinstance(the_input[0], int):
                parsed_input = [word for index, word in enumerate(list(df))
                                if index in the_input]
            elif isinstance(the_input[0], STRINGTYPE):
                try:
                    parsed_input = [word for word in the_input
                                    if word in df.columns]
                except AttributeError:  # if series
                    parsed_input = [word for word in the_input
                                    if word in df.index]
        return parsed_input

    def synonymise(df, pos='n'):
        """pass a df and a pos and convert df columns to most common synonyms"""
        from nltk.corpus import wordnet as wn
        #from dictionaries.taxonomies import taxonomies
        from collections import Counter
        fixed = []
        for w in list(df.columns):
            try:
                # collect the lemma names of every synset for w, then keep
                # the most common one (the original loop shadowed its own
                # variables and never populated the list)
                syns = []
                for synset in wn.synsets(w, pos=pos):
                    for lemma in synset.lemma_names():
                        syns.append(lemma)
                top_syn = Counter(syns).most_common(1)[0][0]
                fixed.append(top_syn)
            except:
                fixed.append(w)
        df.columns = fixed
        return df

    def convert_spell(df, convert_to='US', print_info=print_info):
        """turn dataframes into us/uk spelling"""
        from dictionaries.word_transforms import usa_convert
        if print_info:
            print('Converting spelling ... \n')
        if convert_to == 'UK':
            usa_convert = {v: k for k, v in list(usa_convert.items())}
        fixed = []
        for val in list(df.columns):
            try:
                fixed.append(usa_convert[val])
            except:
                fixed.append(val)
        df.columns = fixed
        return df

    def merge_duplicates(df, print_info=print_info):
        if print_info:
            print('Merging duplicate entries ... \n')
        # now we have to merge all duplicates
        for dup in df.columns.get_duplicates():
            #num_dupes = len(list(df[dup].columns))
            temp = df[dup].sum(axis=1)
            #df = df.drop([dup for d in range(num_dupes)], axis=1)
            df = df.drop(dup, axis=1)
            df[dup] = temp
        return df

    def name_replacer(df, replace_names, print_info=print_info):
        """replace entry names and merge"""
        import re
        # get input into list of tuples
        # if it's a string, we want to delete it
        if isinstance(replace_names, STRINGTYPE):
            replace_names = [(replace_names, '')]
        # this is for some malformed list
        if not isinstance(replace_names, dict):
            if isinstance(replace_names[0], STRINGTYPE):
                replace_names = [replace_names]
        # if dict, make into list of tuples
        if isinstance(replace_names, dict):
            replace_names = [(v, k) for k, v in replace_names.items()]
        for to_find, replacement in replace_names:
            if print_info:
                if replacement:
                    print('Replacing "%s" with "%s" ...\n' % (to_find, replacement))
                else:
                    print('Deleting "%s" from entry names ...\n' % to_find)
            to_find = re.compile(to_find)
            if not replacement:
                replacement = ''
            df.columns = [re.sub(to_find, replacement, l)
                          for l in list(df.columns)]
        df = merge_duplicates(df, print_info=False)
        return df

    def newname_getter(df, parsed_input, newname='combine', prinf=True,
                       merging_subcorpora=False):
        """makes appropriate name for merged entries"""
        if merging_subcorpora:
            if newname is False:
                newname = 'combine'
        if isinstance(newname, int):
            the_newname = list(df.columns)[newname]
        elif isinstance(newname, STRINGTYPE):
            if newname == 'combine':
                if len(parsed_input) <= 3:
                    the_newname = '/'.join(parsed_input)
                elif len(parsed_input) > 3:
                    the_newname = '/'.join(parsed_input[:3]) + '...'
            else:
                the_newname = newname
        if not newname:
            # revise this code
            import operator
            sumdict = {}
            for item in parsed_input:
                summed = sum(list(df[item]))
                sumdict[item] = summed
            the_newname = max(iter(sumdict.items()),
                              key=operator.itemgetter(1))[0]
        if not isinstance(the_newname, STRINGTYPE):
            the_newname = str(the_newname, errors='ignore')
        return the_newname

    def projector(df, list_of_tuples, prinf=True):
        """project abs values"""
        if isinstance(list_of_tuples, list):
            tdict = {}
            for a, b in list_of_tuples:
                tdict[a] = b
            list_of_tuples = tdict
        for subcorpus, projection_value in list(list_of_tuples.items()):
            if isinstance(subcorpus, int):
                subcorpus = str(subcorpus)
            df.ix[subcorpus] = df.ix[subcorpus] * projection_value
            if prinf:
                if isinstance(projection_value, float):
                    print('Projection: %s * %s' % (subcorpus, projection_value))
                if isinstance(projection_value, int):
                    print('Projection: %s * %d' % (subcorpus, projection_value))
        if prinf:
            print('')
        return df

    def lingres(ser, index):
        from scipy.stats import linregress
        from pandas import Series
        ix = ['slope', 'intercept', 'r', 'p', 'stderr']
        return Series(linregress(index, ser.values), index=ix)

    def do_stats(df):
        """do linregress and add to df"""
        try:
            from scipy.stats import linregress
        except ImportError:
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: sort type not available in this version of corpkit.' % thetime)
            return False

        indices = list(df.index)
        first_year = list(df.index)[0]
        try:
            x = [int(y) - int(first_year) for y in indices]
        except ValueError:
            x = list(range(len(indices)))
        stats = df.apply(lingres, axis=0, index=x)
        df = df.append(stats)
        df = df.replace([np.inf, -np.inf], 0.0)
        return df

    def resort(df, sort_by=False, keep_stats=False):
        """
        Sort results, potentially using scipy's linregress
        """
        # translate options and make sure they are parseable
        stat_field = ['slope', 'intercept', 'r', 'p', 'stderr']
        easy_sorts = ['total', 'infreq', 'name', 'most', 'least', 'reverse']
        stat_sorts = ['increase', 'decrease', 'static', 'turbulent']
        options = stat_field + easy_sorts + stat_sorts
        sort_by_convert = {'most': 'total', True: 'total', 'least': 'infreq'}
        sort_by = sort_by_convert.get(sort_by, sort_by)

        # probably broken :(
        if just_totals:
            if sort_by == 'name':
                return df.sort_index()
            else:
                return df.sort_values(by='Combined total',
                                      ascending=sort_by != 'total', axis=1)

        stats_done = False
        if keep_stats or sort_by in stat_field + stat_sorts:
            df = do_stats(df)
            stats_done = True
        if isinstance(df, bool):
            if df is False:
                return False

        if isinstance(df, Series):
            if stats_done:
                stats = df.ix[range(-5, 0)]
                df = df.drop(list(stats.index))
            if sort_by == 'name':
                df = df.sort_index()
            elif sort_by == 'reverse':
                df = df[::-1]
            else:
                df = df.sort_values(ascending=sort_by != 'total')
            if stats_done:
                df = df.append(stats)
            return df

        if sort_by == 'name':
            # currently case sensitive
            df = df.reindex_axis(sorted(df.columns), axis=1)
        elif sort_by in ['total', 'infreq']:
            if df1_istotals:
                df = df.T
            df = df[list(df.sum().sort_values(ascending=sort_by != 'total').index)]
        elif sort_by == 'reverse':
            df = df.T[::-1].T

        # sort by slope etc., or search by subcorpus name
        if sort_by in stat_field or sort_by not in options:
            asc = kwargs.get('reverse', False)
            df = df.T.sort_values(by=sort_by, ascending=asc).T

        if sort_by in ['increase', 'decrease', 'static', 'turbulent']:
            slopes = df.ix['slope']
            if sort_by == 'increase':
                df = df[slopes.argsort()[::-1]]
            elif sort_by == 'decrease':
                df = df[slopes.argsort()]
            elif sort_by == 'static':
                df = df[slopes.abs().argsort()]
            elif sort_by == 'turbulent':
                df = df[slopes.abs().argsort()[::-1]]
        if remove_above_p:
            df = df.T
            df = df[df['p'] <= p]
            df = df.T

        # remove stats field by default
        if not keep_stats:
            df = df.drop(stat_field, axis=0, errors='ignore')
        return df

    def set_threshold(big_list, threshold, prinf=True):
        if isinstance(threshold, STRINGTYPE):
            if threshold.startswith('l'):
                denominator = 10000
            if threshold.startswith('m'):
                denominator = 5000
            if threshold.startswith('h'):
                denominator = 2500
            if isinstance(big_list, DataFrame):
                tot = big_list.sum().sum()
            if isinstance(big_list, Series):
                tot = big_list.sum()
            tshld = float(tot) / float(denominator)
        else:
            tshld = threshold
        if prinf:
            print('Threshold: %d\n' % tshld)
        return tshld

    # copy the dataframe to be very safe
    df = dataframe1.copy()
    # make cols into strings
    try:
        df.columns = [str(c) for c in list(df.columns)]
    except:
        pass

    if operation is None:
        operation = 'None'

    if isinstance(interrogation, Concordance):
        return_conc = True
    # do concordance work
    if return_conc:
        if just_entries:
            if isinstance(just_entries, int):
                just_entries = [just_entries]
            if isinstance(just_entries, STRINGTYPE):
                df = df[df['m'].str.contains(just_entries)]
            if isinstance(just_entries, list):
                if all(isinstance(e, STRINGTYPE) for e in just_entries):
                    mp = df['m'].map(lambda x: x in just_entries)
                    df = df[mp]
                else:
                    df = df.ix[just_entries]

        if skip_entries:
            if isinstance(skip_entries, int):
skip_entries = [skip_entries] if isinstance(skip_entries, STRINGTYPE): df = df[~df['m'].str.contains(skip_entries)] if isinstance(skip_entries, list): if all(isinstance(e, STRINGTYPE) for e in skip_entries): mp = df['m'].map(lambda x: x not in skip_entries) df = df[mp] else: df = df.drop(skip_entries, axis=0) if just_subcorpora: if isinstance(just_subcorpora, int): just_subcorpora = [just_subcorpora] if isinstance(just_subcorpora, STRINGTYPE): df = df[df['c'].str.contains(just_subcorpora)] if isinstance(just_subcorpora, list): if all(isinstance(e, STRINGTYPE) for e in just_subcorpora): mp = df['c'].map(lambda x: x in just_subcorpora) df = df[mp] else: df = df.ix[just_subcorpora] if skip_subcorpora: if isinstance(skip_subcorpora, int): skip_subcorpora = [skip_subcorpora] if isinstance(skip_subcorpora, STRINGTYPE): df = df[~df['c'].str.contains(skip_subcorpora)] if isinstance(skip_subcorpora, list): if all(isinstance(e, STRINGTYPE) for e in skip_subcorpora): mp = df['c'].map(lambda x: x not in skip_subcorpora) df = df[mp] else: df = df.drop(skip_subcorpora, axis=0) return Concordance(df) if print_info: print('\n***Processing results***\n========================\n') df1_istotals = False if isinstance(df, Series): df1_istotals = True df = DataFrame(df) # if just a single result else: df = DataFrame(df) if operation.startswith('k'): if sort_by is False: if not df1_istotals: sort_by = 'turbulent' if df1_istotals: df = df.T # figure out if there's a second list # copy and remove totals if there is single_totals = True using_totals = False outputmode = False if denominator.__class__ == Interrogation: try: denominator = denominator.results except AttributeError: denominator = denominator.totals if denominator is not False and not isinstance(denominator, STRINGTYPE): df2 = denominator.copy() using_totals = True if isinstance(df2, DataFrame): if len(df2.columns) > 1: single_totals = False else: df2 = Series(df2.iloc[:, 0]) elif isinstance(df2, Series): single_totals = True #if operation == 'k': #raise ValueError('Keywording requires a DataFrame for denominator. Use "self"?') else: if operation in ['k', 'a', '%', '/', '*', '-', '+']: denominator = 'self' if denominator == 'self': outputmode = True if operation.startswith('a') or operation.startswith('A'): if list(df.columns)[0] != '0' and list(df.columns)[0] != 0: df = df.T if using_totals: if not single_totals: df2 = df2.T if projection: # projection shouldn't do anything when working with '%', remember. 
df = projector(df, projection) if using_totals: df2 = projector(df2, projection) if spelling: df = convert_spell(df, convert_to=spelling) df = merge_duplicates(df, print_info=False) if not single_totals: df2 = convert_spell(df2, convert_to=spelling, print_info=False) df2 = merge_duplicates(df2, print_info=False) if not df1_istotals: sort_by = 'total' if replace_names: df = name_replacer(df, replace_names) df = merge_duplicates(df) if not single_totals: df2 = name_replacer(df2, replace_names, print_info=False) df2 = merge_duplicates(df2, print_info=False) if not sort_by: sort_by = 'total' if replace_subcorpus_names: df = name_replacer(df.T, replace_subcorpus_names) df = merge_duplicates(df).T df = df.sort_index() if not single_totals: if isinstance(df2, DataFrame): df2 = df2.T df2 = name_replacer(df2, replace_subcorpus_names, print_info=False) df2 = merge_duplicates(df2, print_info=False) if isinstance(df2, DataFrame): df2 = df2.T df2 = df2.sort_index() if not sort_by: sort_by = 'total' # remove old stats if they're there: statfields = ['slope', 'intercept', 'r', 'p', 'stderr'] try: df = df.drop(statfields, axis=0) except: pass if using_totals: try: df2 = df2.drop(statfields, axis=0) except: pass # remove totals and tkinter order for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]): if name == 'Total' and df1_istotals: continue try: df = df.drop(name, axis=ax, errors='ignore') except: pass for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]): if name == 'Total' and single_totals: continue try: df2 = df2.drop(name, axis=ax, errors='ignore') except: pass df = skip_keep_merge_span(df) try: df2 = skip_keep_merge_span(df2) except: pass # drop infinites and nans df = df.replace([np.inf, -np.inf], np.nan) df = df.fillna(0.0) if just_totals: df = DataFrame(df.sum(), columns=['Combined total']) if using_totals: if not single_totals: df2 = DataFrame(df2.sum(), columns=['Combined total']) else: df2 = df2.sum() tots = df.sum(axis=1) if using_totals or outputmode: if not operation.startswith('k'): tshld = 0 # set a threshold if just_totals if outputmode is True: df2 = df.T.sum() if not just_totals: df2.name = 'Total' else: df2.name = 'Combined total' using_totals = True single_totals = True if just_totals: if not single_totals: tshld = set_threshold(df2, threshold, prinf=print_info) df, tots = combiney(df, df2, operation=operation, threshold=tshld, prinf=print_info) # if doing keywording... 
if operation.startswith('k'):
        if isinstance(denominator, STRINGTYPE):
            if denominator == 'self':
                df2 = df.copy()
            else:
                df2 = denominator
        from corpkit.keys import keywords
        df = keywords(df, df2,
                      selfdrop=selfdrop,
                      threshold=threshold,
                      print_info=print_info,
                      editing=True,
                      calc_all=calc_all,
                      sort_by=sort_by,
                      measure=keyword_measure,
                      **kwargs)

    # drop infinites and nans
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0.0)

    # resort data
    if sort_by or keep_stats:
        df = resort(df, keep_stats=keep_stats, sort_by=sort_by)
        if isinstance(df, bool):
            if df is False:
                return 'linregress'

    if keep_top:
        if not just_totals:
            df = df[list(df.columns)[:keep_top]]
        else:
            df = df.head(keep_top)

    if just_totals:
        # turn just_totals into series:
        df = Series(df['Combined total'], name='Combined total')

    if df1_istotals:
        if operation.startswith('k'):
            try:
                df = Series(df.ix[dataframe1.name])
                df.name = '%s: keyness' % df.name
            except:
                df = df.iloc[0, :]
                df.name = 'keyness'

    # generate totals branch if not percentage results:
    # fix me
    if df1_istotals or operation.startswith('k'):
        if not just_totals:
            try:
                total = Series(df['Total'], name='Total')
            except:
                total = 'none'
            #total = df.copy()
        else:
            total = 'none'
    else:
        # might be wrong if using division or something...
        try:
            total = df.T.sum(axis=1)
        except:
            total = 'none'

    if not isinstance(tots, DataFrame) and not isinstance(tots, Series):
        total = df.sum(axis=1)
    else:
        total = tots

    if isinstance(df, DataFrame):
        if df.empty:
            datatype = 'object'
        else:
            datatype = df.iloc[0].dtype
    else:
        datatype = df.dtype
    locs['datatype'] = datatype

    # TURN INT COL NAMES INTO STR
    try:
        df.results.columns = [str(d) for d in list(df.results.columns)]
    except:
        pass

    def add_tkt_index(df):
        """add an order for tkintertable if using gui"""
        if isinstance(df, Series):
            df = df.T
            df = df.drop('tkintertable-order', errors='ignore', axis=0)
            df = df.drop('tkintertable-order', errors='ignore', axis=1)
            dat = [i for i in range(len(df.index))]
            df['tkintertable-order'] = Series(dat, index=list(df.index))
            df = df.T
        return df

    # while tkintertable can't sort rows
    if checkstack('tkinter'):
        df = add_tkt_index(df)

    if kwargs.get('df1_always_df'):
        if isinstance(df, Series):
            df = DataFrame(df)

    # delete non-appearing conc lines
    lns = None
    if isinstance(getattr(interrogation, 'concordance', None), Concordance):
        try:
            col_crit = interrogation.concordance['m'].map(
                lambda x: x in list(df.columns))
            ind_crit = interrogation.concordance['c'].map(
                lambda x: x in list(df.index))
            lns = interrogation.concordance[col_crit]
            lns = lns.loc[ind_crit]
            lns = Concordance(lns)
        except ValueError:
            lns = None

    output = Interrogation(results=df, totals=total,
                           query=locs, concordance=lns)

    if print_info:
        print('***Done!***\n========================\n')

    return output
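# For orientation, a minimal sketch (toy values, not from the source) of what
# the '%' branch of combiney computes when single_totals is true: scale by
# 100, then divide row-wise by the denominator series.
import pandas as pd
results = pd.DataFrame({'cat': [2, 4], 'dog': [1, 5]}, index=['1990', '2000'])
denom = pd.Series([10, 20], index=['1990', '2000'])
relative = (results * 100.0).div(denom, axis=0)
print(relative)  # cat -> 20.0, 20.0; dog -> 10.0, 25.0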
X=iris.data ##### Now we will apply kmeans for each value of k from 1 to 10 # In[195]: KM=[kmeans(X,k) for k in K] print type(KM),len(KM) # In[196]: KM_df=DataFrame(KM) print KM_df.head(1) # In[197]: print KM_df.tail(1) # In[198]: KM_df.shape # In[199]: KM_v1=KM_df[0]
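# The cells above lean on scipy's kmeans, which returns a (centroids,
# distortion) pair per k; a self-contained sketch of the same elbow-method
# setup (K = range(1, 11) is assumed from the "k from 1 to 10" comment):
from scipy.cluster.vq import kmeans, whiten
from sklearn.datasets import load_iris
X = whiten(load_iris().data)  # scipy's kmeans expects whitened observations
K = range(1, 11)
distortions = [kmeans(X, k)[1] for k in K]
print(distortions)  # roughly decreasing; the "elbow" suggests a good k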
"2011-04-12 10:30:00.0000000", "2012-04-12 10:30:00.0000000") # table, timestamps, columns, data = service.sample_data('MAC000246', '2012-04-12 10:30:00.0000000', '2012-05-12 10:30:00.0000000') # table, timestamps, lables, data = service.hhourly_reading("MAC000246", "2011-04-12 10:30:00.0000000", "2012-04-12 10:30:00.0000000") # Solar # table, timestamps, lables, data = service.get_readings("control", "", "2015-04-15 18:00:00", "2015-04-15 19:16:18") table, timestamps, lables, data = service.parse_response( res, DailyReadingSchema()) sample = DataFrame(data=data, columns=lables) sample['timestamp'] = timestamps print(sample.describe()) print(sample.head()) sample.tss.format_index(timestamps, INFLUX_TS_FMT) subsample = sample[2:4] print(len(subsample)) sample.tss.day_of_week_class() # stationality = sample.tss.stationality('energy_mean') # # auto_corr = sample.tss.autocorrelation('energy_mean') stationality = sample.tss.stationality('power') auto_corr = sample.tss.autocorrelation('power')
import pandas as pd
from pandas import DataFrame
import xlrd  # needed when reading xls files
import numpy as np
import sqlite3

# build a DataFrame
smp = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
       'year': [2000, 2001, 2002, 2001, 2002],
       'pop': [1.5, 1.6, 1.7, 3.5, 4.3]}
frame = DataFrame(smp)

# accessing DataFrame elements
frame.year     # frame$year
frame['year']  # frame$year
frame.head()   # head
frame.tail()   # tail
frame2 = DataFrame(
    smp, index=['one', 'two', 'three', 'four', 'five'])  # add an index
frame2.ix['one']
frame2.describe()  # summary
print(frame2.describe())

# load the data
data = pd.read_csv('stock_px.csv')
print(data)
xlsx_file = pd.ExcelFile('stock_px.xlsx')  # needs openpyxl installed; xls also works
xlsx_file.sheet_names
data = xlsx_file.parse('stock_px')
print(data)
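# Note: frame2.ix works only on older pandas; .ix was removed in pandas 1.0.
# The label- and position-based equivalents on current versions are:
frame2.loc['one']  # label-based, same row as frame2.ix['one']
frame2.iloc[0]     # position-based access to the first row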
# Create dataframe aonao = DataFrame({'AO': AO, 'NAO': NAO}) # Plot data aonao.plot(subplots=True) # Reference data by column name or method of dataframe variable print(aonao['NAO']) print(aonao.NAO) # Add column to dataframe aonao['Diff'] = aonao['AO'] - aonao['NAO'] # Show first several lines of new dataframe print(aonao.head()) # Remove column from dataframe del aonao['Diff'] # Show last few lines of dataframe print(aonao.tail()) # Show slice from dataframe print(aonao['1981-01':'1981-03']) # Complex indexing example import datetime aonao.loc[(aonao.AO > 0) & (aonao.NAO < 0) & (aonao.index > datetime.datetime(1980, 1, 1)) &
} df = DataFrame(data).set_index("poly_range") df plt.plot(poly_range, df["mse_lr"], label="lr") plt.plot(poly_range, df["mse_ridge"], label="ridge") plt.plot(poly_range, df["mse_lasso"], label="lasso") plt.legend() plt.show() df.min() df["mse_ridge"].sort_values().head() ## Exercise df = pd.read_csv("./K-MOOC_machine_learning/ch4/yield.csv", sep="\t") df.head() X = df["Temp"] y = df["Yield"] X = X.reshape(-1, 1) mse = [] poly_range = range(2, 10) for poly_value in poly_range: poly_features = PolynomialFeatures(degree=poly_value) X_poly = poly_features.fit_transform(X) lr = LinearRegression() lr.fit(X_poly, y)
def file_load(self, datatype):
    '''
    sales_dict and ads_dict map each store/country to the directories
    holding its ads data and sales data.
    datatype=True opens ads data; False opens sales data.
    start and end are dates and may be None; for now they are only used
    when reading sales data.
    How sales data is read: os.listdir finds the month folder for the
    data (e.g. 2017.03), filenames are matched against the date range,
    and the data inside that range is read in.
    Returns a DataFrame.
    '''
    ads_dict = {
        'SXDE': '/data/SX/EU/Ads/DE/ads report/',
        'SXES': '/data/SX/EU/Ads/ES/ads report/',
        'SXFR': '/data/SX/EU/Ads/FR/ads report/',
        'SXIT': '/data/SX/EU/Ads/IT/ads report/',
        'SXUK': '/data/SX/EU/Ads/UK/ads report/',
        'SXJP': '/data/SX/Japan/Ads/',
        'SXCA': '/data/SX/North America/Ads/CA/ads report/',
        'SXUS': '/data/SX/North America/Ads/USA/ads report/',
        'HYYDE': '/data/HYY/EU/ads/DE/',
        'HYYES': '/data/HYY/EU/ads/ES/',
        'HYYFR': '/data/HYY/EU/ads/FR/',
        'HYYIT': '/data/HYY/EU/ads/IT/',
        'HYYUK': '/data/HYY/EU/ads/UK/',
        'HYYJP': '/data/HYY/Japan/Ads/',
        'HYYUS': '/data/HYY/North America/ads/USA/ads report/',
        'TXHLDE': '/data/TXHL/EU/ads/DE/',
        'TXHLES': '/data/TXHL/EU/ads/ES/',
        'TXHLFR': '/data/TXHL/EU/ads/FR/',
        'TXHLIT': '/data/TXHL/EU/ads/IT/',
        'TXHLUK': '/data/TXHL/EU/ads/UK/',
        'TXHLJP': '/data/TXHL/Japan/ads/',
        'TXHLCA': '',
        'TXHLUS': '',
    }
    sales_dict = {
        'SXDE': '/data/SX/EU/business report/DE/',
        'SXES': '/data/SX/EU/business report/ES/',
        'SXFR': '/data/SX/EU/business report/FR/',
        'SXIT': '/data/SX/EU/business report/IT/',
        'SXUK': '/data/SX/EU/business report/UK/',
        'SXJP': '/data/SX/Japan/business report/',
        'SXCA': '/data/SX/North America/business report/CA/',
        'SXUS': '/data/SX/North America/business report/USA/',
        'HYYDE': '/data/HYY/EU/business report/DE/',
        'HYYES': '/data/HYY/EU/business report/ES/',
        'HYYFR': '/data/HYY/EU/business report/FR/',
        'HYYIT': '/data/HYY/EU/business report/IT/',
        'HYYUK': '/data/HYY/EU/business report/UK/',
        'HYYJP': '/data/HYY/Japan/business report/',
        'HYYCA': '/data/HYY/North America/business report/CA/',
        'HYYUS': '/data/HYY/North America/business report/USA/',
        'TXHLDE': '/data/TXHL/EU/business report/DE/',
        'TXHLES': '/data/TXHL/EU/business report/ES/',
        'TXHLFR': '/data/TXHL/EU/business report/FR/',
        'TXHLIT': '/data/TXHL/EU/business report/IT/',
        'TXHLUK': '/data/TXHL/EU/business report/UK/',
        'TXHLJP': '/data/TXHL/Japan/business report/',
        'TXHLCA': '',
        'TXHLUS': '',
    }
    if datatype:
        ad_campaign = DataFrame()
        path = 'F:/PycharmFile' + ads_dict[self.store + self.country]  # ads data directory
        file_fold = self.end.strftime('%Y') + '.' + self.end.strftime('%m')
        # TODO: build the folder and file names directly; open the file if it
        # exists, otherwise go searching for it
        if os.path.isdir(path + file_fold):  # found the month folder
            file_name = "ADs_" + self.store + self.country + "_" + str(self.end.year) + "-" \
                        + str(self.end.month) + "-" + str(self.end.day) + ".txt"
            if self.country == "JP":
                ad_campaign = pd.read_table(path + file_fold + "/" + file_name,
                                            sep='\t', encoding='Shift-JIS')
            else:
                ad_campaign = pd.read_table(path + file_fold + "/" + file_name,
                                            sep='\t', encoding='utf-8')
        return ad_campaign
    else:
        sales_df = DataFrame()
        path = 'F:/PycharmFile' + sales_dict[self.store + self.country]  # sales data directory
        delta = (self.end - self.start).days
        for i in range(delta + 1):
            date = (self.start + timedelta(days=i))
            file_name = self.store + self.country + '-' + date.strftime('%y') + '-' + str(date.month)\
                        + '-' + str(date.day) + '.csv'
            for root, subdirs, files, in os.walk(path):
                for name in files:
                    if name == file_name:
                        print name
                        file_path = root + '/' + name
                        df = pd.read_csv(file_path, encoding='utf8')
                        sales_df = pd.concat([sales_df, df])
        sales_df = sales_df[[
            u'(子)ASIN', u'商品名称', u'买家访问次数', u'买家访问次数百分比',
            u'页面浏览次数', u'页面浏览次数百分比', u'购买按钮页面浏览率',
            u'已订购商品数量', u'订单商品数量转化率', u'已订购商品销售额',
            u'订单商品种类数'
        ]]
        print sales_df.head()
        return sales_df
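# A small illustration (values assumed) of the file names file_load searches
# for, following the patterns built above:
from datetime import date
store, country, end = 'SX', 'DE', date(2017, 3, 15)
ads_name = "ADs_%s%s_%d-%d-%d.txt" % (store, country, end.year, end.month, end.day)
sales_name = "%s%s-%s-%d-%d.csv" % (store, country, end.strftime('%y'), end.month, end.day)
print(ads_name)    # ADs_SXDE_2017-3-15.txt
print(sales_name)  # SXDE-17-3-15.csv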
"popRanking wrap"}).text.strip() #print(site_name) #print(site_name,site_city_parent,site_link,site_rating,site_category,site_speciality,sep='\n') site = [ site_name, site_city_parent, site_link, site_rating, site_category, site_speciality ] sites.append(site) #print(sites[-1]) #print(len(sites)) return (sites) #listing_info=soup.find_all("div",attrs={"class":"listing_info"}) listing_info = soup.find_all("div", attrs={"class": "listing_details"}) sites = extract_details(listing_info) site_headers = [ "site_name", "site_city_parent", "site_link", "site_rating", "site_category", "site_speciality" ] len(sites) sites_df = DataFrame(sites) sites_df.columns = site_headers print(sites_df.head()) #site_link='https://www.tripadvisor.in/'+'Attraction_Review-g668046-d2441213-Reviews-Double_Decker_Living_Root_Bridge-Cherrapunjee_East_Khasi_Hills_District_Meghalaya.html' ##fetch_hours_GPS(site_link) ##page=urlopen(site_link) ##soup = BeautifulSoup(page,'html.parser') ##print(len(soup)) ##site_address=s.find("div",attrs={"class","detail_section address"}).text
# In[ ]: bizframe = DataFrame(bizrecords) userframe = DataFrame(userrecords) revframe = DataFrame(revrecords) checkinframe = DataFrame(checkinrecords) # In[ ]: users_df = DataFrame(userrecords, columns=['user_id','yelping_since','review_count', 'average_stars', 'variance_in_rating']) # In[ ]: users_df.head(n=10) # In[ ]: bizframe_sub = DataFrame(bizrecords, columns=['business_id', 'name', 'categories', 'review_count', 'stars']) # In[ ]: bizframe_sub.head(n=10) # In[ ]: bsort = bizframe_sub.sort_values(by='review_count', ascending=False)
df_roles = DataFrame(list(job_db.roles.find({}))) # In[7]: #COUNTING THE USER JOB EXPERIENCE df_UserJobExperience = DataFrame(list(job_db.UserJobExperience.find({}))) df_UserJobExperience['start_date'] = pd.to_datetime( df_UserJobExperience['start_date'], format='%d-%b-%Y %H:%M:%S') df_UserJobExperience['end_date'] = pd.to_datetime( df_UserJobExperience['end_date'], format='%d-%b-%Y %H:%M:%S') df_UserJobExperience['Experience in Months'] = ( df_UserJobExperience.end_date.dt.year - df_UserJobExperience.start_date.dt.year) * 12 + ( df_UserJobExperience.end_date.dt.month - df_UserJobExperience.start_date.dt.month) df_UserJobExperience.head() # # Merging dataframes # In[8]: df1 = pd.merge(df_PostJob, df_JobSkills, left_on='_id', right_on='job_id', how='left') df1.head().transpose() # In[9]: df2_jobs = pd.merge(df1, df_Company, on='company_id', how='left')
result=np.append(result,model.predict(test.ix[i])) # In[69]: len(result) # In[70]: submission=DataFrame({'Id':np.arange(1,9001,1),'Solution':result}) # In[71]: submission.head() # In[75]: submission.to_csv('submit1_KMeans.csv',index=False) ##### We have to also convert the solution column to int type from float else a 0 score is obtained # In[82]: submission.info() # In[91]:
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('./ml-1m/movies.dat', sep='::', header=None, names=mnames)
movies.head()
genre_iter = (set(x.split('|')) for x in movies.genres)
print(genre_iter)
genres = sorted(set.union(*genre_iter))
print(genres)
dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
print(dummies.head())
for i, gen in enumerate(movies.genres):
    dummies.loc[i, gen.split('|')] = 1  # tag each movie with its genres
print(dummies.head())
movies_windic = movies.join(dummies.add_prefix('Genre_'))
print(movies_windic.head())
values = np.random.rand(10)
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
print(pd.cut(values, bins))
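# The loop above fills the indicator matrix by hand; pandas can build the
# same genre dummies in a single call (column order aside, the result matches):
dummies_alt = movies.genres.str.get_dummies('|')
print(dummies_alt.head())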
def df_to_string(df: pd.DataFrame, name: str) -> str:
    """ Displays relevant information about the DF """
    return (f"{name} {df.shape} ({df.isna().sum().sum()} missing values):\n"
            + str(df.head()))
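# Example call on a toy frame (assumed purely for illustration):
import pandas as pd
demo = pd.DataFrame({'a': [1, None], 'b': [3, 4]})
print(df_to_string(demo, 'demo'))  # -> "demo (2, 2) (1 missing values):" + head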
pd.set_option('display.width', None)        # set the display width
pd.set_option('display.max_rows', None)     # show all rows
pd.set_option('display.max_columns', None)  # show all columns

client = pymongo.MongoClient('localhost', 27017)
futures = client.futures2
market = futures.position
# market = DataFrame(list(market.find({'date': {'$gte': '20190601'}})))
# # delete data

# latest document: sort by _id descending and take one
begin = DataFrame(list(market.find({}).sort([('_id', -1)]).limit(1)))
print(begin.head())
begin = begin['date'][0]
print("lastdate: " + begin)

from pandas import Series, DataFrame
import pandas as pd
import numpy as np
from datetime import datetime
import time

# dr=['2001-1-1','2030-1-1']
def test_dups_fancy_indexing(self): # GH 3455 from pandas.util.testing import makeCustomDataframe as mkdf df = mkdf(10, 3) df.columns = ['a', 'a', 'b'] result = df[['b', 'a']].columns expected = Index(['b', 'a', 'a']) tm.assert_index_equal(result, expected) # across dtypes df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']], columns=list('aaaaaaa')) df.head() str(df) result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']]) result.columns = list('aaaaaaa') # TODO(wesm): unused? df_v = df.iloc[:, 4] # noqa res_v = result.iloc[:, 4] # noqa tm.assert_frame_equal(df, result) # GH 3561, dups not in selected order df = DataFrame( {'test': [5, 7, 9, 11], 'test1': [4., 5, 6, 7], 'other': list('abcd')}, index=['A', 'A', 'B', 'C']) rows = ['C', 'B'] expected = DataFrame( {'test': [11, 9], 'test1': [7., 6], 'other': ['d', 'c']}, index=rows) result = df.loc[rows] tm.assert_frame_equal(result, expected) result = df.loc[Index(rows)] tm.assert_frame_equal(result, expected) rows = ['C', 'B', 'E'] expected = DataFrame( {'test': [11, 9, np.nan], 'test1': [7., 6, np.nan], 'other': ['d', 'c', np.nan]}, index=rows) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[rows] tm.assert_frame_equal(result, expected) # see GH5553, make sure we use the right indexer rows = ['F', 'G', 'H', 'C', 'B', 'E'] expected = DataFrame({'test': [np.nan, np.nan, np.nan, 11, 9, np.nan], 'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan], 'other': [np.nan, np.nan, np.nan, 'd', 'c', np.nan]}, index=rows) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[rows] tm.assert_frame_equal(result, expected) # List containing only missing label dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD')) with pytest.raises(KeyError): dfnu.loc[['E']] # ToDo: check_index_type can be True after GH 11497 # GH 4619; duplicate indexer with missing label df = DataFrame({"A": [0, 1, 2]}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[[0, 8, 0]] expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) df = DataFrame({"A": list('abc')}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[[0, 8, 0]] expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) # non unique with non unique selector df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C']) expected = DataFrame( {'test': [5, 7, 5, 7, np.nan]}, index=['A', 'A', 'A', 'A', 'E']) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[['A', 'A', 'E']] tm.assert_frame_equal(result, expected)
# placeholder generator # replace with your own code for k in []: yield k # <codecell> # use this code to run your code # I recommend replacing the None in islice to a small number to make sure you're on # the right track r = list(islice(places("NAME,P0010001"), None)) places_df = DataFrame(r) places_df.P0010001 = places_df.P0010001.astype("int") places_df["FIPS"] = places_df.apply(lambda s: s["state"] + s["place"], axis=1) print "number of places", len(places_df) print "total pop", places_df.P0010001.sum() places_df.head() # <codecell> # if you've done this correctly, the following asserts should stop complaining assert places_df.P0010001.sum() == 228457238 # number of places in 2010 Census assert len(places_df) == 29261
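# The FIPS column above concatenates the 2-digit state code with the 5-digit
# place code; for example (codes assumed for illustration):
s = {'state': '06', 'place': '44000'}  # California / Los Angeles city
print(s['state'] + s['place'])         # '0644000'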
df1['entropy5'] = df1[['Asian','Black','Hispanic','White','Other']].apply(entropy,axis=1) df1['entropy4'] = df1[['Asian','Black','Hispanic','White']].apply(entropy,axis=1) return df1 # <codecell> msa_list = list(islice(msas(P005_vars_with_name), None)) len(msa_list) # <codecell> msa_list dr = DataFrame(msa_list) dr = convert_P005_to_int(dr) dr.head() grouped = dr.groupby('metropolitan statistical area/micropolitan statistical area').sum() grouped.head() # <codecell> df_diversity = diversity(grouped) # <codecell> #'p_Asian', 'p_Black', 'p_Hispanic', 'p_Other','p_White' df_diversity['p_Asian'] = df_diversity['Asian']/df_diversity['Total'] df_diversity['p_Black'] = df_diversity['Black']/df_diversity['Total']
import statsmodels.api as sm from mtkirc import wd from pandas import DataFrame, Series, read_csv # -- Imports # rna-seq trans = read_csv('%s/data/kirc_transcriptomics_filtered_voom_de.txt' % wd, sep='\t', index_col=0) # targets tf_targets = read_csv('%s/tables/tfs_targets.txt' % wd, sep='\t', index_col=0) # -- Activities def calc_activity(c): ys = trans.ix[tf_targets.index, c].dropna() xs = tf_targets.ix[ys.index] xs['const'] = 1 lm = sm.OLS(ys, xs).fit() print lm.summary() return lm.tvalues.drop('const').to_dict() tf_activity = DataFrame({c: calc_activity(c) for c in trans}) print tf_activity.head() # -- Export tf_activity.to_csv('%s/tables/tf_activity.txt' % wd, sep='\t') print '[INFO] Done'
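# What calc_activity extracts, shown on toy data (names assumed): the
# t-statistics of an OLS fit, one per regressor, with the intercept dropped.
import numpy as np
import pandas as pd
import statsmodels.api as sm
xs = pd.DataFrame({'tf1': np.random.randn(50), 'tf2': np.random.randn(50)})
ys = 2 * xs['tf1'] - xs['tf2'] + np.random.randn(50)
xs['const'] = 1
lm = sm.OLS(ys, xs).fit()
print(lm.tvalues.drop('const').to_dict())  # {'tf1': ..., 'tf2': ...}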
import pandas as pd
from pandas import DataFrame

ReadCsv = pd.read_csv(r'gloss_entryID_synonyms.csv', sep=';', header='infer')
df_glosses = DataFrame(ReadCsv)
df_glosses.head()
new_df_glosses = df_glosses.drop_duplicates()
# write the de-duplicated frame back out
new_df_glosses.to_csv('gloss_entryID_synonyms.csv', sep=',', index=False)
# <codecell> windowed_df = DataFrame(vpr_window_results) # <codecell> windowed_df.head() # <codecell> import dendropy fixtrees = glob.glob('newdomaintrees/*.nwk') for f in fixtrees: if 'Equal' not in f: continue with open(f) as handle: tree = dendropy.Tree.get_from_stream(open(f), 'nexus') tree.deroot() rmnodes = [tree.prune_subtree(t, update_splits = True) for t in tree.leaf_nodes() if t.get_node_str().endswith("copy'")] #tree.prune_taxa(rmnodes)
'CO': 'Colo', 'CT': 'Conn', 'DC': 'District of Columbia' } # create a new column which maps the existing column using our names map self.df['name'] = self.df['abbrev'].map(names_map) def inspect_columns(self): print(self.df.columns) if __name__ == "__main__": # Run example df = DataFrame({'abbrev': ['CA', 'CO', 'CT', 'DC', 'TX']}) print(df.head()) # Initialize a new wranger object, passing our df into the class and storing it as part of # the new wrangler object wrangler = Wrangler(df) # Call the inspect method wrangler.inspect_columns() # Call the addstate names method, this adds the new column to the df that is stored on the wranger object wrangler.add_state_names() # Print by calling the method on the df inside the wranger object print(wrangler.df.head())
#print random_names[:10] #print births[:10] dataset = zip(random_names,births) df = DataFrame(data=dataset, columns=['Names','Births']) #print df[:10] df.to_csv("births1880.txt",index=False,header=False) df = read_csv(r'./births1880.txt',names=["Names","Births"]) print "df.info over all info of df" print df.info() print "df.head - first 5 rows" print df.head() import os os.remove(r'./births1880.txt') uqNames = df['Names'].unique() print "df['names'].unique()" print uqNames print "df.names.describe()" print df['Names'].describe() df = df.groupby("Names") #group by name print df df = df.sum() # applys sum to each groupBy obj print df
message = '\n'.join(lines) yield path, message def dataFrameFromDirectory(path, classification): rows = [] index = [] for filename, message in readFiles(path): rows.append({'message': message, 'class': classification}) index.append(filename) return DataFrame(rows, index=index) data = DataFrame({'message': [], 'class': []}) data = data.append(dataFrameFromDirectory('emails/spam', 'spam')) data = data.append(dataFrameFromDirectory('emails/ham', 'ham')) print(data.head()) vectorizer = CountVectorizer() counts = vectorizer.fit_transform(data['message'].values) classifier = MultinomialNB() targets = data['class'].values classifier.fit(counts, targets) examples = ['Free Smartphones now!!!', 'Hello customers here you can get new watches for least cost'] example_counts = vectorizer.transform(examples) predictions = classifier.predict(example_counts) print(predictions)
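# The script above trains and predicts on two hand-written examples only;
# a quick held-out sanity check could look like this (a sketch, reusing the
# counts/targets built above):
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    counts, targets, test_size=0.2, random_state=0)
clf = MultinomialNB().fit(X_train, y_train)
print(clf.score(X_test, y_test))  # accuracy on unseen messages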
def do():
    train_data = pd.read_csv(
        'D:/testFiles/for_excute_folder/activity_blFreight_2017_5_train_input.csv')
    test_data = pd.read_csv(
        'D:/testFiles/for_excute_folder/activity_blFreight_2017_5_test_input.csv')
    # test_data = pd.read_csv('D:/testFiles/for_excute_folder/activity_blFreight_2017_5_train_input.csv')
    # train_data = pd.read_csv('D:/testFiles/for_excute_folder/activity_blFreight_2017_5_test_input.csv')
    # drop_col_names = ['Global-SystemAdmin']
    train_data = train_data.drop(train_data.columns[0], axis=1)
    test_data = test_data.drop(test_data.columns[0], axis=1)
    train_data = train_data[train_data["TIME_USED"] <= 1000]
    test_data = test_data[test_data["TIME_USED"] <= 1000]
    # train_data = train_data[train_data["ASSIGN_COUNT"] <= 1]
    # test_data = test_data[test_data["ASSIGN_COUNT"] <= 1]
    # train_data = train_data.drop(drop_col_names, axis=1)
    # test_data = test_data.drop(drop_col_names, axis=1)
    train_data['TIME_USED'] = train_data['TIME_USED'] / 60
    test_data['TIME_USED'] = test_data['TIME_USED'] / 60
    train_data['TIME_USERD_MEDIAN_S2'] = train_data['TIME_USERD_MEDIAN']**2
    test_data['TIME_USERD_MEDIAN_S2'] = test_data['TIME_USERD_MEDIAN']**2
    # bkgOffice_median_by_task_type
    train_data['TIME_USERD_MEDIAN_S3'] = train_data[
        'TIME_USERD_MEDIAN'] * train_data['bkgOffice_median_by_task_type']
    test_data['TIME_USERD_MEDIAN_S3'] = test_data[
        'TIME_USERD_MEDIAN'] * test_data['bkgOffice_median_by_task_type']
    train_data['TIME_USERD_MEDIAN_S4'] = train_data[
        'bkgOffice_mean_by_task_type'] * train_data['bkgOffice_median_by_task_type']
    test_data['TIME_USERD_MEDIAN_S4'] = test_data[
        'bkgOffice_mean_by_task_type'] * test_data['bkgOffice_median_by_task_type']
    # train_data = train_data[
    #     ['TIME_USED', 'TIME_USERD_MEDIAN', 'Freight_Type=Semi Auto', 'Freight_Type=No rate',
    #      'TIME_USERD_COUNT', 'TIME_USERD_VAR', 'AWAY_COUNT', 'AWAY_MEAN',
    #      'TIME_USED_BY_REGION', 'COUNT_BY_REGION', 'TIME_USED_VAR_BY_REGION']]
    # test_data = test_data[
    #     ['TIME_USED', 'TIME_USERD_MEDIAN', 'Freight_Type=Semi Auto', 'Freight_Type=No rate',
    #      'TIME_USERD_COUNT', 'TIME_USERD_VAR', 'AWAY_COUNT', 'AWAY_MEAN',
    #      'TIME_USED_BY_REGION', 'COUNT_BY_REGION', 'TIME_USED_VAR_BY_REGION']]
    print(test_data.head())
    # print(train_data.describe())
    y_train = train_data['TIME_USED'].values.tolist()
    X_train = train_data.drop(['TIME_USED'], axis=1).values.tolist()
    y_test = test_data['TIME_USED'].values.tolist()
    X_test = test_data.drop(['TIME_USED'], axis=1).values.tolist()

    # pick a model
    # regressor = SGDRegressor(l1_ratio=0.1)
    # regressor = Ridge()
    # regressor = Lasso()
    # regressor = SVR()
    # regressor = RandomForestRegressor(n_estimators=400, n_jobs=-1, max_features='sqrt')
    # regressor = AdaBoostRegressor()
    # regressor = GradientBoostingRegressor(n_estimators=400)
    # regressor = BaggingRegressor()
    regressor = XGBRegressor(n_estimators=400, learning_rate=0.02,
                             colsample_bytree=0.1, seed=2017)
    # regressor = LGBMRegressor(n_estimators=400, learning_rate=0.02, seed=2017, colsample_bytree=1)

    # cross-validate on the training set
    # scores = cross_val_score(regressor, X_train, y_train, cv=4,
    #                          scoring='neg_mean_absolute_error', n_jobs=-1)
    # print('CV R^2 scores:', scores)
    # print('CV R^2 mean:', np.mean([scores]))

    # fit the model on the training set
    regressor.fit(X_train, y_train)

    # score the model on the test set (also R^2)
    print('Test-set R^2:', regressor.score(X_test, y_test))

    # compare predictions with the ground truth
    y_predict = regressor.predict(X_test)
    df = DataFrame()
    df['predict'] = y_predict
    df['real'] = y_test
    df['diff'] = y_predict - y_test
    df['diff_abs'] = abs(df['diff'])
    df.sort_values(by='diff_abs', ascending=False, inplace=True)
    print(df.head(20))
print(df['diff_abs'].describe(percentiles=np.arange(0.1, 1, 0.1))) print('MAE = ', mean_absolute_error(y_test, y_predict)) print('MSE = ', mean_squared_error(y_test, y_predict)) print('R2 = ', r2_score(y_test, y_predict)) print('feature_importances\n')
plt.ylabel('# of houses') # In[5]: plt.scatter(boston.data[:, 5], boston.target) plt.ylabel('Price in 1000s') plt.xlabel('# of rooms') # In[6]: boston_df = DataFrame(boston.data) boston_df.columns = boston.feature_names # In[7]: boston_df.head() # In[8]: boston_df['Price'] = boston.target # In[9]: boston_df.head() # In[10]: sns.lmplot('RM', 'Price', data=boston_df) # In[11]:
from aggregate_ranking_representation.models import RankingName, RawRankingRecord qs_name = RankingName.objects.filter(short_name='QS') qs_name = RankingName.objects.filter(short_name='QS')[0] the_name = RankingName.objects.filter(short_name='THE')[0] from aggregate_ranking_representation.models import RawRankingRecord, RankingName from pandas import Series, DataFrame import pandas as pd qs_raw_records = qs_name.rawrankingrecord_set.all() qs_raw_records qs_raw_record0 = qs_raw_records[0] qs_raw_record0 qs_raw_records = qs_name.rawrankingrecord_set.all().values() qs_raw_records qs_df = DataFrame(qs_raw_records) qs_df = DataFrame(list(qs_raw_records)) qs_df.head() the_name = RankingName.objects.filter(short_name='THE') the_name = RankingName.objects.filter(short_name='THE')[0] the_raw_records = the_name.rawrankingrecord_set.all() the_df = DataFrame(list(the_raw_records.values())) the_df.head() the_df[:-1] the_df.index the_df.drop(0, axis=0) the_df.head() the_df.drop(0) the_df.head() the_df = DataFrame(list(the_raw_records.values())) the_df.head() the_df[0] the_df[1]
df = DataFrame(data)
print(df)
print('-------------------------')

bbqurl = "https://www.bbq.co.kr/menu/menuList.asp"
bbq = req.urlopen(bbqurl)
print(bbq)
soup = bs4.BeautifulSoup(bbq, 'lxml')

datas = []
info = soup.select('div.info')  # menu entries
for i in info:
    tempPrice = i.select('p.pay')[0].text
    price = ''
    for j in tempPrice:  # keep digits only: 1, 19, 190, ...
        try:
            int(j)
            price += j
            # print(price)
        except:
            pass
    datas += [[i.select('p.name')[0].text, int(price)]]

df2 = DataFrame(datas, columns=['menu', 'price'])
print(df2.head())
print('price mean:', df2['price'].mean())
print('price std :', df2['price'].std())
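# The digit-by-digit try/except above can be compressed into one regex
# substitution (equivalent for these price strings):
import re
print(int(re.sub(r'\D', '', '16,000 KRW')))  # 16000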
# <codecell> from pandas import DataFrame import numpy as np from census import Census from us import states import settings c = Census(settings.CENSUS_KEY) r = c.sf1.get(('NAME', 'P0010001'), {'for': 'state:*'}) df1 = DataFrame(r) df1.head() # <codecell> len(df1) # <markdowncell> # **Q21**: Why does `df` have 52 items? Please explain # <markdowncell> # **A21**: # # When queried for "states", the US Census API returns data for the 50 states, the District of Columbia, and Puerto Rico: (50+1+1 = 52 entities).
def determine_classification(dataFrame):
    df = dataFrame
    df['Classification'] = df.apply(
        lambda row: calculate_classification(row), axis=1)
    return df
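# Hedged usage sketch: calculate_classification is defined elsewhere, so a
# stand-in is assumed here purely for illustration.
import pandas as pd

def calculate_classification(row):  # hypothetical stand-in
    return 'high' if row['score'] >= 0.5 else 'low'

frame = pd.DataFrame({'score': [0.2, 0.9]})
print(determine_classification(frame))  # adds a 'Classification' column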
def plotter(title, df, kind = 'line', x_label = None, y_label = None, style = 'ggplot', figsize = (8, 4), save = False, legend_pos = 'best', reverse_legend = 'guess', num_to_plot = 7, tex = 'try', colours = 'Accent', cumulative = False, pie_legend = True, partial_pie = False, show_totals = False, transparent = False, output_format = 'png', interactive = False, black_and_white = False, show_p_val = False, indices = False, **kwargs): """Visualise corpus interrogations. :param title: A title for the plot :type title: str :param df: Data to be plotted :type df: pandas.core.frame.DataFrame :param x_label: A label for the x axis :type x_label: str :param y_label: A label for the y axis :type y_label: str :param kind: The kind of chart to make :type kind: str ('line'/'bar'/'barh'/'pie'/'area') :param style: Visual theme of plot :type style: str ('ggplot'/'bmh'/'fivethirtyeight'/'seaborn-talk'/etc) :param figsize: Size of plot :type figsize: tuple (int, int) :param save: If bool, save with *title* as name; if str, use str as name :type save: bool/str :param legend_pos: Where to place legend :type legend_pos: str ('upper right'/'outside right'/etc) :param reverse_legend: Reverse the order of the legend :type reverse_legend: bool :param num_to_plot: How many columns to plot :type num_to_plot: int/'all' :param tex: Use TeX to draw plot text :type tex: bool :param colours: Colourmap for lines/bars/slices :type colours: str :param cumulative: Plot values cumulatively :type cumulative: bool :param pie_legend: Show a legend for pie chart :type pie_legend: bool :param partial_pie: Allow plotting of pie slices only :type partial_pie: bool :param show_totals: Print sums in plot where possible :type show_totals: str -- 'legend'/'plot'/'both' :param transparent: Transparent .png background :type transparent: bool :param output_format: File format for saved image :type output_format: str -- 'png'/'pdf' :param black_and_white: Create black and white line styles :type black_and_white: bool :param show_p_val: Attempt to print p values in legend if contained in df :type show_p_val: bool :param indices: To use when plotting "distance from root" :type indices: bool :param stacked: When making bar chart, stack bars on top of one another :type stacked: str :param filled: For area and bar charts, make every column sum to 100 :type filled: str :param legend: Show a legend :type legend: bool :param rot: Rotate x axis ticks by *rot* degrees :type rot: int :param subplots: Plot each column separately :type subplots: bool :param layout: Grid shape to use when *subplots* is True :type layout: tuple -- (int, int) :param interactive: Experimental interactive options :type interactive: list -- [1, 2, 3] :returns: matplotlib figure """ import corpkit import os try: from IPython.utils.shimmodule import ShimWarning import warnings warnings.simplefilter('ignore', ShimWarning) except: pass import matplotlib as mpl from matplotlib import rc # prefer seaborn plotting try: import seaborn as sns except: pass if interactive: import matplotlib.pyplot as plt, mpld3 else: import matplotlib.pyplot as plt import pandas from pandas import DataFrame import numpy from time import localtime, strftime from tests import check_pytex, check_spider, check_t_kinter if interactive: import mpld3 import collections from mpld3 import plugins, utils from plugins import InteractiveLegendPlugin, HighlightLines # check what environment we're in tk = check_t_kinter() running_python_tex = check_pytex() running_spider = check_spider() def truncate_colormap(cmap, 
minval=0.0, maxval=1.0, n=100): """remove extreme values from colourmap --- no pure white""" import matplotlib.colors as colors import numpy as np new_cmap = colors.LinearSegmentedColormap.from_list( 'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval), cmap(np.linspace(minval, maxval, n))) return new_cmap def get_savename(imagefolder, save = False, title = False, ext = 'png'): """Come up with the savename for the image.""" import os def urlify(s): "Turn title into filename" import re s = s.lower() s = re.sub(r"[^\w\s-]", '', s) s = re.sub(r"\s+", '-', s) s = re.sub(r"-(textbf|emph|textsc|textit)", '-', s) return s # name as if not ext.startswith('.'): ext = '.' + ext if type(save) == str: savename = os.path.join(imagefolder, (urlify(save) + ext)) #this 'else' is redundant now that title is obligatory else: if title: filename = urlify(title) + ext savename = os.path.join(imagefolder, filename) # remove duplicated ext if savename.endswith('%s%s' % (ext, ext)): savename = savename.replace('%s%s' % (ext, ext), ext, 1) return savename def rename_data_with_total(dataframe, was_series = False, using_tex = False, absolutes = True): """adds totals (abs, rel, keyness) to entry name strings""" if was_series: where_the_words_are = dataframe.index else: where_the_words_are = dataframe.columns the_labs = [] for w in list(where_the_words_are): if not absolutes: if was_series: perc = dataframe.T[w][0] else: the_labs.append(w) continue if using_tex: the_labs.append('%s (%.2f\%%)' % (w, perc)) else: the_labs.append('%s (%.2f %%)' % (w, perc)) else: if was_series: score = dataframe.T[w].sum() else: score = dataframe[w].sum() if using_tex: the_labs.append('%s (n=%d)' % (w, score)) else: the_labs.append('%s (n=%d)' % (w, score)) if not was_series: dataframe.columns = the_labs else: vals = list(dataframe[list(dataframe.columns)[0]].values) dataframe = pandas.DataFrame(vals, index = the_labs) dataframe.columns = ['Total'] return dataframe def auto_explode(dataframe, input, was_series = False, num_to_plot = 7): """give me a list of strings and i'll output explode option""" output = [0 for s in range(num_to_plot)] if was_series: l = list(dataframe.index) else: l = list(dataframe.columns) if type(input) == str or type(input) == int: input = [input] if type(input) == list: for i in input: if type(i) == str: index = l.index(i) else: index = i output[index] = 0.1 return output # check if we're doing subplots sbplt = False if 'subplots' in kwargs: if kwargs['subplots'] is True: sbplt = True kwargs['subplots'] = sbplt if colours is True: colours = 'Paired' # todo: get this dynamically instead. styles = ['dark_background', 'bmh', 'grayscale', 'ggplot', 'fivethirtyeight', 'matplotlib', False, 'mpl-white'] #if style not in styles: #raise ValueError('Style %s not found. 
Use %s' % (str(style), ', '.join(styles)))

    if style == 'mpl-white':
        try:
            sns.set_style("whitegrid")
        except:
            pass
        style = 'matplotlib'

    if style is not False and style.startswith('seaborn'):
        colours = False

    # use 'draggable = True' to make a draggable legend
    dragmode = kwargs.get('draggable', False)
    kwargs.pop('draggable', None)

    if kwargs.get('savepath'):
        mpl.rcParams['savefig.directory'] = kwargs.get('savepath')
        kwargs.pop('savepath', None)

    mpl.rcParams['savefig.bbox'] = 'tight'
    mpl.rcParams.update({'figure.autolayout': True})

    # try to use tex
    # TO DO:
    # make some font kwargs here
    using_tex = False
    mpl.rcParams['font.family'] = 'sans-serif'
    mpl.rcParams['text.latex.unicode'] = True
    if tex == 'try' or tex is True:
        try:
            rc('text', usetex=True)
            rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
            using_tex = True
        except:
            mpl.rc('font', family='sans-serif')
            mpl.rc('font', serif='Helvetica Neue')
            mpl.rc('text', usetex='false')
            rc('text', usetex=False)
    else:
        rc('text', usetex=False)

    if interactive:
        using_tex = False

    if show_totals is False:
        show_totals = 'none'

    # find out what kind of plot we're making, and enable
    # or disable interactive values if need be
    kwargs['kind'] = kind.lower()

    if interactive:
        if kwargs['kind'].startswith('bar'):
            interactive_types = [3]
        elif kwargs['kind'] == 'area':
            interactive_types = [2, 3]
        elif kwargs['kind'] == 'line':
            interactive_types = [2, 3]
        elif kwargs['kind'] == 'pie':
            interactive_types = None
            warnings.warn('Interactive plotting not yet available for pie plots.')
        else:
            interactive_types = [None]
    if interactive is False:
        interactive_types = [None]

    # find out if pie mode, add autopct format
    piemode = False
    if kind == 'pie':
        piemode = True
        # always the best spot for pie
        #if legend_pos == 'best':
            #legend_pos = 'lower left'
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            kwargs['pctdistance'] = 0.6
            if using_tex:
                kwargs['autopct'] = r'%1.1f\%%'
            else:
                kwargs['autopct'] = '%1.1f%%'

    # copy data, make series into df
    dataframe = df.copy()
    was_series = False
    if type(dataframe) == pandas.core.series.Series:
        was_series = True
        if not cumulative:
            dataframe = DataFrame(dataframe)
        else:
            dataframe = DataFrame(dataframe.cumsum())
    else:
        # don't know if this is much good.
        if cumulative:
            dataframe = DataFrame(dataframe.cumsum())
        if len(list(dataframe.columns)) == 1:
            was_series = True

    # attempt to convert x axis to ints:
    try:
        dataframe.index = [int(i) for i in list(dataframe.index)]
    except:
        pass

    # remove totals and tkinter order
    if not was_series and not all(x.lower() == 'total' for x in list(dataframe.columns)):
        for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
            try:
                dataframe = dataframe.drop(name, axis=ax, errors='ignore')
            except:
                pass
    else:
        dataframe = dataframe.drop('tkintertable-order', errors='ignore')
        dataframe = dataframe.drop('tkintertable-order', axis=1, errors='ignore')

    # look at columns to see if all can be ints, in which case, set up figure
    # for depnumming
    if not was_series:
        if indices == 'guess':
            def isint(x):
                try:
                    a = float(x)
                    b = int(a)
                except (ValueError, OverflowError):
                    return False
                else:
                    return a == b
            if all([isint(x) is True for x in list(dataframe.columns)]):
                indices = True
            else:
                indices = False

        # if depnumming, plot all, transpose, and rename axes
        if indices is True:
            num_to_plot = 'all'
            dataframe = dataframe.T
            if y_label is None:
                y_label = 'Percentage of all matches'
            if x_label is None:
                x_label = ''

    # set backend? 
output_formats = ['svgz', 'ps', 'emf', 'rgba', 'raw', 'pdf', 'svg', 'eps', 'png', 'pgf'] if output_format not in output_formats: raise ValueError('%s output format not recognised. Must be: %s' % (output_format, ', '.join(output_formats))) # don't know if these are necessary if 'pdf' in output_format: plt.switch_backend(output_format) if 'pgf' in output_format: plt.switch_backend(output_format) if num_to_plot == 'all': if was_series: if not piemode: num_to_plot = len(dataframe) else: num_to_plot = len(dataframe) else: if not piemode: num_to_plot = len(list(dataframe.columns)) else: num_to_plot = len(dataframe.index) # explode pie, or remove if not piemode if piemode and not sbplt and kwargs.get('explode'): kwargs['explode'] = auto_explode(dataframe, kwargs['explode'], was_series = was_series, num_to_plot = num_to_plot) else: kwargs.pop('explode', None) legend = kwargs.get('legend', False) #cut data short plotting_a_totals_column = False if was_series: if list(dataframe.columns)[0] != 'Total': try: can_be_ints = [int(x) for x in list(dataframe.index)] num_to_plot = len(dataframe) except: dataframe = dataframe[:num_to_plot] elif list(dataframe.columns)[0] == 'Total': plotting_a_totals_column = True if not 'legend' in kwargs: legend = False num_to_plot = len(dataframe) else: dataframe = dataframe.T.head(num_to_plot).T # remove stats fields, put p in entry text, etc. statfields = ['slope', 'intercept', 'r', 'p', 'stderr'] try: dataframe = dataframe.drop(statfields, axis = 1, errors = 'ignore') except: pass try: dataframe.ix['p'] there_are_p_vals = True except: there_are_p_vals = False if show_p_val: if there_are_p_vals: newnames = [] for col in list(dataframe.columns): pval = dataframe[col]['p'] def p_string_formatter(val): if val < 0.001: if not using_tex: return 'p < 0.001' else: return r'p $<$ 0.001' else: return 'p = %s' % format(val, '.3f') pstr = p_string_formatter(pval) newname = '%s (%s)' % (col, pstr) newnames.append(newname) dataframe.columns = newnames dataframe.drop(statfields, axis = 0, inplace = True, errors = 'ignore') else: warnings.warn('No p-values calculated to show.\n\nUse sort_by and keep_stats in editor() to generate these values.') else: if there_are_p_vals: dataframe.drop(statfields, axis = 0, inplace = True, errors = 'ignore') # make and set y label absolutes = True if type(dataframe) == pandas.core.frame.DataFrame: try: if not all([s.is_integer() for s in dataframe.iloc[0,:].values]): absolutes = False except: pass else: if not all([s.is_integer() for s in dataframe.values]): absolutes = False # use colormap if need be: if num_to_plot > 0: if not was_series: if kind in ['pie', 'line', 'area']: if colours: if not plotting_a_totals_column: if colours == 'Default': colours = 'Paired' kwargs['colormap'] = colours #else: if colours: if colours == 'Default': colours = 'Paired' kwargs['colormap'] = colours if piemode: if num_to_plot > 0: if colours == 'Default': colours = 'Paired' kwargs['colormap'] = colours else: if num_to_plot > 0: if colours == 'Default': colours = 'Paired' kwargs['colormap'] = colours # multicoloured bar charts if colours: if kind.startswith('bar'): if len(list(dataframe.columns)) == 1: if not black_and_white: import numpy as np the_range = np.linspace(0, 1, num_to_plot) cmap = plt.get_cmap(colours) kwargs['colors'] = [cmap(n) for n in the_range] # make a bar width ... ? ... 
#kwargs['width'] = (figsize[0] / float(num_to_plot)) / 1.5 # reversing legend option if reverse_legend is True: rev_leg = True elif reverse_legend is False: rev_leg = False # show legend or don't, guess whether to reverse based on kind if kind in ['bar', 'barh', 'area', 'line', 'pie']: if was_series: legend = False if kind == 'pie': if pie_legend: legend = True else: legend = False if kind in ['barh', 'area']: if reverse_legend == 'guess': rev_leg = True if not 'rev_leg' in locals(): rev_leg = False # the default legend placement if legend_pos is True: legend_pos = 'best' # cut dataframe if just_totals try: tst = dataframe['Combined total'] dataframe = dataframe.head(num_to_plot) except: pass # rotate automatically if 'rot' not in kwargs: if not was_series: xvals = [str(i) for i in list(dataframe.index)[:num_to_plot]] #if 'kind' in kwargs: #if kwargs['kind'] in ['barh', 'area']: #xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]] else: xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]] if len(max(xvals, key=len)) > 6: if not piemode: kwargs['rot'] = 45 # no title for subplots because ugly, if title and not sbplt: kwargs['title'] = title # no interactive subplots yet: if sbplt and interactive: import warnings interactive = False warnings.warn('No interactive subplots yet, sorry.') return # not using pandas for labels or legend anymore. #kwargs['labels'] = None #kwargs['legend'] = False if legend: if num_to_plot > 6: if not kwargs.get('ncol'): kwargs['ncol'] = num_to_plot / 7 # kwarg options go in leg_options leg_options = {'framealpha': .8, 'shadow': kwargs.get('shadow', False), 'ncol': kwargs.pop('ncol', 1)} # determine legend position based on this dict if legend_pos: possible = {'best': 0, 'upper right': 1, 'upper left': 2, 'lower left': 3, 'lower right': 4, 'right': 5, 'center left': 6, 'center right': 7, 'lower center': 8, 'upper center': 9, 'center': 10, 'o r': 2, 'outside right': 2, 'outside upper right': 2, 'outside center right': 'center left', 'outside lower right': 'lower left'} if type(legend_pos) == int: the_loc = legend_pos elif type(legend_pos) == str: try: the_loc = possible[legend_pos] except KeyError: raise KeyError('legend_pos value must be one of:\n%s\n or an int between 0-10.' 
%', '.join(list(possible.keys())))
            leg_options['loc'] = the_loc
            #weirdness needed for outside plot
            if legend_pos in ['o r', 'outside right', 'outside upper right']:
                leg_options['bbox_to_anchor'] = (1.02, 1)
            if legend_pos == 'outside center right':
                leg_options['bbox_to_anchor'] = (1.02, 0.5)
            if legend_pos == 'outside lower right':
                leg_options['loc'] = 'upper right'
                leg_options['bbox_to_anchor'] = (0.5, 0.5)

        # a bit of distance between legend and plot for outside legends
        if type(legend_pos) == str:
            if legend_pos.startswith('o'):
                leg_options['borderaxespad'] = 1

    if not piemode:
        if show_totals.endswith('both') or show_totals.endswith('legend'):
            dataframe = rename_data_with_total(dataframe,
                                               was_series=was_series,
                                               using_tex=using_tex,
                                               absolutes=absolutes)
    else:
        if pie_legend:
            if show_totals.endswith('both') or show_totals.endswith('legend'):
                dataframe = rename_data_with_total(dataframe,
                                                   was_series=was_series,
                                                   using_tex=using_tex,
                                                   absolutes=absolutes)

    if piemode:
        if partial_pie:
            dataframe = dataframe / 100.0

    # some pie things
    if piemode:
        if not sbplt:
            kwargs['y'] = list(dataframe.columns)[0]
            if pie_legend:
                kwargs['legend'] = False
                if was_series:
                    leg_options['labels'] = list(dataframe.index)
                else:
                    leg_options['labels'] = list(dataframe.columns)
        else:
            if pie_legend:
                kwargs['legend'] = False
                if was_series:
                    leg_options['labels'] = list(dataframe.index)
                else:
                    leg_options['labels'] = list(dataframe.index)

    def filler(df):
        pby = df.T.copy()
        for i in list(pby.columns):
            tot = pby[i].sum()
            pby[i] = pby[i] * 100.0 / tot
        return pby.T

    areamode = False
    if kind == 'area':
        areamode = True

    if legend is False:
        kwargs['legend'] = False

    # line highlighting option for interactive!
    if interactive:
        if 2 in interactive_types:
            if kind == 'line':
                kwargs['marker'] = ','
        if not piemode:
            kwargs['alpha'] = 0.1

    # convert dates --- works only in my current case! 
if plotting_a_totals_column or not was_series:
    try:
        can_it_be_int = int(list(dataframe.index)[0])
        can_be_int = True
    except (ValueError, TypeError):
        can_be_int = False
    if can_be_int:
        # treat an index of plausible years as an annual PeriodIndex
        if 1500 < int(list(dataframe.index)[0]) < 2050:
            n = pandas.PeriodIndex([d for d in list(dataframe.index)],
                                   freq='A')
            dataframe = dataframe.set_index(n)

if kwargs.get('filled'):
    if areamode or kind.startswith('bar'):
        dataframe = filler(dataframe)
    kwargs.pop('filled', None)

MARKERSIZE = 4
COLORMAP = {0: {'marker': None, 'dash': (None, None)},
            1: {'marker': None, 'dash': [5, 5]},
            2: {'marker': "o", 'dash': (None, None)},
            3: {'marker': None, 'dash': [1, 3]},
            4: {'marker': "s", 'dash': [5, 2, 5, 2, 5, 10]},
            5: {'marker': None, 'dash': [5, 3, 1, 2, 1, 10]},
            6: {'marker': 'o', 'dash': (None, None)},
            7: {'marker': None, 'dash': [5, 3, 1, 3]},
            8: {'marker': "1", 'dash': [1, 3]},
            9: {'marker': "*", 'dash': [5, 5]},
            10: {'marker': "2", 'dash': [5, 2, 5, 2, 5, 10]},
            11: {'marker': "s", 'dash': (None, None)}}

HATCHES = {0: {'color': '#dfdfdf', 'hatch': "/"},
           1: {'color': '#6f6f6f', 'hatch': "\\"},
           2: {'color': 'b', 'hatch': "|"},
           3: {'color': '#dfdfdf', 'hatch': "-"},
           4: {'color': '#6f6f6f', 'hatch': "+"},
           5: {'color': 'b', 'hatch': "x"}}

if black_and_white:
    if kind == 'line':
        kwargs['linewidth'] = 1
    cmap = plt.get_cmap('Greys')
    new_cmap = truncate_colormap(cmap, 0.25, 0.95)
    if kind == 'bar':
        # darker if just one entry
        if len(dataframe.columns) == 1:
            new_cmap = truncate_colormap(cmap, 0.70, 0.90)
    kwargs['colormap'] = new_cmap

class dummy_context_mgr():
    """a fake context for plotting without style
    perhaps made obsolete by 'classic' style in new mpl"""
    def __enter__(self):
        return None

    def __exit__(self, one, two, three):
        return False

with plt.style.context(style) if style != 'matplotlib' else dummy_context_mgr():

    if not sbplt:
        # check if negative values, no stacked if so
        if areamode:
            kwargs['legend'] = False
            if dataframe.applymap(lambda x: x < 0.0).any().any():
                kwargs['stacked'] = False
                rev_leg = False
        ax = dataframe.plot(figsize=figsize, **kwargs)
        if areamode:
            handles, labels = plt.gca().get_legend_handles_labels()
            del handles
            del labels
    else:
        plt.gcf().set_tight_layout(False)
        if not piemode:
            ax = dataframe.plot(figsize=figsize, **kwargs)
        else:
            ax = dataframe.plot(figsize=figsize, **kwargs)
            handles, labels = plt.gca().get_legend_handles_labels()
            plt.legend(handles, labels,
                       loc=leg_options['loc'],
                       bbox_to_anchor=(0, -0.1, 1, 1),
                       bbox_transform=plt.gcf().transFigure)

        # this line allows layouts with missing plots
        # i.e. layout = (5, 2) with only nine plots
        plt.gcf().set_tight_layout(False)

    if 'rot' in kwargs:
        if kwargs['rot'] != 0 and kwargs['rot'] != 90:
            labels = [item.get_text() for item in ax.get_xticklabels()]
            ax.set_xticklabels(labels, rotation=kwargs['rot'], ha='right')

    if transparent:
        plt.gcf().patch.set_facecolor('white')
        plt.gcf().patch.set_alpha(0)

    if black_and_white:
        if kind == 'line':
            # change everything to black and white
            # with interesting dashes and markers
            c = 0
            for line in ax.get_lines():
                line.set_color('black')
                # line.set_width(1)
                line.set_dashes(COLORMAP[c]['dash'])
                line.set_marker(COLORMAP[c]['marker'])
                line.set_markersize(MARKERSIZE)
                c += 1
                if c == len(list(COLORMAP.keys())):
                    c = 0

    # draw legend with proper placement etc
    if legend:
        if not piemode and not sbplt:
            if 3 not in interactive_types:
                handles, labels = plt.gca().get_legend_handles_labels()
                # area doubles the handles and labels; this removes half:
                if areamode:
                    handles = handles[-len(handles) // 2:]
                    labels = labels[-len(labels) // 2:]
                if rev_leg:
                    handles = handles[::-1]
                    labels = labels[::-1]
                lgd = plt.legend(handles, labels, **leg_options)

    if interactive:
        # 1 = highlight lines
        # 2 = line labels
        # 3 = legend switches
        ax = plt.gca()
        # fails for piemode
        lines = ax.lines
        handles, labels = plt.gca().get_legend_handles_labels()
        if 1 in interactive_types:
            plugins.connect(plt.gcf(), HighlightLines(lines))
        if 3 in interactive_types:
            plugins.connect(plt.gcf(),
                            InteractiveLegendPlugin(lines, labels,
                                                    alpha_unsel=0.0))
        for i, l in enumerate(lines):
            y_vals = l.get_ydata()
            x_vals = l.get_xdata()
            x_vals = [str(x) for x in x_vals]
            if absolutes:
                ls = ['%s (%s: %d)' % (labels[i], x_val, y_val)
                      for x_val, y_val in zip(x_vals, y_vals)]
            else:
                ls = ['%s (%s: %.2f%%)' % (labels[i], x_val, y_val)
                      for x_val, y_val in zip(x_vals, y_vals)]
            if 2 in interactive_types:
                # if 'kind' in kwargs and kwargs['kind'] == 'area':
                tooltip_line = mpld3.plugins.LineLabelTooltip(lines[i],
                                                              labels[i])
                mpld3.plugins.connect(plt.gcf(), tooltip_line)
                # else:
                if kind == 'line':
                    tooltip_point = mpld3.plugins.PointLabelTooltip(l,
                                                                    labels=ls)
                    mpld3.plugins.connect(plt.gcf(), tooltip_point)

    if piemode:
        if not sbplt:
            plt.axis('equal')
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)

    # add x label
    # this could be revised now!
    # if time series period, it's year for now
    if isinstance(dataframe.index, pandas.PeriodIndex):
        x_label = 'Year'

    if x_label is not False:
        if isinstance(x_label, str):
            plt.xlabel(x_label)
        else:
            # guess from the first index entry: plausible years get 'Year'
            check_x_axis = list(dataframe.index)[0]
            try:
                if isinstance(dataframe.index, pandas.PeriodIndex):
                    x_label = 'Year'
                check_x_axis = int(check_x_axis)
                if 1500 < check_x_axis < 2050:
                    x_label = 'Year'
                else:
                    x_label = 'Group'
            except (ValueError, TypeError):
                x_label = 'Group'
            if not sbplt:
                if not piemode:
                    plt.xlabel(x_label)

    def is_number(s):
        """check if str can be made into float/int"""
        try:
            float(s)  # for int, long and float
        except ValueError:
            try:
                complex(s)  # for complex
            except ValueError:
                return False
        return True

    # for now, always turn off sci notation
    from matplotlib.ticker import ScalarFormatter
    if not isinstance(dataframe.index, pandas.PeriodIndex):
        try:
            if all(is_number(s) for s in list(dataframe.index)):
                plt.gca().xaxis.set_major_formatter(ScalarFormatter())
        except Exception:
            pass
    try:
        if all(is_number(s) for s in list(dataframe.columns)):
            plt.gca().yaxis.set_major_formatter(ScalarFormatter())
    except Exception:
        pass

    # y labelling
    if not absolutes:
        y_l = 'Percentage'
    else:
        y_l = 'Absolute frequency'

    def suplabel(axis, label, label_prop=None, labelpad=5,
                 ha='center', va='center'):
        '''Add super ylabel or xlabel to the figure.
        Similar to matplotlib.suptitle.

        axis       - string: "x" or "y"
        label      - string
        label_prop - keyword dictionary for Text
        labelpad   - padding from the axis (default: 5)
        ha         - horizontal alignment (default: "center")
        va         - vertical alignment (default: "center")
        '''
        fig = plt.gcf()
        xmin = []
        ymin = []
        for ax in fig.axes:
            xmin.append(ax.get_position().xmin)
            ymin.append(ax.get_position().ymin)
        xmin, ymin = min(xmin), min(ymin)
        dpi = fig.dpi
        if axis.lower() == "y":
            rotation = 90.
            x = xmin - float(labelpad) / dpi
            y = 0.5
        elif axis.lower() == 'x':
            rotation = 0.
            x = 0.5
            y = ymin - float(labelpad) / dpi
        else:
            raise Exception("Unexpected axis: x or y")
        if label_prop is None:
            label_prop = dict()
        plt.gcf().text(x, y, label, rotation=rotation,
                       transform=fig.transFigure,
                       ha=ha, va=va, **label_prop)

    if y_label is not False:
        if not sbplt:
            if not piemode:
                if isinstance(y_label, str):
                    plt.ylabel(y_label)
                else:
                    plt.ylabel(y_l)
        else:
            if isinstance(y_label, str):
                the_y = y_label
            else:
                the_y = y_l
            # suplabel('y', the_y, labelpad=1.5)
            plt.gcf().text(0.04, 0.5, the_y, va='center',
                           rotation='vertical')
            # plt.subplots_adjust(left=0.5)
            # if not piemode:
            #     if isinstance(y_label, str):
            #         plt.ylabel(y_label)
            #     else:
            #         plt.ylabel(y_l)

    # hacky: turn legend into subplot titles :)
    if sbplt:
        # title the big plot
        # plt.gca().suptitle(title, fontsize=16)
        # plt.subplots_adjust(top=0.9)
        # get all axes
        if 'layout' not in kwargs:
            axes = [l for index, l in enumerate(ax)]
        else:
            axes = []
            cols = [l for index, l in enumerate(ax)]
            for col in cols:
                for bit in col:
                    axes.append(bit)

        # set subplot titles
        titletext = ''
        for index, a in enumerate(axes):
            try:
                titletext = list(dataframe.columns)[index]
            except IndexError:
                pass
            a.set_title(titletext)
            try:
                a.legend_.remove()
            except AttributeError:
                pass
            # remove axis labels for pie plots
            if piemode:
                a.axes.get_xaxis().set_visible(False)
                a.axes.get_yaxis().set_visible(False)
                a.axis('equal')
            # show grid
            a.grid(b=kwargs.get('grid', False))
            kwargs.pop('grid', None)

    # add sums to bar graphs and pie graphs
    # doubled right now, no matter
    if not sbplt:
        if kind.startswith('bar'):
            width = ax.containers[0][0].get_width()

        # show grid
        ax.grid(b=kwargs.get('grid', False))
        kwargs.pop('grid', None)

        if was_series:
            the_y_limit = plt.ylim()[1]
            if show_totals.endswith('plot') or show_totals.endswith('both'):
                # make plot a bit higher if putting these totals on it
                plt.ylim([0, the_y_limit * 1.05])
                for i, label in enumerate(list(dataframe.index)):
                    if len(dataframe.loc[label]) == 1:
                        score = dataframe.loc[label][0]
                    else:
                        if absolutes:
                            score = dataframe.loc[label].sum()
                        else:
                            # import warnings
                            # warnings.warn("It's not possible to determine "
                            #               "total percentage from individual "
                            #               "percentages.")
                            continue
                    if not absolutes:
                        plt.annotate('%.2f' % score, (i, score),
                                     ha='center', va='bottom')
                    else:
                        plt.annotate(score, (i, score),
                                     ha='center', va='bottom')
        else:
            the_y_limit = plt.ylim()[1]
            if show_totals.endswith('plot') or show_totals.endswith('both'):
                for i, label in enumerate(list(dataframe.columns)):
                    if len(dataframe[label]) == 1:
                        score = dataframe[label][0]
                    else:
                        if absolutes:
                            score = dataframe[label].sum()
                        else:
                            # see the warning above: totals cannot be
                            # recovered from percentages
                            continue
                    if not absolutes:
                        plt.annotate('%.2f' % score, (i, score),
                                     ha='center', va='bottom')
                    else:
                        plt.annotate(score, (i, score),
                                     ha='center', va='bottom')

    plt.subplots_adjust(left=0.1)
    plt.subplots_adjust(bottom=0.18)

    if 'layout' not in kwargs:
        if not sbplt:
            plt.tight_layout()

    if save:
        import os
        if running_python_tex:
            imagefolder = '../images'
        else:
            imagefolder = 'images'

        savename = get_savename(imagefolder, save=save, title=title,
                                ext=output_format)

        if not os.path.isdir(imagefolder):
            os.makedirs(imagefolder)

        # save image and get on with our lives
        if legend_pos.startswith('o'):
            # outside legends must be passed to savefig or they get clipped
            plt.gcf().savefig(savename, dpi=150,
                              bbox_extra_artists=(lgd,),
                              bbox_inches='tight',
                              format=output_format)
        else:
            plt.gcf().savefig(savename, dpi=150, format=output_format)
        time = strftime("%H:%M:%S", localtime())
        if os.path.isfile(savename):
            print('\n' + time + ": " + savename + " created.")
        else:
            raise ValueError("Error making %s." % savename)
    if dragmode:
        plt.legend().draggable()

    if sbplt:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)

    # show the figure, or return it, depending on the environment
    if not interactive and not running_python_tex and not running_spider \
            and not tk:
        plt.gcf().show()
        return
    elif running_spider or tk:
        return plt

    if interactive:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)
        try:
            ax.legend_.remove()
        except AttributeError:
            pass
        return mpld3.display()
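# The 'outside right' legend handling above combines three matplotlib pieces:
# a loc anchor, a bbox_to_anchor just outside the axes, and, when saving,
# bbox_extra_artists with bbox_inches='tight' so the legend is not clipped.
# A minimal standalone sketch of the same technique; the column names, data
# and output filename here are assumptions, not from the function above:
import matplotlib
matplotlib.use('Agg')  # assumption: render off-screen
import matplotlib.pyplot as plt
import pandas as pd

df = pd.DataFrame({'risk': [10, 20, 15], 'hazard': [5, 9, 12]})
ax = df.plot(kind='line', legend=False)
# pin the legend's 'upper left' corner just to the right of the axes
lgd = ax.legend(loc='upper left', bbox_to_anchor=(1.02, 1),
                framealpha=.8, borderaxespad=1)
# without bbox_extra_artists + bbox_inches='tight', the legend is cut off
plt.gcf().savefig('outside_legend.png', dpi=150,
                  bbox_extra_artists=(lgd,), bbox_inches='tight')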
def test_dups_fancy_indexing(self):

    # GH 3455
    df = tm.makeCustomDataframe(10, 3)
    df.columns = ["a", "a", "b"]
    result = df[["b", "a"]].columns
    expected = Index(["b", "a", "a"])
    tm.assert_index_equal(result, expected)

    # across dtypes
    df = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]],
                   columns=list("aaaaaaa"))
    df.head()
    str(df)
    result = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]])
    result.columns = list("aaaaaaa")

    # TODO(wesm): unused?
    df_v = df.iloc[:, 4]  # noqa
    res_v = result.iloc[:, 4]  # noqa

    tm.assert_frame_equal(df, result)

    # GH 3561, dups not in selected order
    df = DataFrame(
        {"test": [5, 7, 9, 11], "test1": [4.0, 5, 6, 7],
         "other": list("abcd")},
        index=["A", "A", "B", "C"],
    )
    rows = ["C", "B"]
    expected = DataFrame(
        {"test": [11, 9], "test1": [7.0, 6], "other": ["d", "c"]}, index=rows
    )
    result = df.loc[rows]
    tm.assert_frame_equal(result, expected)

    result = df.loc[Index(rows)]
    tm.assert_frame_equal(result, expected)

    rows = ["C", "B", "E"]
    with pytest.raises(KeyError, match="with any missing labels"):
        df.loc[rows]

    # see GH5553, make sure we use the right indexer
    rows = ["F", "G", "H", "C", "B", "E"]
    with pytest.raises(KeyError, match="with any missing labels"):
        df.loc[rows]

    # List containing only missing label
    dfnu = DataFrame(np.random.randn(5, 3), index=list("AABCD"))
    with pytest.raises(
        KeyError,
        match=re.escape(
            "\"None of [Index(['E'], dtype='object')] are in the [index]\""
        ),
    ):
        dfnu.loc[["E"]]

    # ToDo: check_index_type can be True after GH 11497

    # GH 4619; duplicate indexer with missing label
    df = DataFrame({"A": [0, 1, 2]})
    with pytest.raises(KeyError, match="with any missing labels"):
        df.loc[[0, 8, 0]]

    df = DataFrame({"A": list("abc")})
    with pytest.raises(KeyError, match="with any missing labels"):
        df.loc[[0, 8, 0]]

    # non unique with non unique selector
    df = DataFrame({"test": [5, 7, 9, 11]}, index=["A", "A", "B", "C"])
    with pytest.raises(KeyError, match="with any missing labels"):
        df.loc[["A", "A", "E"]]
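# A minimal sketch (hypothetical frame) of the behaviour pinned down above:
# with duplicate index labels, .loc returns every matching row, and in
# modern pandas a list indexer containing any missing label raises KeyError.
import pandas as pd

df = pd.DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C'])
print(df.loc[['A', 'B']])  # both 'A' rows come back, then 'B'
try:
    df.loc[['A', 'E']]  # 'E' is not in the index
except KeyError as err:
    print('KeyError:', err)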
def test_dups_fancy_indexing(self):

    # GH 3455
    from pandas.util.testing import makeCustomDataframe as mkdf
    df = mkdf(10, 3)
    df.columns = ['a', 'a', 'b']

    result = df[['b', 'a']].columns
    expected = Index(['b', 'a', 'a'])
    tm.assert_index_equal(result, expected)

    # across dtypes
    df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
                   columns=list('aaaaaaa'))
    df.head()
    str(df)
    result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']])
    result.columns = list('aaaaaaa')

    # TODO(wesm): unused?
    df_v = df.iloc[:, 4]  # noqa
    res_v = result.iloc[:, 4]  # noqa

    tm.assert_frame_equal(df, result)

    # GH 3561, dups not in selected order
    df = DataFrame(
        {'test': [5, 7, 9, 11],
         'test1': [4., 5, 6, 7],
         'other': list('abcd')}, index=['A', 'A', 'B', 'C'])
    rows = ['C', 'B']
    expected = DataFrame(
        {'test': [11, 9],
         'test1': [7., 6],
         'other': ['d', 'c']}, index=rows)
    result = df.loc[rows]
    tm.assert_frame_equal(result, expected)

    result = df.loc[Index(rows)]
    tm.assert_frame_equal(result, expected)

    rows = ['C', 'B', 'E']
    expected = DataFrame(
        {'test': [11, 9, np.nan],
         'test1': [7., 6, np.nan],
         'other': ['d', 'c', np.nan]}, index=rows)

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = df.loc[rows]
    tm.assert_frame_equal(result, expected)

    # see GH5553, make sure we use the right indexer
    rows = ['F', 'G', 'H', 'C', 'B', 'E']
    expected = DataFrame({'test': [np.nan, np.nan, np.nan, 11, 9, np.nan],
                          'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan],
                          'other': [np.nan, np.nan, np.nan,
                                    'd', 'c', np.nan]},
                         index=rows)
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = df.loc[rows]
    tm.assert_frame_equal(result, expected)

    # inconsistent returns for unique/duplicate indices when values are
    # missing
    df = DataFrame(np.random.randn(4, 3), index=list('ABCD'))
    expected = df.reindex(['E'])

    dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD'))
    with catch_warnings(record=True):
        result = dfnu.ix[['E']]
    tm.assert_frame_equal(result, expected)

    # ToDo: check_index_type can be True after GH 11497

    # GH 4619; duplicate indexer with missing label
    df = DataFrame({"A": [0, 1, 2]})
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = df.loc[[0, 8, 0]]
    expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
    tm.assert_frame_equal(result, expected, check_index_type=False)

    df = DataFrame({"A": list('abc')})
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = df.loc[[0, 8, 0]]
    expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0])
    tm.assert_frame_equal(result, expected, check_index_type=False)

    # non unique with non unique selector
    df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C'])
    expected = DataFrame(
        {'test': [5, 7, 5, 7, np.nan]}, index=['A', 'A', 'A', 'A', 'E'])
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = df.loc[['A', 'A', 'E']]
    tm.assert_frame_equal(result, expected)

    # GH 5835
    # dups on index and missing values
    df = DataFrame(
        np.random.randn(5, 5), columns=['A', 'B', 'B', 'B', 'A'])

    expected = pd.concat(
        [df.loc[:, ['A', 'B']],
         DataFrame(np.nan, columns=['C'], index=df.index)], axis=1)
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = df.loc[:, ['A', 'B', 'C']]
    tm.assert_frame_equal(result, expected)

    # GH 6504, multi-axis indexing
    df = DataFrame(np.random.randn(9, 2),
                   index=[1, 1, 1, 2, 2, 2, 3, 3, 3], columns=['a', 'b'])

    expected = df.iloc[0:6]
    result = df.loc[[1, 2]]
    tm.assert_frame_equal(result, expected)

    expected = df
    result = df.loc[:, ['a', 'b']]
    tm.assert_frame_equal(result, expected)

    expected = df.iloc[0:6, :]
    result = df.loc[[1, 2], ['a', 'b']]
    tm.assert_frame_equal(result, expected)
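# In current pandas, the FutureWarning this older test asserts has become the
# hard KeyError shown in the newer copy of the test above. The deprecation
# message pointed at .reindex as the replacement; a minimal sketch of that
# migration on a unique index (a duplicated axis cannot be reindexed at all):
import pandas as pd

df = pd.DataFrame({"A": [0, 1, 2]})
print(df.reindex([0, 8]))  # label 8 becomes an all-NaN row instead of raising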
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from pandas import DataFrame, Series
from sklearn.cluster import KMeans
from sklearn.cluster import Birch

# read the data file
datafile = 'MachineLearning\\DataSet\\go_track_trackspoints.csv'
outfile = 'out.csv'
data = pd.read_csv(datafile, usecols=["latitude", "longitude"])
d = DataFrame(data)
d.head()

# ---------------------------------- clustering ----------------------------------
# cluster into 3 groups, with 4 parallel jobs and at most 500 iterations
mod = KMeans(n_clusters=3, n_jobs=4, max_iter=500)
mod.fit_predict(d)  # the return value (y_pred) holds each row's cluster label

# with 3 clusters, count the points in each cluster and find their centers
r1 = pd.Series(mod.labels_).value_counts()
r2 = pd.DataFrame(mod.cluster_centers_)
r = pd.concat([r2, r1], axis=1)
r.columns = list(d.columns) + ["Clustering"]
print(r)

# tag every row with the cluster it was assigned to
r = pd.concat([d, pd.Series(mod.labels_, index=d.index)], axis=1)
r.columns = list(d.columns) + ["Clustering"]
print(r.head())
r.to_csv(outfile)  # keep this line if the result should be saved locally
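# Birch is imported above but never used. A minimal sketch of swapping it in
# for KMeans on the same two columns; the threshold value and the toy points
# here are assumptions, not part of the original script:
from sklearn.cluster import Birch
import pandas as pd

pts = pd.DataFrame({"latitude": [-10.9, -10.9, -3.7, -3.8, 40.4, 40.5],
                    "longitude": [-37.1, -37.0, -38.5, -38.6, -3.7, -3.6]})
birch = Birch(n_clusters=3, threshold=0.5)
print(birch.fit_predict(pts))  # one cluster label per row, as with KMeans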