def commentsForStory(objectId, log):
    try:
        url = 'https://hn.algolia.com/api/v1/items/%d' % (objectId)
        req = urllib.request.Request(url)
        response = urllib.request.urlopen(req)
        data = json.loads(response.read().decode("utf-8"))
    except (KeyboardInterrupt, SystemExit):
        raise
    except IOError as e:
        message = '%d: %s' % (e.code, e.reason)
        log[str(objectId)] = message
        print(message)
        return

    tree = commentTree(data)
    commentRecords = preorderTraversalIgnoreRoot(tree)
    if len(commentRecords) == 0:
        log[str(objectId)] = NO_COMMENTS
        return

    columns = ['id', 'author', 'text', 'points', 'created_at', 'parent_id', 'story_id']
    df = DataFrame(columns=columns, index=numpy.arange(len(commentRecords)))
    for index, comment in enumerate(commentRecords):
        df.ix[index] = comment

    df.to_csv("comments-by-story/comments-%d.csv" % objectId, encoding='utf-8', index=False)
    log[str(objectId)] = SUCCESS
def test_passing_dtype(self):
    # see gh-6607
    df = DataFrame(np.random.rand(5, 2).round(4), columns=list('AB'),
                   index=['1A', '1B', '1C', '1D', '1E'])

    with tm.ensure_clean('__passing_str_as_dtype__.csv') as path:
        df.to_csv(path)

        # see gh-3795: passing 'str' as the dtype
        result = self.read_csv(path, dtype=str, index_col=0)
        expected = df.astype(str)
        tm.assert_frame_equal(result, expected)

        # for parsing, interpret object as str
        result = self.read_csv(path, dtype=object, index_col=0)
        tm.assert_frame_equal(result, expected)

        # we expect all object columns, so need to
        # convert to test for equivalence
        result = result.astype(float)
        tm.assert_frame_equal(result, df)

        # invalid dtype
        self.assertRaises(TypeError, self.read_csv, path,
                          dtype={'A': 'foo', 'B': 'float64'},
                          index_col=0)

    # see gh-12048: empty frame
    actual = self.read_csv(StringIO('A,B'), dtype=str)
    expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str)
    tm.assert_frame_equal(actual, expected)
def test_to_csv_compression(self, compression_only, read_infer, to_infer):
    # see gh-15008
    compression = compression_only

    if compression == "zip":
        pytest.skip("{compression} is not supported "
                    "for to_csv".format(compression=compression))

    # We'll complete file extension subsequently.
    filename = "test."

    if compression == "gzip":
        filename += "gz"
    else:
        # xz --> .xz
        # bz2 --> .bz2
        filename += compression

    df = DataFrame({"A": [1]})

    to_compression = "infer" if to_infer else compression
    read_compression = "infer" if read_infer else compression

    with tm.ensure_clean(filename) as path:
        df.to_csv(path, compression=to_compression)
        result = pd.read_csv(path, index_col=0,
                             compression=read_compression)
        tm.assert_frame_equal(result, df)
def multivarAnalysis():
    x = np.arange(-180, 181)
    y = np.arange(-89, 90)
    xvec = [xi for xi in x for yi in y]
    yvec = [yi for xi in x for yi in y]

    sali = [np.nan for i in xvec]
    temp = copy.copy(sali)
    sili = copy.copy(sali)
    nitr = copy.copy(sali)
    phos = copy.copy(sali)
    bath = copy.copy(sali)
    prod = copy.copy(sali)

    sali = fillList(sali, readNetcdf('gmtplots/netcdffiles/salinity.nc', ['lon', 'lat', 'z'], 1, noCheck, notNan))
    temp = fillList(temp, readNetcdf('gmtplots/netcdffiles/temperature.nc', ['lon', 'lat', 'z'], 1, noCheck, notNan))
    sili = fillList(sili, readNetcdf('gmtplots/netcdffiles/silicateAnnual.nc', ['lon', 'lat', 'z'], 1, noCheck, notNan))
    nitr = fillList(nitr, readNetcdf('gmtplots/netcdffiles/nitrateAnnual.nc', ['lon', 'lat', 'z'], 1, noCheck, notNan))
    phos = fillList(phos, readNetcdf('gmtplots/netcdffiles/phosphateAnnual.nc', ['lon', 'lat', 'z'], 1, noCheck, notNan))
    bath = fillList(bath, readNetcdf('gmtplots/netcdffiles/bathymetryMasked.nc', ['lon', 'lat', 'z'], 1, noCheck, notNan))

    prodSummer = readNetcdf('gmtplots/netcdffiles/summerAll.nc', ['lon', 'lat', 'z'], 1, noCheck, notNan)
    prodWinter = readNetcdf('gmtplots/netcdffiles/winterAll.nc', ['lon', 'lat', 'z'], 1, noCheck, notNan)
    prodSummer['classif'].extend(prodWinter['classif'])
    prodSummer['lon'].extend(prodWinter['lon'])
    prodSummer['lat'].extend(prodWinter['lat'])
    prod = fillList(prod, prodSummer)

    # do not print a row if anything in the list is NaN
    stats = DataFrame([[i + 1, xvec[i], yvec[i], sali[i], temp[i], sili[i], nitr[i], phos[i], bath[i], prod[i]]
                       for i in range(len(xvec))
                       if not np.isnan([i + 1, xvec[i], yvec[i], sali[i], temp[i], sili[i], nitr[i], phos[i], bath[i], prod[i]]).any()],
                      index=None, columns=None)
    stats.to_csv('lithologyStats.m', sep='\t', index=None, columns=None)
def predict(fname_specialists='net-specialists.pickle'):
    with open(fname_specialists, 'rb') as f:
        specialists = pickle.load(f)

    X = load2d(test=True)[0]
    y_pred = np.empty((X.shape[0], 0))

    for model in specialists.values():
        y_pred1 = model.predict(X)
        y_pred = np.hstack([y_pred, y_pred1])

    columns = ()
    for cols in specialists.keys():
        columns += cols

    y_pred2 = y_pred * 48 + 48
    y_pred2 = y_pred2.clip(0, 96)
    df = DataFrame(y_pred2, columns=columns)

    lookup_table = read_csv(os.path.expanduser(FLOOKUP))
    values = []

    for index, row in lookup_table.iterrows():
        values.append((
            row['RowId'],
            df.ix[row.ImageId - 1][row.FeatureName],
        ))

    now_str = datetime.now().isoformat().replace(':', '-')
    submission = DataFrame(values, columns=('RowId', 'Location'))
    filename = 'submission-{}.csv'.format(now_str)
    submission.to_csv(filename, index=False)
    print("Wrote {}".format(filename))
class matchbox:
    def __init__(self, articlepaths):
        self.num_exports = 0
        self.num_articles_total = len(articlepaths)
        self.num_articles_matched = 0
        self.num_matches = 0
        self.dataframe = DataFrame()
        self.init_time = time.strftime("%Y-%m-%d_%H-%M-%S_")

    def update(self, matches):
        self.dataframe = self.dataframe.append(matches, ignore_index=True)
        self.num_articles_matched += 1
        self.num_matches += len(matches)
        print('Matched {} places in article {} of {} ({:.2%} complete). '
              'Total: {}.'.format(len(matches), self.num_articles_matched,
                                  self.num_articles_total,
                                  self.num_articles_matched / self.num_articles_total,
                                  self.num_matches))

    def empty_into_csv(self):
        self.num_exports += 1
        outname = outdir + self.init_time + 'pubs_aegypti_' + str(self.num_exports) + '.csv'
        self.dataframe.to_csv(outname, encoding='utf-8')
        print('Wrote matches from chunk {} to {}.'.format(self.num_exports, outname))
        del self.dataframe
        self.dataframe = DataFrame()
def test_to_csv_with_single_column(self):
    # see gh-18676, https://bugs.python.org/issue32255
    #
    # Python's CSV library adds an extraneous '""'
    # before the newline when the NaN-value is in
    # the first row. Otherwise, only the newline
    # character is added. This behavior is inconsistent
    # and was patched in https://bugs.python.org/pull_request4672.
    df1 = DataFrame([None, 1])
    expected1 = """\
""
1.0
"""
    with tm.ensure_clean('test.csv') as path:
        df1.to_csv(path, header=None, index=None)
        with open(path, 'r') as f:
            assert f.read() == expected1

    df2 = DataFrame([1, None])
    expected2 = """\
1.0
""
"""
    with tm.ensure_clean('test.csv') as path:
        df2.to_csv(path, header=None, index=None)
        with open(path, 'r') as f:
            assert f.read() == expected2
def getIndexChangeRate(startDate, endDate):
    df_result = DataFrame()

    df = ts.get_hist_data('sh', start=startDate, end=endDate).reset_index()
    df['gap'] = df['high'] - df['low']
    df['gap_rate'] = df['gap'] / df['close'] * 100
    df['mkt'] = 'sh'
    df_result = df_result.append(df)

    df = ts.get_hist_data('sz', start=startDate, end=endDate).reset_index()
    df['gap'] = df['high'] - df['low']
    df['gap_rate'] = df['gap'] / df['close'] * 100
    df['mkt'] = 'sz'
    df_result = df_result.append(df)

    df = ts.get_hist_data('zxb', start=startDate, end=endDate).reset_index()
    df['gap'] = df['high'] - df['low']
    df['gap_rate'] = df['gap'] / df['close'] * 100
    df['mkt'] = 'zxb'
    df_result = df_result.append(df)

    df = ts.get_hist_data('cyb', start=startDate, end=endDate).reset_index()
    df['gap'] = df['high'] - df['low']
    df['gap_rate'] = df['gap'] / df['close'] * 100
    df['mkt'] = 'cyb'
    df_result = df_result.append(df)

    fileName = r'D:\stock\index_changeRate_' + startDate + '_' + endDate + '.csv'
    df_result = df_result.loc[:, ['date', 'mkt', 'close', 'volume', 'price_change', 'p_change', 'gap', 'gap_rate']]
    df_result = df_result.sort_index(by='date', ascending=False)
    df_result.to_csv(fileName, index=False)
def twitter_daily_aggregate(retrievaldate):
    # Date Retrieval
    d = []
    dt = parser.parse(retrievaldate) + timedelta(days=-1)
    d.append(dt)
    d.append(d[-1] + timedelta(days=1))

    # DataFrame Init
    ctrend = DataFrame()

    while d[-1] < datetime.utcnow():
        print 'processing ', d[-1], ' ..........'

        # Daily Mention Count
        mnts = twitter_count(d, mentions)

        # User Follower Count
        usrs = twitter_follower(d, users)

        # Join
        trend = mnts.join(usrs)
        trend['Date'] = Period(d[-1], 'D')

        # Append to DataFrame
        ctrend = concat([ctrend, trend])

        # Extend Dates
        d.append(d[-1] + timedelta(days=1))

    # Join DataFrames and Fill NAs
    ctrend = ctrend.fillna(0)

    # Save
    print 'printing the file'
    ctrend.to_csv('twitter_trend.csv')
    return ctrend
def test_to_csv_from_csv2(self):
    with ensure_clean('__tmp_to_csv_from_csv2__') as path:
        # duplicate index
        df = DataFrame(np.random.randn(3, 3), index=['a', 'a', 'b'],
                       columns=['x', 'y', 'z'])
        df.to_csv(path)
        result = DataFrame.from_csv(path)
        assert_frame_equal(result, df)

        midx = MultiIndex.from_tuples(
            [('A', 1, 2), ('A', 1, 2), ('B', 1, 2)])
        df = DataFrame(np.random.randn(3, 3), index=midx,
                       columns=['x', 'y', 'z'])
        df.to_csv(path)
        result = DataFrame.from_csv(path, index_col=[0, 1, 2],
                                    parse_dates=False)

        # TODO from_csv names index ['Unnamed: 1', 'Unnamed: 2'] should it ?
        assert_frame_equal(result, df, check_names=False)

        # column aliases
        col_aliases = Index(['AA', 'X', 'Y', 'Z'])
        self.frame2.to_csv(path, header=col_aliases)
        rs = DataFrame.from_csv(path)
        xp = self.frame2.copy()
        xp.columns = col_aliases
        assert_frame_equal(xp, rs)

        self.assertRaises(ValueError, self.frame2.to_csv, path,
                          header=['AA', 'X'])
def test_to_csv_wide_frame_formatting(self):
    # Issue #8621
    df = DataFrame(np.random.randn(1, 100010), columns=None, index=None)
    with ensure_clean() as filename:
        df.to_csv(filename, header=False, index=False)
        rs = read_csv(filename, header=None)
        assert_frame_equal(rs, df)
def test_to_csv_dtnat(self):
    # GH3437
    from pandas import NaT

    def make_dtnat_arr(n, nnat=None):
        if nnat is None:
            nnat = int(n * 0.1)  # 10%
        s = list(date_range('2000', freq='5min', periods=n))
        if nnat:
            for i in np.random.randint(0, len(s), nnat):
                s[i] = NaT
            i = np.random.randint(100)
            s[-i] = NaT
            s[i] = NaT
        return s

    chunksize = 1000
    # N=35000
    s1 = make_dtnat_arr(chunksize + 5)
    s2 = make_dtnat_arr(chunksize + 5, 0)
    # s3=make_dtnjat_arr(chunksize+5,0)
    with ensure_clean('1.csv') as pth:
        df = DataFrame(dict(a=s1, b=s2))
        df.to_csv(pth, chunksize=chunksize)
        recons = DataFrame.from_csv(pth)._convert(datetime=True,
                                                  coerce=True)
        assert_frame_equal(df, recons, check_names=False,
                           check_less_precise=True)
def test_to_csv_from_csv1(self):
    with ensure_clean('__tmp_to_csv_from_csv1__') as path:
        self.frame['A'][:5] = nan

        self.frame.to_csv(path)
        self.frame.to_csv(path, columns=['A', 'B'])
        self.frame.to_csv(path, header=False)
        self.frame.to_csv(path, index=False)

        # test roundtrip
        self.tsframe.to_csv(path)
        recons = DataFrame.from_csv(path)
        assert_frame_equal(self.tsframe, recons)

        self.tsframe.to_csv(path, index_label='index')
        recons = DataFrame.from_csv(path, index_col=None)
        assert(len(recons.columns) == len(self.tsframe.columns) + 1)

        # no index
        self.tsframe.to_csv(path, index=False)
        recons = DataFrame.from_csv(path, index_col=None)
        assert_almost_equal(self.tsframe.values, recons.values)

        # corner case
        dm = DataFrame({'s1': Series(lrange(3), lrange(3)),
                        's2': Series(lrange(2), lrange(2))})
        dm.to_csv(path)
        recons = DataFrame.from_csv(path)
        assert_frame_equal(dm, recons)
def write_bar_as_csv(bt):
    CloseTime = bt.range_bar.CloseTime[:]
    High = bt.range_bar.High[:]
    Low = bt.range_bar.Low[:]
    Open = bt.range_bar.Open[:]
    Close = bt.range_bar.Close[:]

    CloseTime.reverse()
    High.reverse()
    Low.reverse()
    Open.reverse()
    Close.reverse()

    range_bar_df = DataFrame({'Date': CloseTime,
                              'H': High,
                              'L': Low,
                              'O': Open,
                              'C': Close},
                             columns=['Date', 'H', 'L', 'O', 'C'])

    strat = bt.strategies[bt.strategies.keys()[0]]
    for indicator_name in strat.indicators:
        curr_indicator = strat.indicators[indicator_name].val
        curr_indicator.reverse()
        range_bar_df[indicator_name] = curr_indicator

    print "Writing to: {}".format(bt.bar_data_root)
    range_bar_df.to_csv(path_or_buf=bt.bar_data_root, index=False)
def test_to_csv_quoting(self):
    df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']})

    buf = StringIO()
    df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC)

    result = buf.getvalue()
    expected = ('"A","B"\n'
                '1,"foo"\n'
                '2,"bar"\n'
                '3,"baz"\n')
    self.assertEqual(result, expected)

    # quoting windows line terminators, presents with encoding?
    # #3503
    text = 'a,b,c\n1,"test \r\n",3\n'
    df = pd.read_csv(StringIO(text))
    buf = StringIO()
    df.to_csv(buf, encoding='utf-8', index=False)
    self.assertEqual(buf.getvalue(), text)

    # testing if quoting parameter is passed through with multi-indexes
    # related to issue #7791
    df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
    df = df.set_index(['a', 'b'])
    expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n'
    self.assertEqual(df.to_csv(quoting=csv.QUOTE_ALL), expected)
def test_to_csv_from_csv2(self):
    with ensure_clean('__tmp_to_csv_from_csv2__') as path:
        # duplicate index
        df = DataFrame(np.random.randn(3, 3), index=['a', 'a', 'b'],
                       columns=['x', 'y', 'z'])
        df.to_csv(path)
        result = self.read_csv(path)
        assert_frame_equal(result, df)

        midx = MultiIndex.from_tuples(
            [('A', 1, 2), ('A', 1, 2), ('B', 1, 2)])
        df = DataFrame(np.random.randn(3, 3), index=midx,
                       columns=['x', 'y', 'z'])
        df.to_csv(path)
        result = self.read_csv(path, index_col=[0, 1, 2],
                               parse_dates=False)
        assert_frame_equal(result, df, check_names=False)

        # column aliases
        col_aliases = Index(['AA', 'X', 'Y', 'Z'])
        self.frame2.to_csv(path, header=col_aliases)

        rs = self.read_csv(path)
        xp = self.frame2.copy()
        xp.columns = col_aliases
        assert_frame_equal(xp, rs)

        msg = "Writing 4 cols but got 2 aliases"
        with pytest.raises(ValueError, match=msg):
            self.frame2.to_csv(path, header=['AA', 'X'])
def arrange_aggregates(cumsums, symbols, aggs):
    for i in symbols:
        cumsums[i] = cumsums[i].ix[:, 0:5]

    cols = cumsums['ATL'].columns.tolist()
    cols2 = aggs['ATL'].columns.tolist()
    cols3 = (aggs['ATL'].columns + '1').tolist()
    cols.extend(cols2)
    cols.extend(cols3)

    ATL = DataFrame(columns=cols)
    for team in symbols:
        for Date in cumsums[team]['Date']:
            Opponent = cumsums[team].ix[cumsums[team]['Date'] == Date, 'Opponent'].all()
            cumsums_temp = cumsums[team].ix[cumsums[team]['Date'] == Date]
            cumsums_temp = cumsums_temp.reset_index()
            team_temp = aggs[team]
            oppenent_temp = DataFrame(aggs[Opponent])
            oppenent_temp.columns = cols3
            atl = pd.concat([cumsums_temp, team_temp, oppenent_temp], axis=1)
            atl = atl.drop('index', axis=1)
            atl.columns = cols
            ATL = pd.concat([ATL, atl], axis=0)
        print team

    ATL.to_csv('final.csv', sep=',', index=False)
def __extract_single_features(feature):
    """
    Creates two files (one with unprocessed, and one with singular feature)
    containing questions that have the feature

    Arguments:
        feature: The feature(s) to look for

    Returns:
        tuple (pandas.DataFrame, pandas.DataFrame): Tuple that contains the dataframe with
        updated unprocessed questions (those that contain the given feature), and the other
        dataframe that has the features added to its question text
    """
    up_name = "UP_" + feature.strip()
    new_index = old_index = 0
    path = const.FILEPATH_TRAINING_DATA + FILENAME_START
    unprocessed_df = load_training_data(path, False, exclude_site_tags=True)
    feature_df = DataFrame.from_csv(__get_filename(const.FILEPATH_FEATURE_DETECTOR, feature))
    new_up_dataframe = DataFrame(columns=unprocessed_df.columns.values)
    new_feat_dataframe = DataFrame(columns=unprocessed_df.columns.values)
    for question in feature_df[const.QUESTION_TEXT_KEY]:
        if feature in question:
            new_feat_dataframe.loc[new_index] = feature_df.loc[old_index].copy()
            new_up_dataframe.loc[new_index] = unprocessed_df.loc[old_index].copy()
            new_index += 1
        old_index += 1
    new_up_dataframe.to_csv(__get_filename(NEW_PATH, up_name), encoding='utf-8')
    new_feat_dataframe.to_csv(__get_filename(NEW_PATH, feature), encoding='utf-8')
def __extract_multiple_features(feature1, feature2, filename):
    """
    Creates two files (one with unprocessed, and one with singular feature)
    containing questions that have the features

    Arguments:
        feature1: The first feature to look for
        feature2: The second feature to look for
        filename (str): File containing the features
    """
    new_index = old_index = 0
    up_name = "UP_" + filename.strip()
    path = const.FILEPATH_TRAINING_DATA + FILENAME_START
    unprocessed_df = load_training_data(path, False, exclude_site_tags=True)
    feature_df = DataFrame.from_csv(__get_filename(const.FILEPATH_FEATURE_DETECTOR, filename))
    new_up_dataframe = DataFrame(columns=unprocessed_df.columns.values)
    new_feat_dataframe = DataFrame(columns=unprocessed_df.columns.values)
    for question in feature_df[const.QUESTION_TEXT_KEY]:
        if feature1 in question:
            new_feat_dataframe.loc[new_index] = feature_df.loc[old_index].copy()
            new_up_dataframe.loc[new_index] = unprocessed_df.loc[old_index].copy()
            new_index += 1
        elif feature2 in question:
            new_feat_dataframe.loc[new_index] = feature_df.loc[old_index].copy()
            new_up_dataframe.loc[new_index] = unprocessed_df.loc[old_index].copy()
            new_index += 1
        old_index += 1
    new_up_dataframe.to_csv(__get_filename(NEW_PATH, up_name), encoding='utf-8')
    new_feat_dataframe.to_csv(__get_filename(NEW_PATH, filename), encoding='utf-8')
def predict(subject, data_path, model_path, submission_path):
    patient_filenames = [filename for filename in os.listdir(model_path)
                         if subject in filename and filename.endswith('.pickle')]
    for filename in patient_filenames:
        print filename
        d = load_test_data(data_path, subject)
        x, id = d['x'], d['id']

        with open(model_path + '/' + filename, 'rb') as f:
            state_dict = cPickle.load(f)

        scalers = state_dict['scalers']
        x, _ = scale_across_time(x, x_test=None, scalers=scalers) if state_dict['params']['scale_time'] \
            else scale_across_features(x, x_test=None, scalers=scalers)

        cnn = ConvNet(state_dict['params'])
        cnn.set_weights(state_dict['weights'])
        test_proba = cnn.get_test_proba(x)

        ans = zip(id, test_proba)
        df = DataFrame(data=ans, columns=['clip', 'preictal'])
        csv_name = '.'.join(filename.split('.')[:-1]) if '.' in filename else filename
        df.to_csv(submission_path + '/' + csv_name + '.csv', index=False, header=True)
def simple_commands():
    # Initial set of baby names and birth rates.
    names = ['Bob', 'Jessica', 'Mary', 'John', 'Mel']
    births = [968, 155, 77, 578, 973]

    # Merge name and birth lists into a single list of tuples (kind of a dict).
    BabyDataSet = list(zip(names, births))

    # Export the BabyDataSet into a DataFrame (similar to a SQL table).
    df = DataFrame(data=BabyDataSet, columns=['Name', 'Births'])

    # Export the data frame into a CSV file (with and without headers).
    file_nh = 'dat_births_1880_without_header.csv'
    file_wh = 'dat_births_1880_with_header.csv'
    df.to_csv(file_nh, index=False, header=False)
    df.to_csv(file_wh, index=True, header=True)

    # Read data from the CSV file without headers and assign new labels.
    df_nh = pd.read_csv(file_nh, header=None, names=['Anda', 'La_osa'])
    print('\nFrom file without labels:')
    print(df_nh)

    # Read data from the CSV file with headers.
    df_wh = pd.read_csv(file_wh, header=0, index_col=0)
    print('\nFrom file with labels:')
    print(df_wh)

    # Check columns data types.
    print('\nChecking the column data types:')
    print(df_wh.dtypes)

    # Find the name with the highest birthrate.
    print('\nFinding the name with the highest birthrate:')
    sort = df_wh.sort_values(['Births'], ascending=False)
    print(sort.head(1))  # <- Select the 1st row of the sorted data frame.
    print(sort[1:2])

    # Find the largest value within the 'Births' column of the original array.
    print('\nThe largest value within the column \'Births\' is:')
    print(df_wh['Births'].max())
def sanity_check():
    """Check that the interaction and bookkeeping are OK.

    Set the agent's epsilon to 0.99, which makes almost all actions be
    selected uniformly at random. The action value for each context should
    then follow the expected reward for that context.
    """
    print('Running a contextual bandit experiment')
    cb = ContextualBandit()
    ca = ContextualAgent(cb, epsilon=0.99)
    steps = 10000
    for _ in range(steps):
        ca.run()

    rewards = np.array(cb.actions)
    df = DataFrame(ca.log, columns=('context', 'action', 'reward', 'Q(c,a)'))
    fn = 'sanity_check.csv'
    df.to_csv(fn, index=False)
    print('Sequence written in', fn)
    print()
    for context, prob in cb.contexts.items():
        print(context, ': ')
        print('samp : ', ca.Q[context])
        print(' teo : ', prob * rewards - (1 - prob) * rewards)
        print()

    globals().update(locals())
def test_importItems(self):
    wrongFields = [{"a": "What is your gender?",
                    "b": 0.7,
                    "c": "radio",
                    "d": 0.3,
                    "e": "Male, Female, Other",
                    "f": 'vert'}]

    wrongOptions = [{"questionText": "What is your gender?",
                     "questionWidth": 0.7,
                     "type": "radio",
                     "responseWidth": 0.3,
                     "options": "Other",
                     "layout": 'vert',
                     "index": 0}]

    df = DataFrame(self.questions)
    df.to_excel(fileName_xlsx, index=False)
    df.to_csv(fileName_csv, index=False)

    # Check wrong field error
    with pytest.raises(NameError):
        self.survey = Form(self.win, items=wrongFields, size=(1.0, 0.3),
                           pos=(0.0, 0.0), autoLog=False)

    # Check options for list of dicts
    with pytest.raises(ValueError):
        self.survey = Form(self.win, items=wrongOptions, size=(1.0, 0.3),
                           pos=(0.0, 0.0), autoLog=False)

    # Check csv
    self.survey = Form(self.win, items=fileName_csv,
                       size=(1.0, 0.3), pos=(0.0, 0.0), autoLog=False)
    # Check Excel
    self.survey = Form(self.win, items=fileName_xlsx,
                       size=(1.0, 0.3), pos=(0.0, 0.0),
                       randomize=False, autoLog=False)
def predict(fname_specialists='net2.pickle'):
    with open(fname_specialists, 'rb') as f:
        net = pickle.load(f)

    X = load2d(test=True)[0]
    y_pred = net.predict(X)
    y_pred2 = y_pred * 48 + 48
    y_pred2 = y_pred2.clip(0, 96)
    df = DataFrame(y_pred2)

    lookup_table = read_csv(os.path.expanduser(FLOOKUP))
    values = []
    for index, row in lookup_table.iterrows():
        values.append((
            row.RowId,
            y_pred2[int(row.ImageId) - 1][int(row.RowId) % 30 - 1]
        ))

    submission = DataFrame(values, columns=('RowId', 'Location'))
    filename = 'submission1.csv'
    submission.to_csv(filename, index=False)
    print("Wrote {}".format(filename))
def getFeatures(filename):
    csvfile = pd.read_csv(filename)  # Reading .csv files containing tweets.
    tweet_ids = csvfile["id_str"]    # Copying the 'id_str' attribute values to an item.
    length = len(tweet_ids)          # Getting the length of 'tweet_ids'.
    df = DataFrame(d, index=[0])     # Creating a DataFrame

    twitter = Twython(APP_KEY, APP_SECRET, oauth_version=2)
    ACCESS_TOKEN = twitter.obtain_access_token()
    twitter = Twython(APP_KEY, access_token=ACCESS_TOKEN)  # Generating Access Token

    for i in range(0, length):
        status = twitter.show_status(id=tweet_ids[i])
        d["id"] = status["id_str"].encode("utf-8")
        d["created_at"] = status["created_at"].encode("utf-8")
        d["from_user"] = status["user"]["screen_name"].encode("utf-8")
        d["followers_count"] = status["user"]["followers_count"]
        d["friends_count"] = status["user"]["friends_count"]
        d["statuses_count"] = status["user"]["statuses_count"]
        d["verified"] = status["user"]["verified"]
        d["location"] = 0 if (len(status["user"]["location"].encode("utf-8")) == 0) else 1
        d["text"] = status["text"].encode("utf-8")
        d["retweet_count"] = status["retweet_count"]
        d["favorite_count"] = status["favorite_count"]
        d["hashtag_count"] = len(status["entities"]["hashtags"])
        d["url_count"] = len(status["entities"]["urls"])
        d["mentions_count"] = len(status["entities"]["user_mentions"])
        if len(status["entities"]["urls"]) > 0:
            for x in range(0, len(status["entities"]["urls"])):
                d["links"] += status["entities"]["urls"][x]["expanded_url"].encode("utf-8") + " "
        df = df.append(d, ignore_index=True)
        df.to_csv("NSamples.csv")  # Saving file to disk
        d["links"] = ""
    print "\nAll Done!"
def main():
    ## Set default trace out dir, and get wide kernel names
    trace_out_dir = path.join(dir_script, "../output/trace")
    wide_kernel_names = get_wide_kernel_names_trace(trace_out_dir)

    ## Get wide bench names
    wide_bench_names = get_wide_bench_names(wide_kernel_names)

    duration_frame = DataFrame(index=wide_bench_names)
    duration_root_dir = path.join(dir_script, "../log")
    duration_frame['base_model'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'base_model'))
    duration_frame['model'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'model'))
    duration_frame['model_compare'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'opt_break/trace_off'))
    duration_frame['base_trace'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'base_trace'))
    duration_frame['trace'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'trace'))
    duration_frame['profiler'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'profiler'))
    duration_frame['sim'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'sim'))
    duration_frame['opt_break_trace_off'] = parse_duration_out(wide_bench_names, path.join(duration_root_dir, 'opt_break/trace_off'))

    breakdown_frame_index_wide_bench_name(duration_frame)

    duration_out_file = path.join(dir_script, "../output/duration.csv")
    duration_frame.to_csv(duration_out_file)
def predict(test_set: DataFrame, model: LogisticRegression, reg, filename):
    test_df = test_set.filter(regex=reg)
    test_np = test_df.as_matrix()
    predictions = model.predict(test_np)
    result = DataFrame({'PassengerId': test_set['PassengerId'].as_matrix(),
                        'Survived': predictions.astype(np.int32)})
    result.to_csv(filename, index=False)
def test_to_csv_decimal(self):
    # GH 781
    df = DataFrame({'col1': [1], 'col2': ['a'], 'col3': [10.1]})

    expected_default = ',col1,col2,col3\n0,1,a,10.1\n'
    assert df.to_csv() == expected_default

    expected_european_excel = ';col1;col2;col3\n0;1;a;10,1\n'
    assert df.to_csv(decimal=',', sep=';') == expected_european_excel

    expected_float_format_default = ',col1,col2,col3\n0,1,a,10.10\n'
    assert df.to_csv(float_format='%.2f') == expected_float_format_default

    expected_float_format = ';col1;col2;col3\n0;1;a;10,10\n'
    assert df.to_csv(decimal=',', sep=';',
                     float_format='%.2f') == expected_float_format

    # GH 11553: testing if decimal is taken into account for '0.0'
    df = pd.DataFrame({'a': [0, 1.1], 'b': [2.2, 3.3], 'c': 1})
    expected = 'a,b,c\n0^0,2^2,1\n1^1,3^3,1\n'
    assert df.to_csv(index=False, decimal='^') == expected

    # same but for an index
    assert df.set_index('a').to_csv(decimal='^') == expected

    # same for a multi-index
    assert df.set_index(['a', 'b']).to_csv(decimal="^") == expected
def submission(fname_net="net4.pickle"):
    with open(fname_net, "rb") as f:
        net = pickle.load(f)
        # net = specialists

    X = load.load2d(test=True)[0]
    y_pred = net.predict(X)
    print "Finish predict test file"

    columns = "left_eye_center_x,left_eye_center_y,right_eye_center_x,right_eye_center_y,left_eye_inner_corner_x,left_eye_inner_corner_y,left_eye_outer_corner_x,left_eye_outer_corner_y,right_eye_inner_corner_x,right_eye_inner_corner_y,right_eye_outer_corner_x,right_eye_outer_corner_y,left_eyebrow_inner_end_x,left_eyebrow_inner_end_y,left_eyebrow_outer_end_x,left_eyebrow_outer_end_y,right_eyebrow_inner_end_x,right_eyebrow_inner_end_y,right_eyebrow_outer_end_x,right_eyebrow_outer_end_y,nose_tip_x,nose_tip_y,mouth_left_corner_x,mouth_left_corner_y,mouth_right_corner_x,mouth_right_corner_y,mouth_center_top_lip_x,mouth_center_top_lip_y,mouth_center_bottom_lip_x,mouth_center_bottom_lip_y"
    columns = columns.split(",")

    y_pred = y_pred * 48 + 48
    y_pred = y_pred.clip(0, 96)
    df = DataFrame(y_pred, columns=columns)

    lookup_table = read_csv(os.path.expanduser("./data/IdLookupTable.csv"))
    values = []
    for index, row in lookup_table.iterrows():
        values.append((row["RowId"], df.ix[row.ImageId - 1][row.FeatureName]))

    now_str = datetime.now().isoformat().replace(":", "-")
    submission = DataFrame(values, columns=("RowId", "Location"))
    filename = "submission-{}.csv".format(now_str)
    submission.to_csv(filename, index=False)
    print("Wrote {}".format(filename))
def main(train_file, test_file):
    # print "loading data.."
    csv.field_size_limit(1310720)
    trainreader = csv.reader(open('/home/kiran/kdd/train.csv'))
    projectid, traindata_old = zip(*trainreader)
    testreader = csv.reader(open('/home/kiran/kdd/test.csv'))
    projectid, testdata_old = zip(*testreader)

    # remove stopwords
    traindata = []
    testdata = []
    for observation in traindata_old:
        traindata.append(preprocess_pipeline(observation, "english", "PorterStemmer", True, True, False))
    for observation in testdata_old:
        testdata.append(preprocess_pipeline(observation, "english", "PorterStemmer", True, True, False))

    tfv = CountVectorizer(binary=1, ngram_range=(1, 1))
    X_all = traindata + testdata
    lentrain = len(traindata)
    tfv.fit(X_all)
    X_all = tfv.transform(X_all)
    X = X_all[:lentrain]
    X_test = X_all[lentrain:]

    scipy.io.mmwrite('x_train_bin_1gram.mtx', X, field='real')
    scipy.io.mmwrite('x_test_bin_1gram.mtx', X_test, field='real')

    myCols = tfv.get_feature_names()
    myCols = DataFrame(myCols)
    myCols.to_csv('bin_1gram.csv', index=False)
          callbacks=callbacks_list)

# make a prediction
yhat = model.predict(test_X)
yhat = yhat[:, 0].reshape(-1, 1)  # keep a column shape so the copies can be stacked on axis=1
allset = np.concatenate((yhat, yhat), axis=1)
allset = np.concatenate((allset, yhat), axis=1)
yhat = scaler.inverse_transform(allset)[:, [2]]

pred = DataFrame(yhat)
pred.columns = ["pred"]
real = GRN_MTR.iloc[(len(GRN_MTR) - len(pred)):, :]
real.columns = ["real"]
ambas = pd.concat([pred.set_index(real.index), real], axis=1, ignore_index=False)
ambas['diff'] = ambas.real - ambas.pred
ambas = DataFrame(ambas)
ambas.columns = ["pred", "real", "diff"]

# Save results
plt.figure()
plot = ambas.plot(figsize=(30, 5), title="Prediction vs real " + "test "
                  + args.yi + "/" + args.yf + " " + args.mi + "/" + args.mf)
fig = plot.get_figure()
fig.savefig("/figs/Prediction_vs_real_test_" + args.yi + "-" + args.yf
            + "_" + args.mi + "-" + args.mf + ".png")
ambas.to_csv("/data/Prediction_vs_real_test_" + args.yi +
             "_" + args.yf + "_" + args.mi + "_" + args.mf + ".csv", index=False)
def write_results_to_file(self):
    print(logstamp(), "WRITING RESULTS...")
    df = DataFrame(self.results)
    df.index.name = "row_id"
    df.index = df.index + 1
    df.to_csv(self.local_results_filepath)
def dump_results(
    dframe: pd.DataFrame,
    csvfile: Optional[str] = None,
    yamlfile: Optional[str] = None,
    resinsightfile: Optional[str] = None,
    ertfile: Optional[str] = None,
) -> None:
    """Dump dataframe with ERT observations to CSV and/or YML format to disk.

    Writes to stdout if filenames are "-". Skips export if filenames are empty
    or None.

    Args:
        dframe
        csvfile: Filename
        yamlfile: Filename
        resinsightfile: Filename
        ertfile: Filename
    """
    if not (csvfile or yamlfile or resinsightfile or ertfile):
        logger.warning("No output filenames provided")

    if csvfile:
        if csvfile != __MAGIC_STDOUT__:
            logger.info("Writing observations as CSV to %s", csvfile)
            dframe.to_csv(csvfile, index=False)
        else:
            # Ignore pipe errors when writing to stdout:
            signal.signal(signal.SIGPIPE, signal.SIG_DFL)
            dframe.to_csv(sys.stdout, index=False)

    if yamlfile:
        obs_dict_for_yaml = df2obsdict(dframe)
        if not obs_dict_for_yaml and not dframe.empty:
            logger.error("None of your observations are supported in YAML")
        yaml_str = yaml.safe_dump(obs_dict_for_yaml)
        if yamlfile != __MAGIC_STDOUT__:
            logger.info(
                "Writing observations in YAML (webviz) format to file: %s",
                yamlfile,
            )
            with open(yamlfile, "w") as f_handle:
                f_handle.write(yaml_str)
        else:
            print(yaml_str)

    if resinsightfile:
        ri_dframe = df2resinsight_df(dframe)
        if resinsightfile != __MAGIC_STDOUT__:
            logger.info(
                "Writing observations in ResInsight format to CSV-file: %s",
                resinsightfile,
            )
            ri_dframe.to_csv(resinsightfile, index=False, sep=";")
        else:
            # Ignore pipe errors when writing to stdout:
            signal.signal(signal.SIGPIPE, signal.SIG_DFL)
            ri_dframe.to_csv(sys.stdout, index=False, sep=";")

    if ertfile:
        ertobs_str = df2ertobs(dframe)
        if ertfile != __MAGIC_STDOUT__:
            with open(ertfile, "w") as f_handle:
                logger.info("Writing ERT observation format to %s", ertfile)
                f_handle.write(ertobs_str)
        else:
            print(ertobs_str)
        prev_price = current_price
        if continuous_increase_counter == 0:
            i += 1
        else:
            i += continuous_increase_counter
    return pivots_list


def convert_pivot_sequences_to_training_seq(directory, filename_with_seq, filename_with_ohlc):
    pivot_sequences = read_csv(directory + filename_with_seq, sep=';', encoding='utf-8', index_col=0)
    ohlc = read_csv(directory + filename_with_ohlc, sep=';', encoding='utf-8', index_col=0)


py.init_notebook_mode(connected=True)
# file_path = save_all_trades('LTCUSD')
# print_ohlc_from_csv('result/LTCUSD/', 'LTCUSD_2017_10_11_12_01_03.txt', 'LTCUSD', mode='markers')
result = find_pivot_sequences('result/LTCUSD/', 'LTCUSD_2017_bid.csv')
data = DataFrame(result, columns=['start_index', 'max_index', 'length', 'pivot_price', 'current_time'])
data.to_csv('result/LTCUSD/LTCUSD_2017_pivots.csv', sep=';', encoding='utf-8')

training_sequences = convert_pivot_sequences_to_training_seq('result/LTCUSD/', 'LTCUSD_2017_pivots.csv')
training = DataFrame(training_sequences, columns=['start', 'max', 'length', 'pivot_price', 'current_time'])
training.to_csv('result/LTCUSD/LTCUSD_2017_pivots.csv', sep=';', encoding='utf-8')

# result = find_longest_continious_sequence('result/LTCUSD/', 'LTCUSD_2017_10_11_12_01_03_bid.csv', 'LTCUSD')
print("Finish")
def save_disclosure_data(df: DataFrame):
    df.to_csv("../data/pfd_final.csv")
train_corpus = clean_text(df_train)

cv = CountVectorizer(max_features=1500)
X_train = cv.fit_transform(train_corpus).toarray()
Y_train = df_train.iloc[:, 2]

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, Y_train)

test_corpus = clean_text(df_test)
# reuse the vocabulary fitted on the training corpus; re-fitting on the test
# corpus would produce features that do not line up with the trained classifier
X_test = cv.transform(test_corpus).toarray()
Y_pred = classifier.predict(X_test)

from pandas import DataFrame
pred = DataFrame(Y_pred)
pred.to_csv(r'C:\Users\kajah\Desktop\result.csv')

#from sklearn.naive_bayes import GaussianNB
#classifier = GaussianNB()
#classifier.fit(X_train,Y_train)
#
#Y_pred = classifier.predict(X_test)

#from sklearn.svm import SVC
#classifier = SVC(kernel = 'linear', random_state = 0)
#classifier.fit(X_train, Y_train)
## Predicting the test set results
#Y_pred = classifier.predict(X_test)
    for i in friends:
        value = i[var]
        variable.append(value)
    return variable


# Call the helper for each variable and save the data to a CSV file (on the desktop)
NickName = get_var("NickName")
Sex = get_var('Sex')
Province = get_var('Province')
City = get_var('City')
Signature = get_var('Signature')

from pandas import DataFrame
data = {
    'NickName': NickName,
    'Sex': Sex,
    'Province': Province,
    'City': City,
    'Signature': Signature
}
frame = DataFrame(data)
frame.to_csv('data2.csv', index=True, encoding='utf8')

import re
siglist = []
for i in friends:
    signature = i["Signature"].strip().replace("span", "").replace(
        "class", "").replace("emoji", "")
    rep = re.compile("1f\d+\w*|[<>/=]")
    signature = rep.sub("", signature)
    siglist.append(signature)
text = "".join(siglist)
# God Player
God = [200, 60]
# God[0] = -150 and God[1] = 35

# Combining Datasets
SecondB = SecondB_PCA.join(SecondB_Info)
#print(FirstB['pc1'][0])

# create list of the best Players
Closiest_to_God = np.empty([len(SecondB), 1])
count = 0
while count < len(SecondB):
    Closiest_to_God[count][0] = sqrt(((SecondB['pc1'][count] - God[0])**2) +
                                     ((SecondB['pc2'][count] - God[1])**2))
    #Closiest_to_God[count][1] = FirstB_Names['Players'][count+1]
    #print(Closiest_to_God[count][0])
    # print(count)
    count += 1
    if count >= len(SecondB):
        break

print(len(Closiest_to_God))
print(len(SecondB_Info))

# Attaches the names of each player to their Euclidean distance to God
Closiest_to_God = np.hstack((Closiest_to_God, SecondB_Info[1:]))
Closiest_to_God = Closiest_to_God[Closiest_to_God[:, 0].argsort()]
print(Closiest_to_God)

## Export to CSV
dataset = DataFrame(Closiest_to_God)
df = dataset.to_csv(r'export/SecondB_List.csv', index=None, header=True)
print(dataset)
# coding: utf-8
import pandas as pd
import numpy as np
from pandas import DataFrame

lgb_sub = pd.read_csv("../buptloc/sub_lgb_12_0527_1140.csv")
lgb_sub = DataFrame(lgb_sub)

# flip any negative predictions to their absolute value
for j in lgb_sub.columns:
    for k in range(len(lgb_sub)):
        if lgb_sub[j][k] < 0:
            lgb_sub[j][k] = -lgb_sub[j][k]

lgb_sub.to_csv("../buptloc/sub_lgb_12_0527_1140.csv")
def _write_loop_block(self, df: pd.DataFrame):
    self.write_loopheader(df)
    df.to_csv(self.filename, mode='a', sep=self.sep, header=False,
              index=False, float_format=self.float_format, na_rep=self.na_rep)
def _1(data: pd.DataFrame) -> QualityFilterStatsFmt:
    ff = QualityFilterStatsFmt()
    data.to_csv(str(ff))
    return ff
            categoria = 4  # done
            if 'decretada a liquidacao extrajudicial' in pub:
                categoria = 5
            # if 'recuperacao judicial' in pub:
            # if 'processamento da recuperacao judicial' in pub and ('concedido o processamento' in pub or 'plano de recuperacao' in pub):
            if 'processamento da recuperacao judicial' in pub:
                categoria = 6
            if categoria != 0:
                id += 0
                dict[pub] = categoria
                # print(id.__str__() + "\n\nPUB =>" + i.__str__() + "\nCATEGORIA IDENTIFICADA => " + categoria.__str__())
                if (id == -300):
                    break
            if id == -300:
                break
        if id == -300:
            break
    if id == -300:
        break

print("Arquivos lidos {0}".format(arqs_lidos))
print("Pubs lidas {0}".format(pubs_lidas))

df = DataFrame(list(dict.items()), columns=['pub', 'categoria'])
print(df.categoria.value_counts())
export_csv = df.to_csv(r'export_dataframe_13112019.csv', index=None, header=True)
def write_xgb_predictions(predictions: pd.DataFrame, summary_file):
    predictions = flatten_predictions(predictions)
    output = predictions.to_csv(index=False, float_format="%f")
    with open(summary_file, "wt") as fobj:
        fobj.write(output)
df_4["Parch_C"] = df_4["Parch"].astype('category') df_4["Titel_C"] = df_4["Titel"].astype('category') df_4["Age_G"] = pd.cut( df_4["Age"], [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 120], labels=[5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 120]) df_4["Age_E"] = pd.cut(df_4["Age"], [0, 15, 20, 120], labels=[15, 20, 120]) df_4["Age_Er"] = pd.cut(df_4["Age"], [0, 15, 120], labels=[15, 120]) df_4.dtypes t, Xtest = dmatrices( 'survived ~ Parch + SibSp + C(Pclass) + C(Titel) +C(Alone)', df_2, return_type="dataframe") Xtest = Xtest.rename( columns={ 'C(Pclass)[T.2]': 'midClass', 'C(Pclass)[T.3]': 'lowClass', 'C(Sex)[T.male]': 'male', 'C(Age_Er)[T.120]': 'Erwachsen' }) yPredict = svm_Model.predict(Xtest) myResult = DataFrame() myResult["PassengerId"] = df_4["PassengerId"] myResult["Survived"] = yPredict myResult.dtypes myResult["Survived"] = myResult["Survived"].astype('int64') myResult = myResult.sort(['PassengerId']) myResult.to_csv("H:/DATA/Python/Titanic/myResult.csv", index=False)
def load_df(
    self,
    df: pandas.DataFrame,
    table: str,
    field_dict: Optional[Dict[Any, Any]] = None,
    delimiter: str = ',',
    encoding: str = 'utf8',
    pandas_kwargs: Any = None,
    **kwargs: Any,
) -> None:
    """
    Loads a pandas DataFrame into hive.

    Hive data types will be inferred if not passed but column names will
    not be sanitized.

    :param df: DataFrame to load into a Hive table
    :type df: pandas.DataFrame
    :param table: target Hive table, use dot notation to target a
        specific database
    :type table: str
    :param field_dict: mapping from column name to hive data type.
        Note that it must be OrderedDict so as to keep columns' order.
    :type field_dict: collections.OrderedDict
    :param delimiter: field delimiter in the file
    :type delimiter: str
    :param encoding: str encoding to use when writing DataFrame to file
    :type encoding: str
    :param pandas_kwargs: passed to DataFrame.to_csv
    :type pandas_kwargs: dict
    :param kwargs: passed to self.load_file
    """

    def _infer_field_types_from_df(df: pandas.DataFrame) -> Dict[Any, Any]:
        dtype_kind_hive_type = {
            'b': 'BOOLEAN',    # boolean
            'i': 'BIGINT',     # signed integer
            'u': 'BIGINT',     # unsigned integer
            'f': 'DOUBLE',     # floating-point
            'c': 'STRING',     # complex floating-point
            'M': 'TIMESTAMP',  # datetime
            'O': 'STRING',     # object
            'S': 'STRING',     # (byte-)string
            'U': 'STRING',     # Unicode
            'V': 'STRING',     # void
        }

        order_type = OrderedDict()
        for col, dtype in df.dtypes.iteritems():
            order_type[col] = dtype_kind_hive_type[dtype.kind]
        return order_type

    if pandas_kwargs is None:
        pandas_kwargs = {}

    with TemporaryDirectory(prefix='airflow_hiveop_') as tmp_dir:
        with NamedTemporaryFile(dir=tmp_dir, mode="w") as f:
            if field_dict is None:
                field_dict = _infer_field_types_from_df(df)

            df.to_csv(
                path_or_buf=f,
                sep=delimiter,
                header=False,
                index=False,
                encoding=encoding,
                date_format="%Y-%m-%d %H:%M:%S",
                **pandas_kwargs,
            )
            f.flush()

            return self.load_file(filepath=f.name,
                                  table=table,
                                  delimiter=delimiter,
                                  field_dict=field_dict,
                                  **kwargs)
from pandas import DataFrame
import sqlite3 as sql

with sql.connect("app.db") as con:
    con.row_factory = sql.Row

    # Query 1 - Whether or not a given username, password combination is valid: right username
    cur = con.cursor()
    cur.execute(
        "SELECT user.uid FROM user WHERE user.uid = 1 AND user.password = '******'"
    )
    rows = cur.fetchall()
    df = DataFrame(rows)
    df.to_csv(r'./app/csv/01test.csv', index=False)
    cur.close()

    # Query 2 - Whether or not a given username, password combination is valid: wrong username
    cur = con.cursor()
    cur.execute(
        "SELECT user.uid FROM user WHERE user.uid = 1 AND user.password = '******'"
    )
    rows = cur.fetchall()
    df = DataFrame(rows)
    df.to_csv(r'./app/csv/02test.csv', index=False)
    cur.close()

    # Query 3 - Airlines whose name or abbreviation contain some substring
    cur = con.cursor()
    cur.execute(
        "SELECT * FROM airline WHERE airline.name LIKE '%Delta%' OR airline.iata LIKE '%Delta'"
    )
from pandas import DataFrame
from pyhive import hive
import boto3
import json

s3 = boto3.client('s3')
s3.download_file('aws-logs-723293022411-us-east-1', 'bootstrap-test/hiveConf.json', 'hiveConf.json')

f = open("hiveConf.json", "r")
jsonFile = json.load(f)

cursor = hive.connect(jsonFile["hiveConf"]["host"]).cursor()
cursor.execute(jsonFile["hiveConf"]["execute1"])
cursor.execute(jsonFile["hiveConf"]["execute2"])
cursor.execute(jsonFile["hiveConf"]["execute3"])

df = DataFrame(cursor.fetchall())
df.to_csv(jsonFile["hiveConf"]["saveTo"])
clf = XGBClassifier(learning_rate=0.1, n_estimators=20, max_depth=4,
                    objective='binary:logistic')
folds = KFold(10, True, 1)

# getshapvalue(instances, y, clf, vdim)
auc_score, accuracy, sensitivity, specificity, MCC = getCrossValidation(
    instances, y, vdim, clf, folds)

print('results for feature:' + featurename)
print(
    '****AUC score:%.3f, accuracy:%.3f, sensitivity:%.3f, specificity:%.3f, MCC:%.3f****'
    % (auc_score, accuracy, sensitivity, specificity, MCC))

toc = time.clock()
print('The prediction time: %.3f minutes' % ((toc - tic) / 60.0))
print(
    '###############################################################################\n'
)

# output result
results = DataFrame({'Feature': [featurename],
                     'AUC': [auc_score],
                     'ACC': [accuracy],
                     'SN': [sensitivity],
                     'SP': [specificity],
                     'MCC': [MCC]})
results = results[['Feature', 'AUC', 'ACC', 'SN', 'SP', 'MCC']]
results.to_csv(featurename + 'Results1.csv', index=False)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

x = np.array(df2['TipoMaturità'])
y = np.array(df2['VotoDiploma'])
z = np.array(df2['CFU1'])

ax.set_xlabel('TipoMaturità')
ax.set_ylabel('VotoDiploma')
ax.set_zlabel('CFU1')

ax.scatter(x, y, z, marker="s", c=kmeans.labels_.astype(float), s=50, cmap="RdBu", alpha=0.5)
plt.show()

"""
df_final.to_csv("./ListaStudentCluster.csv", sep=',', index=False,)

ClusterFile = [tuple(row) for row in csv.reader(open("./ListaStudentCluster.csv", 'r'))]
# ADJUST THE PATHS SO THAT THIS FILE CAN BE REACHED

for i in range(1, len(ClusterFile)):
    if int(ClusterFile[i][4]) == 0:
        count0 = count0 + 1
        if int(ClusterFile[i][3]) > 0:
            countFc0 = countFc0 + 1
    if int(ClusterFile[i][4]) == 1:
        count1 = count1 + 1
        if int(ClusterFile[i][3]) > 0:
            countFc1 = countFc1 + 1
"""
def upload_predictions(self, file_path: str = "predictions.csv",
                       tournament: int = 8,
                       model_id: str = None,
                       df: pd.DataFrame = None) -> str:
    """Upload predictions from file.
    Will read TRIGGER_ID from the environment if this model is enabled with a
    Numerai Compute cluster setup by Numerai CLI.

    Args:
        file_path (str): CSV file with predictions that will get uploaded
        tournament (int): ID of the tournament (optional, defaults to 8)
            -- DEPRECATED there is only one tournament nowadays
        model_id (str): Target model UUID (required for accounts with
            multiple models)
        df (pandas.DataFrame): pandas DataFrame to upload, if function is
            given df and file_path, df will be uploaded.

    Returns:
        str: submission_id

    Example:
        >>> api = NumerAPI(secret_key="..", public_id="..")
        >>> model_id = api.get_models()['uuazed']
        >>> api.upload_predictions("prediction.cvs", model_id=model_id)
        '93c46857-fed9-4594-981e-82db2b358daf'
        >>> # upload from pandas DataFrame directly:
        >>> api.upload_predictions(df=predictions_df, model_id=model_id)
    """
    self.logger.info("uploading predictions...")

    # write the pandas DataFrame as a binary buffer if provided
    buffer_csv = None
    if df is not None:
        buffer_csv = BytesIO(df.to_csv(index=False).encode())
        buffer_csv.name = file_path

    auth_query = '''
        query($filename: String!
              $tournament: Int!
              $modelId: String) {
            submission_upload_auth(filename: $filename
                                   tournament: $tournament
                                   modelId: $modelId) {
                filename
                url
            }
        }
        '''
    arguments = {'filename': os.path.basename(file_path),
                 'tournament': tournament,
                 'modelId': model_id}
    submission_resp = self.raw_query(auth_query, arguments,
                                     authorization=True)
    submission_auth = submission_resp['data']['submission_upload_auth']

    # get compute id if available and pass it along
    headers = {"x_compute_id": os.getenv("NUMERAI_COMPUTE_ID")}
    with open(file_path, 'rb') if df is None else buffer_csv as fh:
        requests.put(submission_auth['url'], data=fh.read(), headers=headers)

    create_query = '''
        mutation($filename: String!
                 $tournament: Int!
                 $modelId: String
                 $triggerId: String) {
            create_submission(filename: $filename
                              tournament: $tournament
                              modelId: $modelId
                              triggerId: $triggerId) {
                id
            }
        }
        '''
    arguments = {'filename': submission_auth['filename'],
                 'tournament': tournament,
                 'modelId': model_id,
                 'triggerId': os.getenv('TRIGGER_ID', None)}
    create = self.raw_query(create_query, arguments, authorization=True)
    submission_id = create['data']['create_submission']['id']
    return submission_id
def store_csv(df: pd.DataFrame, fpath: str):
    """ store as csv """
    # save as csv
    fpath = fpath + ".csv"
    df.to_csv(fpath, index=False)
def test_to_csv_quoting(self):
    df = DataFrame({
        'c_bool': [True, False],
        'c_float': [1.0, 3.2],
        'c_int': [42, np.nan],
        'c_string': ['a', 'b,c'],
    })

    expected = """\
,c_bool,c_float,c_int,c_string
0,True,1.0,42.0,a
1,False,3.2,,"b,c"
"""
    result = df.to_csv()
    assert result == expected

    result = df.to_csv(quoting=None)
    assert result == expected

    result = df.to_csv(quoting=csv.QUOTE_MINIMAL)
    assert result == expected

    expected = """\
"","c_bool","c_float","c_int","c_string"
"0","True","1.0","42.0","a"
"1","False","3.2","","b,c"
"""
    result = df.to_csv(quoting=csv.QUOTE_ALL)
    assert result == expected

    # see gh-12922, gh-13259: make sure changes to
    # the formatters do not break this behaviour
    expected = """\
"","c_bool","c_float","c_int","c_string"
0,True,1.0,42.0,"a"
1,False,3.2,"","b,c"
"""
    result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC)
    assert result == expected

    msg = "need to escape, but no escapechar set"
    tm.assert_raises_regex(csv.Error, msg, df.to_csv,
                           quoting=csv.QUOTE_NONE)
    tm.assert_raises_regex(csv.Error, msg, df.to_csv,
                           quoting=csv.QUOTE_NONE,
                           escapechar=None)

    expected = """\
,c_bool,c_float,c_int,c_string
0,True,1.0,42.0,a
1,False,3.2,,b!,c
"""
    result = df.to_csv(quoting=csv.QUOTE_NONE,
                       escapechar='!')
    assert result == expected

    expected = """\
,c_bool,c_ffloat,c_int,c_string
0,True,1.0,42.0,a
1,False,3.2,,bf,c
"""
    result = df.to_csv(quoting=csv.QUOTE_NONE,
                       escapechar='f')
    assert result == expected

    # see gh-3503: quoting Windows line terminators
    # presents with encoding?
    text = 'a,b,c\n1,"test \r\n",3\n'
    df = pd.read_csv(StringIO(text))
    buf = StringIO()
    df.to_csv(buf, encoding='utf-8', index=False)
    assert buf.getvalue() == text

    # xref gh-7791: make sure the quoting parameter is passed through
    # with multi-indexes
    df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
    df = df.set_index(['a', 'b'])
    expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n'
    assert df.to_csv(quoting=csv.QUOTE_ALL) == expected
def df_to_tsv(df: pd.DataFrame, outfile: str, sep="\t", index=False) -> None:
    df.to_csv(outfile, sep=sep, index=index)
def load(self, df: pd.DataFrame, output_path: str) -> None:
    # Export data
    df.to_csv(output_path, index=False)
def homeSourceFilter(communityName, communityLink):
    """
    // lists holding the basic listing info for this community
    """
    addresses = []
    floods = []
    followInfoes = []
    """
    // fetch the first page
    """
    res = requests_get(communityLink, timeout=(20, 60))
    soup = BeautifulSoup(res.text, 'lxml')
    """
    // get the pagination info
    """
    if len(soup.select('div[class="page-box house-lst-page-box"]')) == 0:
        print("{0}:未找到分页信息".format(communityName))
        raise Exception("未找到分页信息")
    else:
        page_data = soup.select(
            'div[class="page-box house-lst-page-box"]')[0]['page-data']
        page_data = eval(page_data)
        totalPage = page_data["totalPage"]
    """
    // iterate over the pages
    // also covers the single-page case
    """
    for pageIndex in range(1, totalPage + 1):
        if (pageIndex > 1):
            insertIndex = communityLink.rfind('/', 0, -2)
            pageLink = communityLink[:insertIndex + 1] + 'pg' + str(
                pageIndex) + communityLink[insertIndex + 1:]
            res = requests_get(pageLink, timeout=(20, 60))
            soup = BeautifulSoup(res.text, 'lxml')
        """
        // locate each listing via its li tag
        // every listing has three class attributes holding the needed info: address, flood, followInfo
        """
        houseResourceList = soup.select('li[class="clear"]')
        for houseResource in houseResourceList:
            """
            // address
            """
            address = houseResource.find_all('div', class_="address")
            """
            // flood
            """
            flood = houseResource.find_all('div', class_="flood")
            """
            // followInfo
            """
            followInfo = houseResource.find_all('div', class_="followInfo")

            addresses.append("".join(list(address[0].stripped_strings)))
            floods.append("".join(list(flood[0].stripped_strings)))
            followInfoes.append("".join(list(followInfo[0].stripped_strings)))

    allHomeSourceInfo = DataFrame({
        "address": addresses,
        "flood": floods,
        "followInfo": followInfoes
    })
    allHomeSourceInfo.to_csv(os_getcwd() + "\\" + "AllCommunity\\" + communityName + ".csv",
                             encoding='gbk')
        os.makedirs(newpath.rsplit('/', 1)[0])  # '/'.join(newpath.rsplit('/',2)[0]) + '/'
    shutil.copy(fullpath, newpath)


def copy_bag_file(fullpath):
    FLY_ID, FMF_TIME, GROUP = parse_fmftime(fullpath.rsplit('/', 1)[0])
    BAG_FILE = match_fmf_and_bag(FMF_TIME)
    shutil.copy(BAG_FILE, (POOL_DIR + 'BAGS/'))


baglist = []
for bag in glob.glob(SEARCH_DIR + 'BAGS/*.bag'):
    bagtimestamp = parse_bagtime(bag)
    baglist.append((bag, bagtimestamp))
bagframe = DataFrame(baglist, columns=['Filepath', 'Timestamp'])
bagframe.index = pd.to_datetime(bagframe['Timestamp'])
bagframe = bagframe.sort()

for matching_dir in glob.glob(SEARCH_DIR + SEARCH_TERM):
    filelist.append(matching_dir)
    for fn in find_files(matching_dir, 'frame_by_frame_synced.pickle'):
        copy_with_dirs(fn)
        copy_with_dirs(fn.rsplit('/', 1)[0] + '/tracking_info.pickle')
        copy_with_dirs(fn.rsplit('/', 1)[0] + '/wingdata.pickle')
        copy_bag_file(fn)

fileDF = DataFrame(filelist)
fileDF.to_csv(POOL_DIR + 'filelist.txt', sep='\n', header=None, index=None)
def _4(data: pd.DataFrame) -> (SILVATaxidMapFormat):
    ff = SILVATaxidMapFormat()
    with ff.open() as fh:
        data.to_csv(fh, sep='\t', header=True)
    return ff
data = DataFrame(columns=columns)

for i in Copa_America:
    url = Copa_America[i][0]
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    table_body = soup.find('tbody')

    for j, row in enumerate(table_body.findAll('tr')):
        td = row.findAll('td')
        picture = td[0].find('img').get('data-src')
        pid = td[0].find('img').get('id')
        nationality = td[1].find('a').get('title')
        flag_img = td[1].find('img').get('data-src')
        name = td[1].findAll('a')[1].text
        age = int(td[2].text.strip())
        overall = int(td[3].text.strip())
        potential = int(td[4].text.strip())
        posicion = td[5].find("span").text.strip()
        contrato = td[5].find("div").text[-11:]
        value = td[7].text.strip()
        wage = td[7].text.strip()
        special = int(td[10].text.strip())

        player_data = DataFrame([[pid, name, age, picture, nationality, flag_img,
                                  overall, potential, posicion, contrato, value,
                                  wage, special]])
        player_data.columns = columns
        data = data.append(player_data, ignore_index=True)

data = data.drop_duplicates()
data.to_csv('Copa_america.csv', encoding='utf-8')
def _1(data: pd.DataFrame) -> (SILVATaxonomyFormat):
    ff = SILVATaxonomyFormat()
    with ff.open() as fh:
        data.to_csv(fh, sep='\t', header=False)
    return ff
csv_name = 'querycsv'
startNo = ''
endNo = ''
singleY = ''

for op, value in opts:
    if op == '-s':
        startNo = value
    elif op == '-e':
        endNo = value
    elif op == '-y':
        singleY = value
    elif op == '-h':
        historical_data.usage()
        sys.exit()

if debug: print(startNo)
if debug: print(endNo)
if debug: print(singleY)

rs = []
if startNo != '' and endNo != '':
    rs = historical_data.get_data_indentifier_range(startNo, endNo)
    csv_name = startNo + '-' + endNo + '-data'
elif singleY != '':
    rs = historical_data.get_one_year_data(singleY)
    csv_name = singleY + '-data'
else:
    rs = historical_data.get_all_data()
    csv_name = 'ALL-data'

df = DataFrame(rs)
df.to_csv('./result/' + csv_name + '.csv', index=False)
print(df)