def main(): parser = ArgumentParser( 'analyse_data.py', description="Analyze Jenkins build logs" ) parser.add_argument( '--since', type=dateutil.parser.parse, help="Only consider builds since this date" ) opts = parser.parse_args() builds = load_build_data(since=opts.since) pandas.set_option('expand_frame_repr', False) print("Showing data since: ", opts.since) print("") print_summary_results(builds) print("") print("") build_data = make_subbuild_data_frame(builds) print_top_failing_jobs(build_data) print("") print("") classified_failure_data = get_classified_failures(build_data) print_common_failure_reasons(classified_failure_data) print("") print("") print_common_failure_daily(classified_failure_data) print("") print("") print_commonly_failing_tests(build_data)
def main(argv): pd.set_option('display.width', 200) pd.set_option('display.height', 500) warnings.filterwarnings("ignore") global file_path, RMSLE_scorer # RMSLE_scorer RMSLE_scorer = metrics.make_scorer(RMSLE, greater_is_better = False) if(platform.system() == "Windows"): file_path = 'C:/Python/Others/data/Kaggle/Caterpillar_Tube_Pricing/' else: file_path = '/home/roshan/Desktop/DS/Others/data/Kaggle/Caterpillar_Tube_Pricing/' ######################################################################################################################## #Read the input file , munging and splitting the data to train and test ######################################################################################################################## Train_DS = pd.read_csv(file_path+'competition_data/train_set.csv',sep=',') Actual_DS = pd.read_csv(file_path+'competition_data/test_set.csv',sep=',') Tube_DS = pd.read_csv(file_path+'competition_data/tube.csv',sep=',') Bill_DS = pd.read_csv(file_path+'competition_data/bill_of_materials.csv',sep=',') Spec_DS = pd.read_csv(file_path+'competition_data/specs.csv',sep=',') Tube_End_DS = pd.read_csv(file_path+'competition_data/tube_end_form.csv',sep=',') Comp_DS = pd.read_csv(file_path+'competition_data/components_2.csv',sep=',') Sample_DS = pd.read_csv(file_path+'sample_submission.csv',sep=',') Train_DS, Actual_DS, y = Data_Munging(Train_DS,Actual_DS,Tube_DS,Bill_DS,Spec_DS,Tube_End_DS, Comp_DS) pred_Actual = RFR_Regressor(Train_DS, y, Actual_DS, Sample_DS, grid=False)
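# Note (hedged): this snippet and several others below call
# pd.set_option('display.height', ...). 'display.height' only exists in old
# pandas releases (it was deprecated around 0.15 and later removed), so on a
# current install the call raises an OptionError. A minimal sketch of a
# version-tolerant replacement, assuming 'display.max_rows' is the intended effect:
import pandas as pd

try:
    pd.set_option('display.height', 500)    # accepted only by very old pandas
except Exception:                           # OptionError (a KeyError subclass) on modern pandas
    pd.set_option('display.max_rows', 500)  # closest current equivalent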
def manlab_p(df_pkl, rx, col):
    """
    Takes in a pkl file, a regex query, and a column. Reads in the pkl file as a pandas df.
    Subsets the df with the regex query and manually assigns labels to df[col] based on the post.
    Resaves the pkl file to update labels.
    Inputs:
        - pkl file
        - regex query
        - column
    Outputs:
        - pandas df
    """
    pd.set_option('max_colwidth', 200)
    df = pd.read_pickle(df_pkl)
    df_res = df[df['post_type'] == "responder"]
    df_lab = df_res[df_res['post'].str.contains(rx, regex=True)]
    proceed = input("Proceed: ")
    n = 1
    while proceed == 'y':
        if n % 10 == 0:
            proceed = input("Proceed: ")
        ind = int(np.random.choice(df_lab.index, size=1))
        label = input("Personal(0) {} / {}: ".format(n, df['post'].iloc[ind]))
        if label == 'x':
            break
        label = int(label)
        df.loc[ind, col] = label  # .loc avoids chained assignment, which may not write back to df
        n += 1
        # safety saving
        df.to_pickle('df_man.pkl')
    return df
def main(directory, filename, max_rows, plots, stacked, style_line, style_dots):
    directory = os.path.expanduser(directory)
    filename = os.path.join(directory, filename)
    print("Reading %r" % filename)
    pd.set_option('display.max_rows', max_rows)
    df = pd.read_csv(filename)
    df[COL_T] = pd.to_datetime(df[COL_T])
    df = df.set_index(COL_T)
    print(df)
    if plots == '':
        plots = df.columns
    else:
        plots = plots.split(',')
    if stacked:
        fig, axs = plt.subplots(nrows=len(plots))
        for i, plot in enumerate(plots):
            ax = axs[i]
            ax.plot(df.index, df[plot], style_line)
            ax.plot(df.index, df[plot], style_dots)
            ax.set_xlabel(COL_T)  # , fontdict=font)
            ax.set_ylabel(plot)
            # ax.set_title(plot)
    else:
        ax = df[plots].plot(style=style_line)  # `style` was undefined here; the line style is the closest match
    plt.show()
def print_table(table, name=None, fmt=None): """Pretty print a pandas DataFrame. Uses HTML output if running inside Jupyter Notebook, otherwise formatted text output. Parameters ---------- table : pandas.Series or pandas.DataFrame Table to pretty-print. name : str, optional Table name to display in upper left corner. fmt : str, optional Formatter to use for displaying table elements. E.g. '{0:.2f}%' for displaying 100 as '100.00%'. Restores original setting after displaying. """ if isinstance(table, pd.Series): table = pd.DataFrame(table) if fmt is not None: prev_option = pd.get_option('display.float_format') pd.set_option('display.float_format', lambda x: fmt.format(x)) if name is not None: table.columns.name = name display(table) if fmt is not None: pd.set_option('display.float_format', prev_option)
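# A usage sketch for print_table above; the returns DataFrame is fabricated for illustration:
import pandas as pd

returns = pd.DataFrame({'strategy': [1.23, 45.67], 'benchmark': [0.98, 32.10]},
                       index=['daily', 'annual'])
# Shows every float as a percentage for this one table only; print_table restores
# the previous 'display.float_format' before returning.
print_table(returns, name='Returns', fmt='{0:.2f}%')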
def print_full(x):
    '''
    Helper function to print the *full* dataframe.
    '''
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')
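# A small usage sketch for print_full (the DataFrame is fabricated):
import pandas as pd

df = pd.DataFrame({'value': range(300)})
print_full(df)  # temporarily raises display.max_rows to len(df) so no rows are elided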
def buildModel(df): train_y = df['arr_del15'][:train_len] train_x = df[cols][:train_len] # transform categorical features train_x['unique_carrier'] = pd.factorize(train_x['unique_carrier'])[0] train_x['dep_conditions'] = pd.factorize(train_x['dep_conditions'])[0] train_x['arr_conditions'] = pd.factorize(train_x['arr_conditions'])[0] pd.set_option('display.max_rows', 500) print(train_x) # train_x['origin'] = pd.factorize(train_x['origin'])[0] # train_x['dest'] = pd.factorize(train_x['dest'])[0] # print(train_x) train_x = enc.fit_transform(train_x) print(train_x.shape) # Create Random Forest classifier with 50 trees clf_rf = RandomForestClassifier(n_estimators=50, n_jobs=-1) clf_rf.fit(train_x.toarray(), train_y) del train_x, train_y print("Model built") return clf_rf
def scrape(): successful_scrapes = 0 # keep track of scrapes that went through for i in range(len(advertisers)): # per advertiser raw_advertiser = advertisers[i].encode( "ascii", "xmlcharrefreplace").lower() print "advertiser raw:", raw_advertiser advertiser = raw_advertiser if "and" in advertiser: advertiser = advertiser.split( "and")[0].strip(" ") # separate multiple # advertisers and search for only the first listed if ".com" in advertiser: advertiser = advertiser.replace( ".com", "") # could prevent search result # get rid of spaces for file name stripped_advertiser = advertiser.replace(" ", "_") advertiser = urllib2.quote(advertiser) # url safe version print "advertiser (url safe):", advertiser print "Working on advertiser {} out of {} ({})".format( i + 1, len(advertisers), raw_advertiser) try: short_url = rstyle_link(advertiser) # get shortlink short_resp = requests.get(short_url) short_data = short_resp.content.encode("utf-8") # generate unicode except Exception as error: print "Error while scraping. No results found for advertiser" continue # nothing else to do without the result try: follow_on_url = re.search('<!-- (.*) -->', short_data).group(1) print "follow on url:", follow_on_url follow_on_resp = requests.get(follow_on_url) print "final follow on url:", follow_on_resp.url follow_on_data = str(follow_on_resp.content) # final html text_cleaner(stripped_advertiser, follow_on_data.decode("utf-8")) print "advertiser {} scraped".format(i + 1) successful_scrapes += 1 # great work - scrape successful except Exception as error: print error print "Done with collecting the job postings!" print "There were", successful_scrapes, "scrapes performed successfully." intermediate_total_skills = fashion_dict overall_total_skills = {} for key in intermediate_total_skills: if intermediate_total_skills[key] > 0: overall_total_skills[key] = intermediate_total_skills[key] final_frame = pd.DataFrame(overall_total_skills.items(), columns=[ "Term", "NumPostings"]) pd.set_option('display.height', 500000) # this is just an arbitrary max pd.set_option('display.max_rows', 500000) final_frame.NumPostings = ( (final_frame.NumPostings)*100)/successful_scrapes final_frame.sort_values(by="NumPostings", ascending=False, inplace=True) import time with open('sites/word_freq_{}.txt'.format(int(time.time())), 'w') as f: f.write(str(final_frame)) # why not write out the analytics to file? return final_frame # End of the function
def main(argv): pd.set_option("display.width", 200) pd.set_option("display.height", 500) warnings.filterwarnings("ignore") global file_path, gini_scorer # Normalized Gini Scorer gini_scorer = metrics.make_scorer(normalized_gini, greater_is_better=True) if platform.system() == "Windows": file_path = "C:/Python/Others/data/Kaggle/Liberty_Mutual_Group/" else: file_path = "/home/roshan/Desktop/DS/Others/data/Kaggle/Liberty_Mutual_Group/" ######################################################################################################################## # Read the input file , munging and splitting the data to train and test ######################################################################################################################## Train_DS = pd.read_csv(file_path + "train.csv", sep=",", index_col=0) Actual_DS = pd.read_csv(file_path + "test.csv", sep=",", index_col=0) Sample_DS = pd.read_csv(file_path + "sample_submission.csv", sep=",") Parms_XGB_DS = pd.read_csv(file_path + "Parms_DS_XGB_1001.csv", sep=",") Parms_RF_DS = pd.read_csv(file_path + "Parms_DS_RF2.csv", sep=",") Train_DS, Actual_DS, y = Data_Munging(Train_DS, Actual_DS) pred_Actual = RFR_Regressor(Train_DS, y, Actual_DS, Sample_DS, Parms_RF_DS, Grid=False, Ensemble=False)
def summarize_results_basic_size(sim_num, file_name): pd.set_option('display.width', 99999) pd.set_option('display.max_rows', 400) data = pd.read_csv(file_name, sep = '\t') data_summarize = pd.DataFrame() rest_wait_avgs = [] guest_wait_avgs = [] for i in range(sim_num): rest_case_avg = data.iloc[:,5+i*8].sum()/len(data.index) rest_wait_avgs.append(rest_case_avg) guest_case_avg = data.iloc[:,7+i*8].sum()/len(data.index) guest_wait_avgs.append(guest_case_avg) data_summarize['rest_wait_avgs'] = rest_wait_avgs data_summarize['guest_wait_avgs'] = guest_wait_avgs summarize_file_name = "sim_summarize.txt" data_summarize.to_csv(summarize_file_name, sep='\t', header=True, index=False) print rest_wait_avgs print guest_wait_avgs
def main(argv): pd.set_option('display.width', 200) pd.set_option('display.height', 500) warnings.filterwarnings("ignore") global file_path, Train_DS1, Featimp_DS #random.seed(1) if(platform.system() == "Windows"): file_path = 'C:/Python/Others/data/Kaggle/Springleaf_Marketing_Response/' else: file_path = '/home/roshan/Desktop/DS/Others/data/Kaggle/Springleaf_Marketing_Response/' ######################################################################################################################## #Read the input file , munging and splitting the data to train and test ######################################################################################################################## #Train_DS = pd.read_csv(file_path+'train.csv',sep=',') #Actual_DS = pd.read_csv(file_path+'test.csv',sep=',') Train_DS = pd.read_csv(file_path+'train_25000.csv',sep=',', index_col=0,nrows = 5000 ).reset_index(drop=True) Actual_DS = pd.read_csv(file_path+'test_25000.csv',sep=',', index_col=0,nrows = 5000).reset_index(drop=True) Sample_DS = pd.read_csv(file_path+'sample_submission.csv',sep=',') Filter_DS = pd.read_csv(file_path+'Min_Max_DS_Analysis2.csv',sep=',') Featimp_DS = pd.read_csv(file_path+'feature_imp.csv',sep=',') Train_DS, Actual_DS, y = Data_Munging(Train_DS,Actual_DS, Filter_DS) #pred_Actual = XGB_Classifier(Train_DS, y, Actual_DS, Sample_DS, Grid=False) pred_Actual = RFC_Classifier(Train_DS, y, Actual_DS, Sample_DS, Grid=False)
def parsefrcschedule(): with open('currentsched.html') as file: htmldata = file.read() #print(htmldata) pandas.set_option('display.width', 1000) eventlist = get_dates(htmldata) print('Data found for', len(eventlist), 'Regional events\n') #pprint(eventlist) regionalLocs = formLocationList(eventlist) print('\nPreparing mileage search for', len(regionalLocs), 'Regional events\n') maprequest = prepmaprequest(home, regionalLocs) dmatrix = getdistancematrix(maprequest) #pprint(dmatrix) print('Merging distance and event information\n') eventlist = mergeEventMilage(eventlist, dmatrix) final = evaluatedates(eventlist) missingevents(eventlist)
def blackbox_method_int(self, output, func_name): """ Helper method to reuse code for testing numpy array outputs from SIP model :param output: String; Pandas Series name (e.g. column name) without '_out' :return: """ try: # display model output in scientific notation pd.set_option('display.float_format','{:.4E}'.format) logging.info('### blackbox out_' + output) logging.info(iec_calc.pd_obj_out) result = iec_calc.pd_obj_out["out_" + output] expected = iec_calc.pd_obj_exp["exp_" + output] tab = pd.concat([result, expected], axis=1) #print(" ") #print(tabulate(tab, headers='keys', tablefmt='fancy_grid')) #npt.assert_array_almost_equal(result, expected, 4, '', True) rtol = 1e-5 npt.assert_allclose(result,expected,rtol,0,'',True) finally: tab = pd.concat([result, expected], axis=1) print("\n") print(func_name) print(tabulate(tab, headers='keys', tablefmt='rst')) return
def response_to_dataframe(resp, reserved, **frame_params): expand_tags = frame_params.pop('expand_tags', True) enc_resp = [] fields = ['tags', 'key'] for el in resp: for field in fields: dictionary = el.pop(field, None) if dictionary is not None: for k, v in dictionary.items(): if (expand_tags and (k in reserved)) or not expand_tags: k = '{}.{}'.format(field, k) el[k] = v if 'date' in el: # Message or Property el['date'] = to_date(el['date']) else: # Entity el['createdDate'] = to_date(el['createdDate']) if 'lastInsertDate' in el: el['lastInsertDate'] = to_date(el['lastInsertDate']) enc_resp.append(el) import pandas as pd pd.set_option("display.expand_frame_repr", False) pd.set_option('max_colwidth', -1) return pd.DataFrame(enc_resp, **frame_params)
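# Note (hedged) on the two option calls above: since pandas 1.0, passing -1 to
# 'max_colwidth' is deprecated and None is the documented way to disable
# truncation; the bare 'max_colwidth' key also only resolves because set_option
# matches option names by regex. A more explicit spelling would be roughly:
import pandas as pd

pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', None)  # None (not -1) disables column-width truncation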
def pandas_repr(mat, col_names=None, row_names=None, margin=2, **pd_options): import pandas as pd for k, v in pd_options.items(): k = k.replace('__', '.') pd.set_option(k, v) df = pd.DataFrame(data=mat, index=row_names, columns=col_names) return repr(df).replace('\n', '\n' + ' ' * margin)
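# Usage sketch for pandas_repr: pandas option names are passed as keyword
# arguments with '__' standing in for '.', e.g. display__precision becomes
# 'display.precision'. Note the options are set globally and not restored.
# The matrix below is fabricated:
print(pandas_repr([[1.23456, 2.0], [3.0, 4.0]],
                  col_names=['a', 'b'],
                  row_names=['x', 'y'],
                  display__precision=3))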
def transform(db_table): df = db_table.fillna(0) df.loc[df.country == "Argentina", "country"] = "Argentina + Uruguay" df.loc[df.country == "Uruguay", "country"] = "Argentina + Uruguay" df.loc[df.country == "Malaysia", "country"] = "Malaysia + Singapore" df.loc[df.country == "Singapore", "country"] = "Malaysia + Singapore" combined_countries = df.groupby(["country","report_date"],as_index=False, sort=False).sum() combined_total = df.groupby(["report_date"],as_index=False, sort=False).sum() combined_total["country"] = "Total" daily_report = combined_total.append(combined_countries,True) daily_report["avg_pax/trip"] = daily_report["pax_transported"]/daily_report["trip_realized"] daily_report["Realize_Trip_Yield"] = 100*(daily_report["trip_realized"]/daily_report["trip_offered"]) daily_report["Avg_Realized_Km"] = daily_report["seats_km"]/daily_report["pax_transported"] daily_report["Driver_Cancel_Rate"] = 100*daily_report["driver_cancelation"]/(daily_report["pax_transported"]+daily_report["pax_cancelation"]+daily_report["driver_rejection"]+ daily_report["driver_cancelation"]) daily_report["Pax_Cancel_Rate"] = 100*daily_report["pax_cancelation"]/(daily_report["pax_transported"]+daily_report["pax_cancelation"]+daily_report["driver_rejection"]+ daily_report["driver_cancelation"]) pd.set_option('precision', 2) daily_report = daily_report.fillna(0) from datetime import date, timedelta yesterday = date.today() - timedelta(1) df_sorting = daily_report.loc[daily_report.report_date == yesterday, ["country", "pax_transported"]] mapped_values = df_sorting.set_index('country')['pax_transported'].to_dict() daily_report['sort'] = daily_report['country'].map(lambda x: mapped_values[x]) daily_report.sort(['sort', 'country', 'report_date'], ascending=[0,1,0], inplace=True) return daily_report
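# Compatibility note (hedged) for the transform() functions in this collection:
# pd.set_option('precision', ...), DataFrame.append() and DataFrame.sort() all
# date from older pandas. On current releases 'precision' alone matches more
# than one option, .append() was removed in 2.0 and .sort() long before that.
# A rough sketch of the modern equivalents on fabricated data:
import pandas as pd

pd.set_option('display.precision', 2)  # the fully qualified option name
total = pd.DataFrame({'country': ['Total'], 'pax_transported': [10]})
countries = pd.DataFrame({'country': ['Argentina + Uruguay'], 'pax_transported': [4]})
report = pd.concat([total, countries], ignore_index=True)   # replaces DataFrame.append()
report = report.sort_values(['pax_transported', 'country'],
                            ascending=[False, True])        # replaces DataFrame.sort()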
def transform(db_table): db_table = db_table.convert_objects(convert_numeric=True) db_table['country'] = db_table['country'].astype(str) db_table['report_date'] = db_table['reference_date'].astype(datetime) db_table.drop('reference_date', axis=1, inplace=True) db_table = db_table.fillna(0) total = db_table.groupby(["report_date"],as_index=False, sort=False).sum() total["id"] = "Total" total["country"] = "Total" daily_report = total.append(db_table,True) daily_report["avg_pax/trip"] = daily_report["pax_transported"]/daily_report["trip_realized"] daily_report["Realize_Trip_Yield"] = 100*(daily_report["trip_realized"]/daily_report["trip_offered"]) daily_report["Avg_Realized_Km"] = daily_report["seats_distance"]/daily_report["pax_transported"] daily_report["Driver_Cancel_Rate"] = 100*daily_report["trip_cancelation"]/(daily_report["pax_transported"]) daily_report["Pax_Cancel_Rate"] = 100*daily_report["booking_cancelation"]/(daily_report["pax_transported"]) pd.set_option('precision', 2) daily_report = daily_report.fillna(0) daily_report.drop('id', axis=1, inplace=True) from datetime import date, timedelta yesterday = date.today() - timedelta(1) df_sorting = daily_report.loc[daily_report.report_date == yesterday, ["country", "pax_transported"]] mapped_values = df_sorting.set_index('country')['pax_transported'].to_dict() daily_report['sort'] = daily_report['country'].map(lambda x: mapped_values[x]) daily_report.sort(['sort', 'country', 'report_date'], ascending=[0,1,0], inplace=True) return daily_report
def main(argv): pd.set_option('display.width', 500) pd.set_option('display.height', 500) pd.options.mode.chained_assignment = None # default='warn' gc.enable() ######################################################################################################################## #Read the input file , munging and splitting the data to train and test ######################################################################################################################## dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d') #train = pd.read_csv('C:/Python/Others/data/Kaggle/Walmart_Recruiting/train.csv',sep=',') #actual = pd.read_csv('C:/Python/Others/data/Kaggle/Walmart_Recruiting/test.csv',sep=',') #Sample_DS = pd.read_csv('C:/Python/Others/data/Kaggle/Walmart_Recruiting/sampleSubmission.csv',sep=',') Keys = pd.read_csv('C:/Python/Others/data/Kaggle/Walmart_Recruiting/key.csv',sep=',') Weather = pd.read_csv('C:/Python/Others/data/Kaggle/Walmart_Recruiting/weather 3.csv',sep=',') train = pd.read_csv('C:/Python/Others/data/Kaggle/Walmart_Recruiting/X_train.csv',sep=',') CV = pd.read_csv('C:/Python/Others/data/Kaggle/Walmart_Recruiting/X_CV.csv',sep=',') Y_train = train.units.values Y_CV = CV.units.values X_train = train.drop(['units'], axis=1) X_CV = CV.drop(['units'], axis=1) print(np.shape(X_train)) print(np.shape(X_CV)) p_cv_RFC = RanFst_Regressor(X_train, X_CV, Y_train,Y_CV)
def main(argv): pd.set_option('display.width', 200) pd.set_option('display.height', 500) warnings.filterwarnings("ignore") global file_path, Train_DS1, Featimp_DS #random.seed(1) if(platform.system() == "Windows"): file_path = 'C:/Python/Others/data/Kaggle/Walmart_Recruiting_TTC/' else: file_path = '/home/roshan/Desktop/DS/Others/data/Kaggle/Walmart_Recruiting_TTC/' ######################################################################################################################## #Read the input file , munging and splitting the data to train and test ######################################################################################################################## #Train_DS = pd.read_csv(file_path+'train.csv',sep=',') #Actual_DS = pd.read_csv(file_path+'test.csv',sep=',') Train_DS = pd.read_csv(file_path+'train_50000.csv',sep=',',index_col=0,nrows = 8000) Actual_DS = pd.read_csv(file_path+'test_50000.csv',sep=',',index_col=0,nrows = 8000) Sample_DS = pd.read_csv(file_path+'sample_submission.csv',sep=',') #For testing only # Train_DS = pd.read_csv(file_path+'train_100000.csv',sep=',', index_col=0,nrows = 1000 ).reset_index(drop=True) # Actual_DS = pd.read_csv(file_path+'test_100000.csv',sep=',', index_col=0,nrows = 1000).reset_index(drop=True) Data_Munging(Train_DS,Actual_DS)
def show_shared_capsules():
    guids = 'A|B'.split("|")
    pandas.set_option('display.width', 5000)
    pandas.set_option('max_colwidth', 5000)
    for guid in guids:
        resp = show_capsules(guid)['status']
        header_list = ['vmid', 'role', 'vm_tou', 'type', 'full_access', 'users_full_access', 'roles']
        df = pandas.DataFrame(columns=header_list)
        for res in resp:
            roles = res['roles']
            roles_list = ''
            for role in roles:
                roles_list += ' [' + role['guid'] + ' ' + ' | ' + role['role'] + ' | tou:' + str(role['tou']) \
                    + ' | fa:' + str(role['full_access']) + ' ] '
            df2 = pandas.DataFrame([[res['vmid'], res['role'], str(res['vm_tou']), res['type'], str(res['full_access']),
                                     str(res['user_full_access']), roles_list]], columns=header_list)
            df = df.append(df2, ignore_index=True)
        if not df.empty:
            # The middle of this print statement was redacted in the source; printing the loop's guid is an assumption.
            print('------------------user: ' + guid + ' -------------------------')
            print(df)
            print('-------------------------------------------------')
def print_full(df): ''' print all rows of pd.DataFrame ''' pd.set_option('display.max_rows', len(df)) print(df) pd.reset_option('display.max_rows')
def main(directory, filename_data_in, max_rows, max_frames, font): logging.basicConfig(level=logging.INFO) pd.set_option('display.max_rows', max_rows) if max_frames == -1: max_frames = None directory = os.path.expanduser(directory) overlay = MyVideoOverlay(directory, filename_data_in, font) overlay.fields = ['frame', 'pos'] #overlay.fields = ['t0', 'pos'] #overlay.data_formatter = DataFormatter() overlay.data_formatter.key_format_default = '%13s' #overlay.data_formatter.d_key_format = { # 'frame': '%s', #} #overlay.data_formatter.value_format_default = '%s' overlay.data_formatter.d_value_format = { 'frame': '%06d', 'pos': '%05.1f', # 't0': '%07.3f' } overlay.create_images(framenumber_max=max_frames)
def transform(table): df = table.fillna(0) df.loc[df.country == "Argentina", "country"] = "Argentina + Uruguay" df.loc[df.country == "Uruguay", "country"] = "Argentina + Uruguay" df.loc[df.country == "Malaysia", "country"] = "Malaysia + Singapore" df.loc[df.country == "Singapore", "country"] = "Malaysia + Singapore" combined_countries = df.groupby(["country","report_reference","reference_number","report_year"],as_index=False, sort=False).sum() combined_total = df.groupby(["report_reference","reference_number","report_year"],as_index=False, sort=False).sum() combined_total["country"] = " Total" report = combined_total.append(combined_countries,True) report["avg_pax/trip"] = report["pax_transported"]/report["trip_realized"] report["Realize_Trip_Yield"] = 100*(report["trip_realized"]/report["trip_offered"]) report["Avg_Realized_Km"] = report["seats_km"]/report["pax_transported"] report["Driver_Cancel_Rate"] = 100*report["driver_cancelation"]/(report["pax_transported"]+report["pax_cancelation"]+report["driver_rejection"]+ report["driver_cancelation"]) report["Pax_Cancel_Rate"] = 100*report["pax_cancelation"]/(report["pax_transported"]+report["pax_cancelation"]+report["driver_rejection"]+ report["driver_cancelation"]) pd.set_option('precision', 2) report = report.fillna(0) report.sort(['country','report_year','reference_number'], ascending=[1,0,0], inplace=True) columns = ['country','report_year','report_reference','new_users','new_drivers','new_passangers','new_trip_offered','new_bookings','ask',\ 'trip_realized', 'unique_trip_realized','pax_transported','unique_pax_driver','avg_pax/trip','Realize_Trip_Yield','Avg_Realized_Km','Driver_Cancel_Rate','Pax_Cancel_Rate'] report = report[columns] return report
def concprinter(df, kind='string', n=100, window=60, columns='all', **kwargs):
    """
    Print conc lines nicely, to string, latex or csv

    :param df: concordance lines from :class:``corpkit.corpus.Concordance``
    :type df: pd.DataFrame
    :param kind: output format
    :type kind: str ('string'/'latex'/'csv')
    :param n: Print first n lines only
    :type n: int/'all'
    :returns: None
    """
    import corpkit
    if isinstance(n, int) and n > len(df):
        n = len(df)
    if not (kind.startswith('l') or kind.startswith('c') or kind.startswith('s')):
        raise ValueError('kind argument must start with "l" (latex), "c" (csv) or "s" (string).')
    import pandas as pd

    # not ideal to hardcode this
    pd.set_option('display.max_colwidth', 100)

    if type(n) == int:
        to_show = df.head(n)
    elif n is False:
        to_show = df
    elif n == 'all':
        to_show = df
    else:
        raise ValueError('n argument "%s" not recognised.' % str(n))

    def resize_by_window_size(df, window):
        df['l'] = df['l'].str.slice(start=-window, stop=None)
        df['l'] = df['l'].str.rjust(window)
        df['r'] = df['r'].str.slice(start=0, stop=window)
        df['r'] = df['r'].str.ljust(window)
        df['m'] = df['m'].str.ljust(df['m'].str.len().max())
        return df

    if window:
        to_show = resize_by_window_size(to_show, window)

    if columns != 'all':
        to_show = to_show[columns]

    if kind.startswith('s'):
        functi = pd.DataFrame.to_string
    if kind.startswith('l'):
        functi = pd.DataFrame.to_latex
    if kind.startswith('c'):
        functi = pd.DataFrame.to_csv

    return_it = kwargs.pop('return_it', False)

    if return_it:
        return functi(to_show, header=False, **kwargs)
    else:
        print('\n')
        print(functi(to_show, header=False, **kwargs))
        print('\n')
def fun1(): import read_new_nepr_file import os import pandas pandas.set_option('display.width', 200) def get_files_in_folder(path): listing = os.listdir(path) return [os.path.join(path, l) for l in listing if os.path.isfile(os.path.join(path, l))] def get_folders_in_folder(path): listing = os.listdir(path) return [os.path.join(path, l) for l in listing if not os.path.isfile(os.path.join(path, l))] path = 'C:\\Users\\tech5\\Google Drive\\NEPR Actual' file_list = [] for folder in get_folders_in_folder(path): for file in get_files_in_folder(folder): if '(' in file and '.xlsx' in file: file_list.append(file) df = read_new_nepr_file.read_nepr_file(file_list[0]) for f in file_list[1:]: print(f) df = df.append(read_new_nepr_file.read_nepr_file(f), ignore_index=True) return df
def dataset_bucket_analysis_by_field(field): # Set the dataset hotel_dataset_properties = {Constants.BUSINESS_TYPE_FIELD: 'fourcity_hotel'} Constants.update_properties(hotel_dataset_properties) records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE) print('Loaded %d records' % len(records)) user_frequency_map = {} for record in records: user_id = record[field] if user_id not in user_frequency_map: user_frequency_map[user_id] = 0 user_frequency_map[user_id] += 1 print('There is a total of %d %ss' % (len(user_frequency_map), field)) sorted_x = sorted(user_frequency_map.items(), key=operator.itemgetter(1), reverse=True) print(sorted_x[0]) print(sorted_x[1]) print(sorted_x[2]) # print(user_frequency_map) # Number of reviews per user rda = ReviewsDatasetAnalyzer(records) users_summary = rda.summarize_reviews_by_field(field) print('Average number of reviews per %s: %f' % (field, float(rda.num_reviews) / rda.num_users)) users_summary.plot(kind='line', rot=0) pandas.set_option('display.max_rows', len(users_summary)) print(users_summary) pandas.reset_option('display.max_rows')
def process_uploaded_table(table_file, table_name, table_title, table_legend, associated_text): """takes an uploaded data table and associated metadata like legend and title and creates an html data table""" # convert file to pandas data frame pd.set_option('display.max_colwidth', 100) df = pd.read_csv(table_file, prefix = '', encoding = 'utf-8', index_col=False) num_cols = len(df.columns) table_html = df.to_html(index = False,na_rep = '', sparsify = False) # now use beautiful soup to append the table metadata table_soup = BeautifulSoup(table_html) table_tag = table_soup.table title_tag = table_soup.new_tag("caption") title_tag.string = "%s: %s" % (table_name, table_title) table_tag.insert(0, title_tag) table_body_tag = table_soup.find("tbody") legend_str = "%s: %s" % (table_legend, associated_text) footer_tag = BeautifulSoup('<tfoot><tr><td colspan="%d">%s</td></tr></tfoot>' % (num_cols, legend_str)) table_body_tag.insert_after(footer_tag) # iterate through all th tags and check if they contain a string like "Unnamed: 0" and remove thTags = table_soup.findAll('th') for tag in thTags: if 'Unnamed' in tag.string: tag.string = '' return str(table_soup)
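# Usage sketch for process_uploaded_table above; the file name and metadata are fabricated:
html_table = process_uploaded_table(
    table_file='measurements.csv',
    table_name='Table 1',
    table_title='Summary of measurements',
    table_legend='Legend',
    associated_text='Values are means over three runs.',
)
print(html_table)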
def main_report(self, day): ''' dateline=%s" % day ''' pandas.set_option('display.width', 200) d2 = self.mysql.getRecord("select s_code from s_stock_list where dateline=%s" % day) #print d2 #sys.exit() for i in range(0, len(d2)): s_code = d2[i][0] #if s_code != 'sh600000': # continue self._chQ = self.getChuQuan(s_code) #print self._chQ sql_data = "select s_code,code,dateline,chg_m,chg,open,close,high,low,last_close,name FROM s_stock_trade WHERE s_code ='%s' and dateline >20140101 " % s_code print sql_data tmpdf2 = pandas.read_sql(sql_data, self.mysql.db) tmpdf = tmpdf2.apply(self.format_chuquan_hanlder, axis=1) tmpdf.sort_values(by=('dateline'), ascending=False) ma_list = [5, 10, 20, 30, 60] for ma in ma_list: tmpdf['MA_' + str(ma)] = pandas.rolling_mean(tmpdf['close'], ma) last5 = tmpdf.tail(60) #print last5 #sys.exit() for i5 in range(0, len(last5)): if str(last5.iloc[i5].dateline) != day: continue word = s_code[2:] + str(last5.iloc[i5].dateline) if math.isnan(last5.iloc[i5].MA_5): break if math.isnan(last5.iloc[i5].MA_10): break _m60 = last5.iloc[i5].MA_60 if math.isnan(last5.iloc[i5].MA_60): _m60 = 0 else: _m60 = round(_m60, 2) _m30 = last5.iloc[i5].MA_30 if math.isnan(last5.iloc[i5].MA_30): _m30 = 0 else: _m30 = round(_m30, 2) item = {} item['s_code'] = s_code item['dateline'] = last5.iloc[i5].dateline item['hash'] = hashlib.md5(word).hexdigest() item['ma5'] = round(last5.iloc[i5].MA_5, 2) item['ma10'] = round(last5.iloc[i5].MA_10, 2) item['ma20'] = round(last5.iloc[i5].MA_20, 2) item['ma30'] = _m30 item['ma60'] = _m60 self.mysql.dbInsert('s_stock_average', item)
def wrapper(args):
    try:
        npar = args.noiseparam.strip("[").strip("]").split(",")
    except:
        npar = []
    nbins = args.nbins
    # Run function
    if args.i:
        df = pd.io.parsers.read_csv(args.i, delim_whitespace=True, dtype={"seqs": str, "batch": int})
    else:
        df = pd.io.parsers.read_csv(sys.stdin, delim_whitespace=True, dtype={"seqs": str, "batch": int})
    if len(utils.get_column_headers(df)) > 0:
        raise SortSeqError("Library already sorted!")
    model_df = io.load_model(args.model)
    output_df = main(df, model_df, args.noisemodel, npar, nbins, start=args.start, end=args.end)
    if args.out:
        outloc = open(args.out, "w")
    else:
        outloc = sys.stdout
    pd.set_option("max_colwidth", int(1e8))
    # Validate dataframe for writing
    output_df = qc.validate_dataset(output_df, fix=True)
    io.write(output_df, outloc)
def main(argv): pd.set_option('display.width', 200) pd.set_option('display.height', 500) pd.options.mode.chained_assignment = None # default='warn' ######################################################################################################################## #Read the input file , munging and splitting the data to train and test ######################################################################################################################## train = pd.read_csv('C:/Python/Others/data/Kaggle/Restaurant_Revenue_Prediction/train.csv',sep=',') test = pd.read_csv('C:/Python/Others/data/Kaggle/Restaurant_Revenue_Prediction/test.csv',sep=',') Sample_DS = pd.read_csv('C:/Python/Others/data/Kaggle/Restaurant_Revenue_Prediction/sampleSubmission.csv',sep=',') X,Xt,y = Data_Munging(train,test,Sample_DS) X,Xt,y = Feature_Selection(X,Xt,y) #scores = Kfold_Cross_Valid(X,Xt,y) clf = GridSrch_Modelfit(X,Xt,y,grid=False) #Predict test.csv & reverse the log transform yp=np.exp(clf.predict(Xt)) ######################################################################################################################## #Get the predictions for actual data set ######################################################################################################################## #Get the predictions for actual data set preds = pd.DataFrame(yp, index=Sample_DS.Id.values, columns=Sample_DS.columns[1:]) preds.to_csv('C:/Python/Others/data/Kaggle/Restaurant_Revenue_Prediction/Submission_Roshan.csv', index_label='Id')
import os import glob import json import click import numpy as np import pandas as pd from loguru import logger import mwrvr.constants import mwrvr.misc import mwrvr.textract desired_width = 320 pd.set_option('display.width', desired_width) pd.set_option('display.max_columns', 10) pd.set_option('display.max_rows', 100) INT_COLUMNS = ["score", "kills", "deaths", "plants", "defuses", "top_fragger"] COLUMNS = ["name", "map", "score", "kills", "deaths", "plants", "defuses", "number_of_maps", "top_fragger", "zero_bomb"] PER_MAP_COLUMNS = ["kills_per_map", "deaths_per_map", "plants_per_map", "defuses_per_map", "top_fragger_per_map", "zero_bomb_per_map", "score_per_map"] def find_misspellings(s: str, include_bradlx888_as_ntsfbrad=True): s = mwrvr.misc.find_jaantr(s) s = mwrvr.misc.find_ntsfbrad(s, include_bradlx888=include_bradlx888_as_ntsfbrad)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import pandas as pd                    # needed for pd.read_csv / pd.set_option below
from IPython.display import display    # needed for display() below

data = pd.read_csv("nursery.csv", sep=';')
pd.set_option('display.expand_frame_repr', False)

print("\n **********\n First 5 rows of the dataset: \n *********\n")
display(data.head())
print("\n **********\n Descriptive statistics: \n *********\n")
display(data.describe())
print("\n **********\n Total number of records and attributes: \n *********\n")
print(data.shape)

# PREPROCESSING
# data = data.convert_objects(convert_numeric=True)
# There are no missing values. However, there are many categorical
# attributes that must be converted to numeric values via discretization.

def label_encode(df, columns):
from sklearn.model_selection import cross_val_score from sklearn.ensemble import RandomForestClassifier print(cross_val_score(rf_classifier, features, label, cv=20, scoring ='accuracy').mean()) """# Observation 2: Accuracy acquired : 84.2 % for n_splits=20 \\ ## Important Feature Calculation """ rf_classifier.fit(X_train,y_train) df.head(3) pd.set_option("display.max.rows", None) # Important Features feature_importance = pd.DataFrame(rf_classifier.feature_importances_, index = X_train.columns, columns=['importance']).sort_values('importance',ascending=False) feature_importance # summarize feature importance #for i,v in enumerate(importance): # print('Feature: %0d, Score: %.5f' % (i,v)) """Exporting the results into an excel file for better view""" excel_data = feature_importance.copy() excel_data.to_csv('important_features.csv', index=True)
for c in self.categoric_columns: if c != 'Name' and c != 'RescuerID' and c != 'Description' and c != 'PetID': self.boxplot_for_category_column(self.df, c) def pairplot_for_numeric(self): for c in self.numeric_columns: if c != COL_Y: self.pairplot_for_numeric_column(self.df, c) def boxplot_for_numeric(self): for c in self.numeric_columns: if c != COL_Y: self.boxplot_for_numeric_column(self.df, c) pd.set_option('display.max_rows', 5000) pd.set_option('display.max_columns', 5000) pd.set_option('display.width', 100000) train = Pets(TRAIN_DATA, True, calculation_limit_rows=None) # train.bag_of_words_clean() # train.bag_of_words_prepare() # train.catplot_for_categories() # train.pairplot_for_numeric() # train.boxplot_for_numeric() # train.boxplot_for_categories() # train.train_model() train.roc_curve() # predict = Pets(TEST_DATA, False, calculation_limit_rows=None) # predict.bag_of_words_clean()
def main(argv): args = parse_args(argv) # set up display and plotting options pd.set_option('display.max_columns', 100) pd.set_option('display.max_colwidth', 100) pd.set_option('display.width', get_terminal_size()[1]) sns.set_style('whitegrid') sns.set_context('poster') #sns.set_palette('Set1') if args.out_prefix is None: args.out_prefix = os.path.splitext(args.expt_file)[0] seeds = args.seeds.split(',') folds = args.folds.split(',') # get all training output data from experiment job_files = args.job_script df = read_training_output_files(job_files, args.data_name, seeds, folds, args.iteration, True, args.gen_metrics) if args.test_data is not None: df = df[df['test_data'] == args.test_data] group_cols = ['job_file', 'model_name'] if not args.avg_seeds: group_cols.append('seed') if not args.avg_folds: group_cols.append('fold') if args.avg_iters > 1: df['iteration'] = args.avg_iters * (df['iteration'] // args.avg_iters) group_cols.append('iteration') exclude_cols = [ 'job_file', 'model_name', 'gen_model_name', 'disc_model_name', 'iteration', 'seed', 'fold', 'test_data' ] agg_df = aggregate_data(df, group_cols) #assert all(agg_df['seed'] == set(seeds)) #assert all(agg_df['fold'] == set(folds)) if not args.y: # use all training output metrics args.y = [m for m in agg_df if m not in exclude_cols] if args.scaffold: args.y += [ p + x for p in ['gen_', 'disc_'] for x in ['n_params', 'n_activs', 'size', 'min_width'] ] args.y = sorted(args.y, key=get_y_key, reverse=True) # parse model name to get model params and add columns job_params = add_param_columns(agg_df, scaffold=args.scaffold) print('\nAGGREGATED DATA') print(agg_df) # rename columns if necessary agg_df.reset_index(inplace=True) col_name_map = {col: col for col in agg_df} col_name_map.update(dict(r.split(':') for r in args.rename_col)) agg_df.rename(columns=col_name_map, inplace=True) job_params = {col_name_map[c]: v for c, v in job_params.items()} for y in args.log_y: # add log y columns log_y = 'log({})'.format(y) agg_df[log_y] = agg_df[y].apply(np.log) args.y.append(log_y) if len(args.hue) > 1: # add column for hue tuple hue = add_group_column(agg_df, args.hue) elif len(args.hue) == 1: hue = args.hue[0] else: hue = None # by default, don't make plots for the hue variable or variables with 1 unique value if not args.x: args.x = [ c for c in job_params if c not in exclude_cols and agg_df[c].nunique() > 1 ] args.x = sorted(args.x, key=get_x_key, reverse=True) if args.grouped: # add "all but one" group columns for col in args.x: all_but_col = [c for c in args.x if c not in {col, 'memory'}] add_group_column(agg_df, all_but_col) agg_df.to_csv('{}_agg_data.csv'.format(args.out_prefix)) for y in args.y: z_bounds = get_z_bounds(agg_df[y], args.outlier_z) iqr_bounds = get_iqr_bounds(agg_df[y], args.outlier_iqr) print(y, z_bounds, iqr_bounds) agg_df[y] = remove_outliers(agg_df[y], z_bounds) agg_df[y] = remove_outliers(agg_df[y], iqr_bounds) if args.plot_lines: # plot training progress line_plot_file = '{}_lines.{}'.format(args.out_prefix, args.plot_ext) plot_lines(line_plot_file, agg_df, x=col_name_map['iteration'], y=args.y, hue=None, n_cols=args.n_cols, outlier_z=args.outlier_z, ylim=args.ylim) for hue in args.x + ['model_name']: line_plot_file = '{}_lines_{}.{}'.format(args.out_prefix, hue, args.plot_ext) plot_lines(line_plot_file, agg_df, x=col_name_map['iteration'], y=args.y, hue=hue, n_cols=args.n_cols, outlier_z=args.outlier_z, ylim=args.ylim) if args.iteration: final_df = agg_df.set_index( col_name_map['iteration']).loc[args.iteration] 
print('\nFINAL DATA') print(final_df) # display names of best models print('\nBEST MODELS') for y in args.y: print( final_df.sort_values(y).loc[:, (col_name_map['model_name'], y)]) #.head(5)) if args.plot_strips: # plot final loss distributions strip_plot_file = '{}_strips.{}'.format(args.out_prefix, args.plot_ext) plot_strips(strip_plot_file, final_df, x=args.x, y=args.y, hue=None, n_cols=args.n_cols, outlier_z=args.outlier_z, ylim=args.ylim) if args.grouped: strip_plot_file = '{}_grouped_strips.{}'.format( args.out_prefix, args.plot_ext) plot_strips(strip_plot_file, final_df, x=args.x, y=args.y, hue=None, grouped=True, n_cols=args.n_cols, outlier_z=args.outlier_z, ylim=args.ylim) for hue in args.x + ['model_name']: strip_plot_file = '{}_strips_{}.{}'.format( args.out_prefix, hue, args.plot_ext) plot_strips(strip_plot_file, final_df, x=args.x, y=args.y, hue=hue, n_cols=args.n_cols, outlier_z=args.outlier_z, ylim=args.ylim) if args.plot_corr: corr_y = [y for y in args.y if final_df[y].nunique() > 1] corr_plot_file = '{}_corr.{}'.format(args.out_prefix, args.plot_ext) plot_corr(corr_plot_file, final_df, x=corr_y, y=corr_y) for hue in args.x + ['model_name']: corr_plot_file = '{}_corr_{}.{}'.format( args.out_prefix, hue, args.plot_ext) plot_corr(corr_plot_file, final_df, x=corr_y, y=corr_y, hue=hue)
# ## 1 - [Data cleaning](#ch1)

# <a id="ch0"></a>
# ## 0 - Import libraries and files

# In[1]:

import pandas as pd
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt

# In[2]:

data = pd.read_csv("energie_mensuel.csv", sep=';')
pd.set_option('display.max_columns', None)
data = data.replace('È', 'é', regex=True)  # The "é" characters are written as "È" in the source file
data.head(3)

# <a id="ch1"></a>
# ## 1 - Data cleaning

# In[3]:

# Keep "France" rows
data = data.loc[data.Territoire == "France"]
data = data[["Mois", "Consommation totale"]]
data.columns = ["date", "consumption"]
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Date   : 2018-10-13 14:33:49
# @Author : Michael ([email protected])
import pandas as pd

pd.set_option('expand_frame_repr', False)
# pd.set_option('display.max_rows', 1000)


def transfer_to_period_data(df, rule_type='15T'):
    """
    Convert the candle data to another bar period.
    :param df: DataFrame of candles with a 'candle_begin_time' column and OHLCV columns
    :param rule_type: resampling rule, e.g. '15T' for 15-minute bars
    :return: the resampled DataFrame
    """
    # === resample to the target bar period
    period_df = df.resample(rule=rule_type, on='candle_begin_time', label='left', closed='left').agg({
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum'
    })
    period_df.dropna(subset=['open'], inplace=True)
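# Usage sketch for transfer_to_period_data above (the snippet is cut off here;
# it presumably ends by returning period_df). The one-minute candles are fabricated:
if __name__ == '__main__':
    candles = pd.DataFrame({
        'candle_begin_time': pd.date_range('2018-10-13 09:00', periods=30, freq='1T'),
        'open': 1.0, 'high': 2.0, 'low': 0.5, 'close': 1.5, 'volume': 10.0,
    })
    period_df = transfer_to_period_data(candles, rule_type='15T')  # 30 one-minute bars -> two 15-minute bars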
#Convert the format so that they can be pushed to the SQL people_tuple = [tuple(i) for i in people.to_numpy()] friend_tuple = [ tuple(map(int, i[0].split(" "))) for i in friend.values.tolist() ] # Connecting to the database file conn = sqlite3.connect('p2.db') c = conn.cursor() #Creating table for people/pushing the data into the table c.execute("DROP TABLE IF EXISTS people") c.execute('''CREATE TABLE people (personId INTEGER,name text)''') c.executemany('INSERT INTO people VALUES (?,?)', people_tuple) #Creating table for friends/pushing the data into the table c.execute("DROP TABLE IF EXISTS friends") c.execute('''CREATE TABLE friends (personId1 INTEGER,personId2 INTEGER)''') c.executemany('INSERT INTO friends VALUES (?,?)', friend_tuple) #SQL Query that querys NumOfFriends pd.set_option('display.max_rows', None) print( pd.read_sql_query( "Select Name as Name, Count(PersonId1) as NumOfFriends From (Select personId1 From friends UNION ALL Select PersonId2 From friends) As P JOIN people on P.personId1=people.PersonId Group by Name Order by NumOfFriends DESC", conn)) #Committing changes and closing the connection to the database file conn.commit() conn.close()
# -*- coding: utf-8 -*-
"""
@Time   : 2017/6/3 - 15:02
@Author : Hao Chen
"""
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

if __name__ == "__main__":
    pd.set_option('display.width', 300)

    data = pd.read_csv('../dataset/tel.csv', skipinitialspace=True, thousands=',')
    # thousands: str, default None -- the thousands separator, e.g. "," or "."
    print u'Raw data:\n', data.head(10)
    # print 'data.columns() = \n', data.columns

    # Label-encode every column by category, e.g. the values Married and Unmarried become 0 and 1
    le = LabelEncoder()  # encodes labels as 0..n-1, e.g. 5 classes get labels 0/1/2/3/4
    for col in data.columns:
        data[col] = le.fit_transform(data[col])  # returns the encoded labels
    print u'Data after processing 1:\n', data.head(10)

    # Age grouping
    # Bin the "age" column using the given half-open intervals, e.g. ages in [-1, 6) get label 0,
    # [6, 12) label 1, [12, 18) label 2; the labels can be chosen freely but must match the number of bins
# -*- coding:utf-8 -*-
# @FileName :floyd.py
# @Time     :2021/6/2 22:24
# @Author   :zyt
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)  # show all columns
pd.set_option('display.max_rows', None)     # show all rows


def read_data(filepath: str):
    """Read the input data."""
    station_df = pd.read_excel(filepath, sheet_name='station', usecols='A: C', index_col='code')
    arcs_df = pd.read_excel(filepath, sheet_name='arcs', usecols='A: J')
    arcs_df = arcs_df[arcs_df['type'] == 'real'].set_index(['point_up_code', 'point_down_code'], drop=False)
    arcs_df['price/arc'] = arcs_df['mileage'] * arcs_df['price']
    return station_df, arcs_df


def build_matrix(station_df, arcs_df):
    """Build the path-length (mileage) matrices."""
    # initialize the mileage matrix mileage_matrix and the path matrix path_matrix
    node_num = len(station_df)
    matrix_index = list(station_df.index)
    mileage_matrix = pd.DataFrame(np.full((node_num, node_num), np.inf), index=matrix_index, columns=matrix_index)
    path_matrix = pd.DataFrame(np.full((node_num, node_num), '-'), index=matrix_index, columns=matrix_index)
    # fill the mileage matrix mileage_matrix from the arc data
    for index, arc in arcs_df.iterrows():
        # the index is the arc's (origin, destination) pair
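# The snippet above stops before the shortest-path computation itself. A minimal
# sketch of the Floyd-Warshall update these matrices appear to be built for; the
# function name and the path bookkeeping are assumptions, not the original code:
def floyd_warshall(mileage_matrix, path_matrix):
    """All-pairs shortest paths over the mileage matrix (assumed continuation of floyd.py)."""
    nodes = list(mileage_matrix.index)
    for k in nodes:              # intermediate node allowed so far
        for i in nodes:
            for j in nodes:
                relaxed = mileage_matrix.at[i, k] + mileage_matrix.at[k, j]
                if relaxed < mileage_matrix.at[i, j]:
                    mileage_matrix.at[i, j] = relaxed
                    path_matrix.at[i, j] = k  # remember the intermediate node on the shorter route
    return mileage_matrix, path_matrix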
#!/usr/bin/env python3 ################################################################################################################# ## This script creates HDF5 files from corresponding SQLite db files in one-to-one fashion. So, for each db file ## present in the input folder, equivalent HDF5 file is created in the output folder. To combine db files into ## a single HDF5 file , use version 1 of script named db_hdf5_v1.py instead ################################################################################################################# import sqlite3, sys, glob, os, argparse, errno import pandas as pd from glob import glob as g pd.set_option('io.hdf.default_format','table') # Commenting this line out will write HDF5 as a fixed format, and not as a table format # Writing as a fixed format is faster than writing as a table, but the file cannot be 'modified/appended to' later on DB_SUFFIX = '.db' # Function to check for existing directories, and create a new one if not present def dir_check(d): if os.path.exists(d): reply = input("Specified output directory already exists!! Delete existing directory named <<"+os.path.basename(d)+">> and all its contents? [y/n] ") if reply in ['y', 'Y', 'yes']: try: os.system('rm -r '+ d) print("Directory named <<"+os.path.basename(d)+ ">> and all its contents deleted!!") # Make new output folder try: os.makedirs(d) except OSError as exception: if exception.errno != errno.EEXIST: raise except: error("- Could not delete directory <<" +os.path.basename(d)+">>. Directory may contain additional files, remove files manually and try again!")
def populate_indicators(self, dataframe: DataFrame, metadata: dict) -> DataFrame: """ Adds several different TA indicators to the given DataFrame Performance Note: For the best performance be frugal on the number of indicators you are using. Let uncomment only the indicator you are using in your strategies or your hyperopt configuration, otherwise you will waste your memory and CPU usage. :param dataframe: Dataframe with data from the exchange :param metadata: Additional information, like the currently traded pair :return: a Dataframe with all mandatory indicators for the strategies """ informative_time_frame = '1d' informative = None if not self.dp: # Don't do anything if DataProvider is not available. return dataframe if self.dp: if self.dp.runmode.value in ('live', 'dry_run'): now = datetime.utcnow() time = pd.Timestamp(year=now.year, month=now.month, day=now.day, tz="GMT+0") ticker = self.dp.ticker(metadata['pair']) new_row = { 'date': time, 'open': 1, 'high': 1, 'low': 1, 'close': ticker['last'], 'volume': 1 } # Get the informative pair informative = self.dp.get_pair_dataframe( pair=metadata['pair'], timeframe=informative_time_frame) informative = informative.append(new_row, ignore_index=True) # if not informative: # return dataframe # calculate the bollinger bands with 1d candles bollinger = qtpylib.bollinger_bands(informative['close'], window=3, stds=1) informative[f'bb_lowerband1'] = bollinger['lower'] informative[f'bb_middleband1'] = bollinger['mid'] informative[f'bb_upperband1'] = bollinger['upper'] # Rename columns to be unique # Assuming inf_tf = '1d' - then the columns will now be: # date_1d, open_1d, high_1d, low_1d, close_1d informative.columns = [ f"{col}_{informative_time_frame}" for col in informative.columns ] # sync up dates # informative[f'date_{informative_time_frame}'] = pd.to_datetime(informative[f'date_{informative_time_frame}'], utc=True) # dataframe['date'] = pd.to_datetime(dataframe['date'], utc=True) pd.set_option('display.max_columns', None) pd.set_option('display.width', 300) logger.info( f'---------Informative Pair: {metadata["pair"]}-------------------' ) path = os.path.normpath( os.path.abspath( os.path.join( os.getcwd(), 'user_data', f"dataframe_{metadata['pair'].replace('/', '')}.csv"))) logger.info(path) file = open(path, "w") file.write(dataframe.to_csv()) file.close() logger.info(f'\n\n{informative.to_markdown()}') # Combine the 2 dataframes # all indicators on the informative sample MUST be calculated before this point dataframe = dataframe.merge(informative, left_on='date', right_on=f'date_{informative_time_frame}', how='left') # FFill to have the 1d value available in every row throughout the day. # Without this, comparisons would only work once per day. # dataframe = dataframe.ffill() pd.set_option('display.max_columns', None) pd.set_option('display.width', 300) logger.info( f'---------Dataframe Pair: {metadata["pair"]}-------------------') logger.info(f'\n\n{dataframe.to_markdown()}') return dataframe
# mpl.rcParams['font.family'] = "serif" # mpl.rcParams['font.serif'] = "cm" mpl.rcParams[ "text.latex.preamble"] = r"\usepackage{subdepth}, \usepackage{type1cm}" # PANDAS # ====== try: import pandas as pd _pandas_present = True except ImportError: _pandas_present = False if _pandas_present and ipython: pd.set_option("display.latex.repr", False) pd.set_option("display.latex.longtable", False) pd.set_option("display.latex.escape", False) # SYMPY # ===== try: import sympy as sym _sympy_present = True except ImportError: _sympy_present = False if _sympy_present: sym.init_printing(use_latex=True) # IMAGE ARRANGEMENT with PIL
""" import json from collections import OrderedDict, defaultdict from django.http import JsonResponse from django.shortcuts import get_object_or_404 from otree.views.admin import SessionData, SessionDataAjax from otree import export from otree.common import get_models_module from otree.db.models import Model from otree.models.participant import Participant from otree.models.session import Session import pandas as pd pd.set_option('display.max_columns', 100) pd.set_option('display.width', 180) #%% helper functions def _rows_per_key_from_queryset(qs, key): """Make a dict with `row[key] -> [rows with same key]` mapping (rows is a list).""" res = defaultdict(list) for row in qs.values(): res[row[key]].append(row) return res
import altair as alt import pandas as pd from infra.constants import MAX_DATE, MIN_DATE import infra.parsers import infra.dask import infra.pd # Module specific format options pd.set_option('display.max_columns', None) pd.set_option('display.max_colwidth', None) pd.set_option('display.width', None) pd.set_option('display.max_rows', None) def compute_user_currency_histories(): """Compute the normalized ledger with running user currency balance. """ # Extract data from the transactions file into a resolved pandas frame # Importantly, use the timezone adjusted log but NOT the trimmed log to # avoid clipping state from early users. transactions = infra.dask.read_parquet( "data/internal/transactions_TZ" ).compute() # Split transfers into positive components for the dest and negative for # the source transfers = transactions.loc[ (transactions["kind"] == "user_transfer") | (transactions["kind"] == "admin_transfer")
# -*- coding: utf-8 -*-
# @Author  : AlwaysDazz
# @Time    : 2021/5/23 14:13
# @IDE:    : PyCharm
# @Project : pythonProject
# @Comment : random forest vs. single decision tree, see https://zhuanlan.zhihu.com/p/58945933

from sklearn.ensemble import RandomForestClassifier as rdclf  # sklearn's ensemble algorithms live in the ensemble module; import the random forest
from sklearn.tree import DecisionTreeClassifier as clf        # decision tree
from sklearn.datasets import load_wine                        # wine dataset
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt                          # plotting
from sklearn.model_selection import cross_val_score as cvs
import pandas as pd

pd.set_option('display.max_columns', 1000)   # maximum number of columns shown in the output
pd.set_option('display.width', 1000)         # maximum display width
pd.set_option('display.max_colwidth', 1000)  # maximum column width shown

winedata = load_wine().data      # wine features
winetarget = load_wine().target  # wine labels

# Xtrain, Xtest, Ytrain, Ytest = train_test_split(winedata, winetarget, test_size=0.3)  # 70/30 train/test split
# clf = clf(random_state=0)  # instantiate the decision tree and the random forest
# rdclf = rdclf(random_state=0)
# clf = clf.fit(Xtrain, Ytrain)  # train the decision tree and the random forest
# rdclf = rdclf.fit(Xtrain, Ytrain)
# clf_t = clf.score(Xtest, Ytest)  # evaluate the trained models
# rdclf_t = rdclf.score(Xtest, Ytest)
# print('Single Tree:{}'.format(clf_t), 'Random Forest Tree:{}'.format(rdclf_t))  # formatted output: Single Tree:0.9259259259259259 Random Forest Tree:0.9814814814814815
# Use cross-validation instead: instantiate the decision tree and the random forest and let it split the training and test sets automatically
import numpy as np import pandas as pd import matplotlib.pyplot as plt from pandas import set_option import os set_option('display.width', 2000) pd.set_option("display.max_rows", 500, "display.max_columns", 2000) set_option('precision', 3) #set_option("display.max_rows", 10) pd.options.mode.chained_assignment = None input_file = './raw data_edit/data_merge.csv' # The well name of an input file data_input_ori = pd.read_csv(input_file) j=0 for i in range(data_input_ori.shape[0]): sumbox_clc = data_input_ori.commun[i] + data_input_ori.homehobb[i] + data_input_ori.judgment[i] + \ data_input_ori.memory[i] + data_input_ori.orient[i] + data_input_ori.perscare[i] if sumbox_clc != data_input_ori.sumbox[i]: #print(sumbox_clc, data_input_ori.sumbox[i]) data_input_ori.sumbox[i] = sumbox_clc j = j+1 print(j) #data_input = data_input_ori[(data_input_ori.cdr < 1.0) | (data_input_ori.sumbox > 2.0)] data_input = data_input_ori M, N = data_input.shape print(M, N) keys = data_input.keys() print(keys)
import datetime
import json
import urllib.request

import pandas as pd

# my_app_key, bet_url, PlaceBet and CheckBet are defined elsewhere in this
# module (Betfair application key, API endpoint and bet helpers).


def HorseForm(SSOID, BestOrWorst, placeBets, SelIndex):
    Rating = float(0)
    Index = float(0)
    if BestOrWorst == "Best":
        FormRatingAvg = float(100)
    else:
        FormRatingAvg = float(0)
    FormRatingList = []
    FormRatingListSort = []
    FormRatingEndList = []
    FormList = []
    FormEndList = []
    horsename = []
    selectionID = []

    eventTypeID = '["7"]'        # ID for Horse Racing
    countryCode = '["GB","IE"]'  # Country codes. Betfair uses Alpha-2 codes under ISO 3166-1
    marketTypeCode = '["WIN"]'   # Market type
    # Event start and end times
    MarketStartTime = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
    MarketEndTime = (datetime.datetime.now() + datetime.timedelta(hours=24))
    MarketEndTime = MarketEndTime.strftime('%Y-%m-%dT%H:%M:%SZ')
    maxResults = str(1000)
    sortType = 'FIRST_TO_START'             # Sorts the output
    Metadata = 'RUNNER_METADATA'            # Provides runner metadata
    inplay = 'false'                        # Still to run
    priceProjection = '["EX_BEST_OFFERS"]'  # Best odds

    # Create an empty dataframe for the results
    d = {
        'Horse Name': [],
        'Horse Id': [],
        'Form': [],
        'Race': [],
        'Time': [],
        'Venue': [],
        'MarketId': [],
        'Odds': [],
        'Bet Placed': []
    }
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_colwidth', None)
    pd.set_option('expand_frame_repr', False)
    Results = pd.DataFrame(data=d)

    headers = {
        'X-Application': my_app_key,
        'X-Authentication': SSOID,
        'content-type': 'application/json'
    }
    user_req = '{"jsonrpc": "2.0", "method": "SportsAPING/v1.0/listMarketCatalogue",\
        "params": {"filter":{"eventTypeIds":' + eventTypeID + ',"marketTypeCodes":' + marketTypeCode + ',\
        "inPlayOnly":' + inplay + ', "marketCountries":' + countryCode + ',\
        "marketStartTime":{"from":"' + MarketStartTime + '", "to":"' + MarketEndTime + '"}},\
        "sort":"' + sortType + '", "maxResults":"' + maxResults + '", "marketProjection":["' + Metadata + '","MARKET_START_TIME","EVENT"]}, "id": 1}'
    #print (user_req)
    req = urllib.request.Request(bet_url, data=user_req.encode('utf-8'), headers=headers)
    response = urllib.request.urlopen(req)
    jsonResponse = response.read()
    pkg = jsonResponse.decode('utf-8')
    result = json.loads(pkg)
    marketCatelogue = result['result']

    # Penalty per form symbol (higher = worse recent form). The original
    # if/elif chain listed 'C' twice ("won before at this distance and track"
    # and "carried offcourse"); only the first branch was ever reachable,
    # so its weight is the one kept here.
    penalties = {
        'R': 5,    # refused to jump a hurdle
        'e': 10,   # first race
        '0': 10,   # finished outside the first nine
        'F': 5,    # fell
        'U': 3,    # unseated rider
        'x': 3,    # has not started in a race for 3 months or more
        'C': 0.5,  # has won before at this race distance and track
        'B': 3.5,  # started favourite at its last start but did not win
        'P': 4,    # pulled up by jockey
        'S': 4,    # slipped up
        'O': 10,   # ran offcourse
        'D': 7,    # disqualified
    }

    for x in range(len(marketCatelogue)):
        for w in range(len(marketCatelogue[x]['runners'])):
            runnerform = marketCatelogue[x]['runners'][w]['metadata']['FORM']
            if runnerform is None:
                runnerform = 'e'
            # Most recent result first
            runnerformList = list(runnerform[::-1])
            Index = float(0)
            Rating = float(0)
            factor = 4
            for Entry in runnerformList:
                if factor > 1:
                    factor = factor - 1
                if Entry in ('/', '-'):  # season separators carry no weight
                    continue
                if Entry in penalties:
                    Rating += penalties[Entry] * factor
                else:
                    # Numeric finishing positions score as themselves;
                    # anything unrecognised scores 5.
                    try:
                        Rating += float(Entry) * factor
                    except ValueError:
                        Rating += 5 * factor
                Index += factor
            rating = float(Rating) / float(Index)
            FormList.append(runnerform)
            FormRatingList.append(rating)
            FormRatingListSort.append(rating)

        # Order the runners by rating and collect their details in that order
        FormRatingListSort.sort()
        for zz in range(len(FormRatingListSort)):
            for t in range(len(FormRatingList)):
                if FormRatingList[t] == FormRatingListSort[zz]:
                    horsename.append(marketCatelogue[x]['runners'][t]['runnerName'])
                    selectionID.append(marketCatelogue[x]['runners'][t]['selectionId'])
                    FormRatingEndList.append(str(FormRatingList[t]))
                    FormEndList.append(FormList[t])

        try:
            price_req = '{"jsonrpc": "2.0", "method": "SportsAPING/v1.0/listRunnerBook", "params": {"locale":"en", \
                "marketId":"' + str(marketCatelogue[x]['marketId']) + '",\
                "selectionId":"' + str(selectionID[SelIndex]) + '",\
                "priceProjection":{"priceData":' + priceProjection + '},"orderProjection":"ALL"},"id":1}'
            #print (price_req)
            req = urllib.request.Request(bet_url, data=price_req.encode('utf-8'), headers=headers)
            price_response = urllib.request.urlopen(req)
            price_jsonResponse = price_response.read()
            price_pkg = price_jsonResponse.decode('utf-8')
            price_result = json.loads(price_pkg)
            #print (price_result)
            #print (horsename)

            start_time = marketCatelogue[x]['marketStartTime']
            my_datetime = datetime.datetime.strptime(start_time, '%Y-%m-%dT%H:%M:%S.000Z')
            StartTime = my_datetime.strftime('%H:%M')
            venue = marketCatelogue[x]['event']['venue']
            price = float(price_result['result'][0]['runners'][0]['ex']['availableToLay'][0]['price'])
            marketId = str(marketCatelogue[x]['marketId'])
            horseId = str(selectionID[SelIndex])

            if price < 10.0 and placeBets == "y":
                PlaceBet(SSOID, marketId, horseId, str(price), "2")
            betPlaced = CheckBet(SSOID, marketId)

            Results = Results.append(
                {
                    'Horse Name': str(horsename[SelIndex]),
                    'Horse Id': str(selectionID[SelIndex]),
                    'Form': str(FormEndList[SelIndex]),
                    'Race': str(marketCatelogue[x]['marketName']),
                    'Time': str(StartTime),
                    'Venue': str(venue),
                    'MarketId': str(marketCatelogue[x]['marketId']),
                    'Odds': str(price_result['result'][0]['runners'][0]['ex']['availableToLay'][0]['price']),
                    'Bet Placed': betPlaced
                },
                ignore_index=True)
        except Exception:
            pass  #print ("Got an error")

        # Reset the per-market accumulators before the next market
        Rating = float(0)
        Index = float(0)
        if BestOrWorst == "Best":
            FormRatingAvg = float(100)
        else:
            FormRatingAvg = float(0)
        FormRatingList.clear()
        FormRatingListSort.clear()
        FormList.clear()
        FormRatingEndList.clear()
        FormEndList.clear()
        horsename.clear()
        selectionID.clear()

    return Results
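A minimal usage sketch, assuming a valid Betfair session token and that my_app_key, bet_url, PlaceBet and CheckBet are defined elsewhere in the module; the token value here is a placeholder:

# Rate today's GB/IE win markets without placing any bets, reporting the
# runner at position SelIndex in each market's form-sorted list.
results = HorseForm(SSOID="your-session-token", BestOrWorst="Best",
                    placeBets="n", SelIndex=0)
print(results[['Horse Name', 'Race', 'Time', 'Venue', 'Odds']])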
import nltk
import pandas as pd
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm, tree, decomposition, ensemble, preprocessing
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.random_projection import sparse_random_matrix
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, mutual_info_classif, RFE

from cross_validation import KFoldCrossValidation
from naive_bayes import BernoulliNaiveBayes

# nltk.download('wordnet')

pd.set_option('display.max_columns', None)
pd.set_option("display.precision", 3)

data_train = pd.read_csv('./data/reddit_train.csv')
train_com = data_train.comments
label = data_train.subreddits

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

# ##############################################################################################
# # Noise Reduction
# ##############################################################################################
import json
import pickle as pk
import time

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, average_precision_score, classification_report, f1_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
#import urllib.parse
from werkzeug.utils import secure_filename
from flask import Flask, request, jsonify
from flask_restful import Api, Resource

UPLOAD_FOLDER = './uploads_f/'  #'/uploads_f'
ALLOWED_EXTENSIONS = set(['tsv', 'csv'])

pd.set_option('display.max_colwidth', -1)

app = Flask(__name__)
api = Api(app)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER


@app.route('/predictsap', methods=["GET", "POST", "PUT"])
def summary():
    if request.method == "POST":
        if request.get_json():
            req = request.get_json()
            dataframe = pd.DataFrame.from_dict(req, orient="index")
            print(dataframe)
            # Map the debit/credit indicator to a binary flag ('S' -> 1, else 0)
            dataframe['Debit_CreditInd.'] = dataframe['Debit_CreditInd.'].apply(lambda x: 1 if x == 'S' else 0)
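A minimal client-side sketch for exercising this endpoint. The host/port and the extra "Amount" field are assumptions; only the Debit_CreditInd. column appears in the excerpt, and the payload shape matches orient="index" above:

import requests

# Hypothetical payload: one record per key.
payload = {
    "0": {"Debit_CreditInd.": "S", "Amount": 125.50},
    "1": {"Debit_CreditInd.": "H", "Amount": 89.00},
}
resp = requests.post("http://localhost:5000/predictsap", json=payload)
print(resp.status_code)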
# coding: utf-8

# # Data types and missing data reference
#
# This is the reference section of the "Data types and missing data" section of the tutorial. For the workbook, click [here](https://www.kaggle.com/residentmario/data-types-and-missing-data-workbook).
#
# In this short section we will look at two inter-related concepts: data types and missing data. This section draws from the [Intro to data structures](https://pandas.pydata.org/pandas-docs/stable/dsintro.html) and [Working with missing data](https://pandas.pydata.org/pandas-docs/stable/missing_data.html) sections of the comprehensive official tutorial.

# In[ ]:

import pandas as pd
reviews = pd.read_csv("../input/winemag-data-130k-v2.csv", index_col=0)
pd.set_option('max_rows', 5)

# ## Data types
#
# The data type of a column in a `DataFrame` or a `Series` is known as its `dtype`.
#
# You can use the `dtype` property to grab the type of a specific column:

# In[ ]:

reviews.price.dtype

# Alternatively, the `dtypes` property returns the `dtype` of _every_ column in the dataset:
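# The excerpt stops just before the follow-on cells; a minimal sketch of the
# two calls the text describes, using the winemag data loaded above (the
# `points` column name is an assumption about that dataset):

# In[ ]:

# The dtype of every column at once.
reviews.dtypes

# In[ ]:

# Columns can be converted between compatible types with astype(),
# e.g. viewing the integer points column as float64.
reviews.points.astype('float64')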
import os
import random

import pandas as pd
from logbook import TestHandler
from pandas.util.testing import assert_frame_equal

from catalyst import get_calendar
from catalyst.exchange.exchange_asset_finder import ExchangeAssetFinder
from catalyst.exchange.exchange_data_portal import DataPortalExchangeBacktest
from catalyst.exchange.utils.exchange_utils import get_candles_df
from catalyst.exchange.utils.factory import get_exchange
from catalyst.exchange.utils.test_utils import output_df, \
    select_random_assets

pd.set_option('display.expand_frame_repr', False)
pd.set_option('precision', 8)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)


class TestSuiteBundle:
    @staticmethod
    def get_data_portal(exchanges):
        open_calendar = get_calendar('OPEN')
        asset_finder = ExchangeAssetFinder(exchanges)

        exchange_names = [exchange.name for exchange in exchanges]
        data_portal = DataPortalExchangeBacktest(
            exchange_names=exchange_names,
            asset_finder=asset_finder,
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw

from src.data_labels import APARTMENT_WALKS_BAD, APARTMENT_WALKS_IBO, APARTMENT_WALKS_RICCI
from src.fast_dtw import dtw_plot
from src.sensor import SensorData

PANDAS_WIDTH = 150
pd.set_option('display.width', PANDAS_WIDTH)

if __name__ == '__main__':
    data_sets = [
        APARTMENT_WALKS_BAD,
        APARTMENT_WALKS_RICCI,
        APARTMENT_WALKS_IBO
    ]
    trainings = []
    tests = []

    # Populate trainings and tests with the CSV recordings in each folder
    for chosen_folder in data_sets:
        files = os.listdir(chosen_folder)
        get_test = True
        for csv_file in files:
            name, ext = os.path.splitext(csv_file)
            if ext != '.csv':
                continue
            csv_file = os.path.join(chosen_folder, csv_file)
            sensordata = SensorData(csv_file)
            if get_test:
                print('Added to Test', name)
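The excerpt ends before the alignment step itself. A minimal sketch of how fastdtw compares two 1-D signals; the signal values here are made up stand-ins for two sensor recordings:

import numpy as np
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw

# Two hypothetical signals of different lengths.
x = np.array([[1.0], [2.0], [3.0], [4.0], [3.0]])
y = np.array([[1.0], [1.5], [2.5], [4.0], [4.0], [3.0]])

distance, path = fastdtw(x, y, dist=euclidean)
print(distance)  # total alignment cost
print(path)      # list of (index_in_x, index_in_y) pairs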
                       between='fb_type', padjust='fdr_bh')

fig, axes = plt.subplots(2, 2, figsize=(9, 4))
metric_types = ['magnitude', 'n_spindles', 'amplitude', 'duration']
p_all = np.zeros((4, 4))
for j_metric_type, metric_type in enumerate(metric_types):
    df_metric_type = stats_df_all.query('metric_type=="{}"'.format(metric_type))
    for j_fb_type, fb_type in enumerate(fb_types):
        ax = axes[j_metric_type // 2, j_metric_type % 2]
        df = df_metric_type.query('fb_type=="{}"'.format(fb_type))
        pd.set_option('display.max_columns', 500)
        # Paired t-test of the metric after vs. before the baseline
        res = ttest(df.query('baseline=="After"')['metric'],
                    df.query('baseline=="Before"')['metric'], paired=True)
        # res = pairwise_ttests(df, dv='metric', within='baseline', subject='subj_id')
        p = res['p-val'].values[0]
        p_all[j_fb_type, j_metric_type] = p
        res_str = '$p_u$={:.3f}\n'.format(p) + r'$Diff_{CI95}$=' + '[{}, {}]'.format(*res['CI95%'].values[0])
        x_before = df.query('baseline=="Before"')['metric'].values
        x_after = df.query('baseline=="After"')['metric'].values
        for j in range(len(x_before)):
            pair = np.array([x_before[j], x_after[j]])
            ax.plot(np.array([0, 2]) + 3 * j_fb_type, pair,
    def train_predict(self, data, time_budget, n_class, schema):
        s1 = time.time()
        seed = SEED
        fix_seed(seed)
        LOGGER.info(f'time_budget:{time_budget}')
        LOGGER.info(f'n_class:{n_class}')
        LOGGER.info(f'node:{data["fea_table"].shape[0]}')
        LOGGER.info(f'edge:{data["edge_file"].shape[0]}')

        # Pre-process data
        process_data = ProcessData(data)
        table = process_data.pre_process(time_budget, n_class, schema)

        # Feature dimension reduction
        feat = Feat()
        process_data.drop_unique_columns(table)
        drop_sum_columns = process_data.drop_excessive_columns(table)
        feat.fit_transform(table, drop_sum_columns)
        LOGGER.info(f'train:test={(table.df["is_test"]!=1).sum()}:{(table.df["is_test"]==1).sum()}')

        # This flag doesn't seem to be used anywhere
        table.large_features = False
        if table.ori_columns.shape[0] > 500:
            table.large_features = True

        model_type_list = ['sage', 'gat', 'tagc', 'gcn']
        repeat = 5
        model_name_list = [f'{model_type_list[i]}{i+len(model_type_list)*j}'
                           for j in range(repeat)
                           for i in range(len(model_type_list))]
        model_type_list = model_type_list * repeat

        LOGGER.info('use node embedding')
        categories = ['node_index', 'degree_bins', 'bin_2-neighbor_mean_degree_bins']
        for model in set(model_type_list):
            LOGGER.info(f"""{model} feature num:{eval(f'table.{model}_columns.shape[0]')}""")
            exec(f'table.{model}_data = process_data.process_gnn_data(table,table.{model}_columns,categories)')

        allmodel = AllModel()
        table.lr_epoch = 16
        table.lr_list = [0.05, 0.03, 0.01, 0.0075, 0.005, 0.003, 0.001, 0.0005]

        train_valid_idx_list, valid_idx_list = split_train_and_valid(
            table, train_rate=0.8, seed=SEED, mode=split_mode)
        train_idx, test_idx = split_train_and_test(table)
        test_idx = test_idx.sort_values()

        run_model = []
        run_type = []
        run_time = {}
        for i in range(len(model_type_list)):
            seed = SEED * (i + 1)
            fix_seed(seed)
            model_type = model_type_list[i]
            model_name = model_name_list[i]

            # Estimate how long 500/300/150-epoch runs of this model type would take
            if model_type not in run_time:
                init_time, one_epoch_time, early_stopping_rounds = allmodel.get_run_time(
                    table, model_type, model_name, train_idx, test_idx, seed=seed)
                run_lr_time = len(table.lr_list) * (init_time + table.lr_epoch * one_epoch_time)
                run_time500 = init_time * 2 + one_epoch_time * (500 + early_stopping_rounds) * 2 + run_lr_time
                run_time300 = init_time * 2 + one_epoch_time * (300 + early_stopping_rounds) * 2 + run_lr_time
                run_time150 = init_time * 2 + one_epoch_time * (150 + early_stopping_rounds) * 2 + run_lr_time
                run_time[model_type] = (run_time500 - run_lr_time, run_time300 - run_lr_time,
                                        run_time150 - run_lr_time, early_stopping_rounds,
                                        init_time, one_epoch_time, run_lr_time)
            else:
                (run_time500, run_time300, run_time150, early_stopping_rounds,
                 init_time, one_epoch_time, run_lr_time) = run_time[model_type]

            s2 = time.time()
            LOGGER.info(f"time_budget:{time_budget}s,used time:{s2-s1:.2f}s,"
                        f"{model_name} model will use {run_time500:.2f}s|{run_time300:.2f}s|{run_time150:.2f}s")

            # Pick the largest epoch budget that still fits in the remaining time
            if s2 - s1 + run_time500 + 5 < time_budget:
                LOGGER.info('train 500 epoch')
                allmodel.V37_fit_transform(table, model_type, model_name, train_valid_idx_list, valid_idx_list,
                                           train_idx, test_idx, mode=split_mode, num_boost_round=500, seed=seed)
                run_model.append(model_name)
                run_type.append(model_type)
            elif s2 - s1 + run_time300 + 5 < time_budget:
                LOGGER.info('train 300 epoch')
                allmodel.V37_fit_transform(table, model_type, model_name, train_valid_idx_list, valid_idx_list,
                                           train_idx, test_idx, mode=split_mode, num_boost_round=300, seed=seed)
                run_model.append(model_name)
                run_type.append(model_type)
            elif s2 - s1 + run_time150 + 5 < time_budget:
                LOGGER.info('train 150 epoch')
                allmodel.V37_fit_transform(table, model_type, model_name, train_valid_idx_list, valid_idx_list,
                                           train_idx, test_idx, mode=split_mode, num_boost_round=150, seed=seed)
                run_model.append(model_name)
                run_type.append(model_type)
            elif len(allmodel.valid_models[0]) == 0:
                this_epoch = int(((time_budget - (s2 - s1 + 5) - run_lr_time) / 2 - init_time) / one_epoch_time
                                 - early_stopping_rounds)
                LOGGER.info(f'short time train {this_epoch} epoch')
                allmodel.V37_fit_transform(table, model_type, model_name, train_valid_idx_list, valid_idx_list,
                                           train_idx, test_idx, mode=split_mode, num_boost_round=this_epoch, seed=seed)
                run_model.append(model_name)
                run_type.append(model_type)
            elif time_budget - (s2 - s1) < 5:
                LOGGER.info('never train; break')
                break
            else:
                LOGGER.info('no train this model; continue')
                continue

        if offline:
            if table.especial:
                df = table.df[['node_index', 'is_test']]
                df = df.merge(data['test_label'], how='left', on='node_index')
                test_label = df.loc[(df['is_test'] == 1) & (table.directed_mask.tolist()), 'label'].astype('int').values
            else:
                test_label = data['test_label']['label'].values
        else:
            test_label = None

        preds1, valid_acc1 = get_preds(0, run_model, run_type, allmodel, model_name_list, table, test_label, valid_idx_list)
        preds2, valid_acc2 = get_preds(1, run_model, run_type, allmodel, model_name_list, table, test_label, valid_idx_list)
        preds = (preds1 + preds2) / 2
        preds = preds.argmax(axis=1).flatten()

        if table.especial:
            LOGGER.info(f'preds\n{preds}')
            df = table.df[['label', 'is_test']]
            df['preds'] = int(df.loc[[not i for i in table.directed_mask.tolist()], 'label'].value_counts().index[0])
            df.loc[(df['is_test'] == 1) & (table.directed_mask.tolist()), 'preds'] = preds
            preds = df.loc[df['is_test'] == 1, 'preds'].values

        LOGGER.info(f"train label\n{data['train_label']['label'].value_counts()/data['train_label'].shape[0]}")
        df_preds = pd.Series(preds, name='preds')
        LOGGER.info(f"preds label\n{df_preds.value_counts()/df_preds.shape[0]}")

        if offline:
            preds1 = preds1.argmax(axis=1).flatten()
            preds2 = preds2.argmax(axis=1).flatten()
            if table.especial:
                LOGGER.info(f'preds1\n{preds1}')
                df = table.df[['label', 'is_test']]
                df['preds'] = int(df.loc[[not i for i in table.directed_mask.tolist()], 'label'].value_counts().index[0])
                df.loc[(df['is_test'] == 1) & (table.directed_mask.tolist()), 'preds'] = preds1
                preds1 = df.loc[df['is_test'] == 1, 'preds'].values

                LOGGER.info(f'preds2\n{preds2}')
                df = table.df[['label', 'is_test']]
                df['preds'] = int(df.loc[[not i for i in table.directed_mask.tolist()], 'label'].value_counts().index[0])
                df.loc[(df['is_test'] == 1) & (table.directed_mask.tolist()), 'preds'] = preds2
                preds2 = df.loc[df['is_test'] == 1, 'preds'].values

            df_test = table.df[['degree', 'label', 'is_test']]
            df_test = df_test.loc[df_test['is_test'] == 1]
            df_test['preds'] = preds
            df_test['label'] = data['test_label']['label'].values
            df_test['acc'] = df_test['preds'] == df_test['label']
            pd.set_option('display.max_rows', 1000)
            print(df_test.groupby('degree')['acc'].mean())
            return preds, valid_acc1, valid_acc2, preds1, preds2
        else:
            return preds
                               key=lambda x: x not in mysubset)]

    def load_data(self, path: str):
        self.data = pd.read_csv(path, sep=';', encoding='cp1251', skiprows=1)
        self._formatting_data()

    def call_stat_by_day(self) -> pd.DataFrame:
        """
        Managers' call statistics by day.
        :return:
        """
        frm = lambda x: '{:,.2f}'.format(x)
        data = self.data.groupby([self.data['Дата'], self.data['Кто звонил']])['Длительность, сек']. \
            agg(['count', 'sum', 'mean']). \
            rename(columns={'count': 'Кол-во звонков',
                            'sum': 'Общая длит. звонков',
                            'mean': 'Сред. длит. звонков'})
        data['Сред. длит. звонков'] = data['Сред. длит. звонков'].map(frm).astype(float)
        return data


if __name__ == '__main__':
    pd.set_option("display.max_columns", None)
    dframe = PandasDataFrame()
    dframe.load_data('static.csv')
    df = dframe.call_stat_by_day().reset_index()
    print(df)
    df.plot(x=['Дата', 'Кто звонил'], y="Кол-во звонков")
    plt.show()
import nltk
import pandas as pd

# Remove punctuation

# Read dataset
pd.set_option('display.max_colwidth', 100)  # show longer message text per cell
data = pd.read_csv('../data/smsspamcollection/SMSSpamCollection',
                   sep='\t', header=None)
data.columns = ['label', 'msg']
print('set columns => data.head():')
print(data.head())

import string
print('\nstring.punctuation:', string.punctuation)

def remove_punctuation(txt):
    # Keep every character of txt that is not a punctuation mark
    txt_nopunct = "".join([c for c in txt if c not in string.punctuation])
    return txt_nopunct

data['msg_clean'] = data['msg'].apply(lambda x: remove_punctuation(x))
print('\nremove punctuation => data.head():')
print(data.head())

# Tokenization
import re
def labelquery():
    query = "select ?s ?p ?o where { ?s ?p ?o. } limit 10"
    namespace = request.cookies.get('namespace', os.environ.get("ANNOTATION_NAMESPACE"))
    template = ''
    slots = ''
    results = {}
    current_labelquery_type = request.args.get("showtype", list(labelquery_types)[0], type=str)

    modeldir = os.environ.get("MODELDIR")
    labelqueries, templates = load_labelqueries_templates(modeldir, filter_disabled=False)

    if request.method == 'POST':
        form = dict(request.form)
        print(form)
        action = request.form.get('action')

        if action in ['query', 'count', 'setquery', 'save-template', 'save-labelquery']:
            template = request.form.get('template')
            slots = request.form.get('slots')
            label = current_labelquery_type
            value = 'http://karmaresearch.net/' + request.form.get('value', '')
            query = request.form.get('query')
            if not query:
                query = templates[request.form.get('template')]

        if action in ['query', 'count']:
            fmt_query = fill_template(query, slots)
            print(fmt_query)
            response = do_sparql(os.environ.get("KB"), fmt_query, namespace)
            if response.ok:
                results = response.json()
                if action == 'count':
                    qid = request.form.get('qid')
                    var_path_template = annotate.labelquery_types[label]['path']
                    n = len(LabelQuery.transformations_from_results(response.json(), var_path_template, value))
                    labelqueries[qid]['scores']['n_results'] = n
                    save_labelqueries_templates(modeldir, labelqueries, templates)
            else:
                print(response.text)

        if action == 'save-template':
            templates[template] = query
            # (Re-)calculate all scores for labelqueries with this template
            basedir = os.environ.get("BASEDIR")
            annotationdir = os.environ.get("ANNOTATIONDIR")
            namespace = os.environ.get("ANNOTATION_NAMESPACE")
            for qid, lq in labelqueries.items():
                if lq['template'] == template:
                    scores = eval_labelquery(templates[lq['template']], lq['slots'],
                                             lq['label'], lq['value'],
                                             basedir, annotationdir, namespace, name='')
                    print(qid, scores)
                    labelqueries[qid]['scores'] = scores

        if action == 'build-cache':
            # (Re-)calculate all scores for labelqueries of this labeltype
            basedir = os.environ.get("BASEDIR")
            annotationdir = os.environ.get("ANNOTATIONDIR")
            namespace = os.environ.get("ANNOTATION_NAMESPACE")
            for qid, lq in labelqueries.items():
                if lq['label'] == current_labelquery_type and lq['template'] in templates:
                    scores = eval_labelquery(templates[lq['template']], lq['slots'],
                                             lq['label'], lq['value'],
                                             basedir, annotationdir, namespace, name='')
                    print(qid, scores)
                    labelqueries[qid]['scores'] = scores

        if action == 'delete-labelquery':
            qid = request.form.get('qid')
            del labelqueries[qid]

        if action == 'save-labelquery':
            qid = request.form.get('qid')
            enabled = request.form.get('enabled')
            labelqueries[qid] = {
                'template': template,
                'slots': slots,
                'label': label,
                'value': value,
                'enabled': enabled,
            }
            if request.form.get('template') not in templates:
                templates[request.form.get('template')] = ''
            # (Re-)calculate labelquery scores
            basedir = os.environ.get("BASEDIR")
            annotationdir = os.environ.get("ANNOTATIONDIR")
            namespace = os.environ.get("ANNOTATION_NAMESPACE")
            scores = eval_labelquery(templates[template], slots, label, value,
                                     basedir, annotationdir, namespace, name='')
            print('re-calculated', scores)
            labelqueries[qid]['scores'] = scores

        if action in ['save-template', 'delete-labelquery', 'save-labelquery', 'build-cache']:
            # Save the queries themselves, including scores that have possibly been updated
            save_labelqueries_templates(modeldir, labelqueries, templates)

            # Run the updated queries on the gold namespace and save the results
            os.makedirs(os.path.join(modeldir, 'labelqueries', 'cache'), exist_ok=True)
            results_fname = os.path.join(modeldir, 'labelqueries', 'cache', 'gold.json')
            namespace = os.environ.get("ANNOTATION_NAMESPACE")
            kbdomain = os.environ.get("KB")

            if action == 'save-template':
                selected_queries = [str(qid) for qid, lq in labelqueries.items()
                                    if lq['template'] == template]
            elif action == 'build-cache':
                selected_queries = [str(qid) for qid, lq in labelqueries.items()
                                    if lq['label'] == current_labelquery_type and lq['template'] in templates]
            else:
                selected_queries = [str(qid)]

            import supervise
            labelquery_results = supervise.cache_labelquery_results(
                modeldir, namespace, kbdomain,
                selected_queries=selected_queries,
                results_fname=results_fname,
                verbose=True)

            # Save the entire labeling matrix using the query results
            basedir = os.environ.get("BASEDIR")
            annotationdir = os.environ.get("ANNOTATIONDIR")
            labeled_metas = list(annotate.get_metadata(basedir, annotationdir))
            supervise.save_query_analysis(modeldir, labeled_metas, current_labelquery_type)

    view = request.args.get("view", None, type=str)
    if view:
        os.makedirs(os.path.join(modeldir, 'labelqueries', 'cache'), exist_ok=True)
        view_fname = os.path.join(modeldir, 'labelqueries', 'cache',
                                  f'{current_labelquery_type}-{view}.csv')
        if os.path.exists(view_fname):
            pd.set_option('display.max_colwidth', -1)
            view = pd.read_csv(view_fname)
            view.columns = [c + '<a class="sort_btn" href="#"/>' for c in view.columns]

            def make_url(c):
                c = 'http://karmaresearch.net/' + c if c != 'None' else c
                return url_for('view', label=c, showtype=current_labelquery_type)

            view[view.columns[0]] = [f'<a href="{make_url(c)}">{c}</a>' for c in view[view.columns[0]]]
            view = view.to_html(index=False, float_format='%.2f', border=0,
                                escape=False, classes=['sortable'])
        else:
            view = f'{view} file does not exist'

    return render_template(
        'labelquery.html',
        query=query,
        template=template,
        slots=slots,
        view=view,
        results=results,
        labelqueries=labelqueries,
        labelquery_types=labelquery_types,
        current_labelquery_type=current_labelquery_type,
        templates=templates,
        classes=sorted([c.replace('http://karmaresearch.net/', '') for c in get_classes()]),
        new_qid=str(max([int(i) for i in labelqueries], default=0) + 1),
        namespace=request.cookies.get('namespace', os.environ.get("ANNOTATION_NAMESPACE")),
        all_namespaces=get_namespaces(os.environ.get("KB")),
    )
import os
import sys
import random
import math
import json
import glob
from datetime import datetime
from collections import Counter

import numpy as np
import tensorflow as tf
import cv2
import matplotlib.pyplot as plt
plt.rcParams["font.size"] = 15
#import seaborn as sns
from tqdm import tqdm
import pandas as pd
pd.set_option("display.max_rows", 101)
from PIL import Image

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

#input_dir = "./advancedML/data/"

'''
One dataset is around 200 MB or more.
The size of one batch is mainly determined by the batch size, the image size and the maximum object number:
  batch size = IMAGES_PER_GPU * GPU_COUNT in config.py
  image size is related to MAX_IMG_DIM, MIN_IMG_DIM in config.py
  maximum object number = MAX_GT_OBJECTS in config.py
'''
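A small hedged sketch of the relationship the comment block describes; the class and attribute values here are hypothetical, with only the names and the batch-size formula taken from the config.py conventions mentioned above:

# Illustrative config: batch size is the product of images per GPU and GPU count.
class TrainingConfig:
    GPU_COUNT = 2
    IMAGES_PER_GPU = 4
    MIN_IMG_DIM = 512       # smallest image side after resizing
    MAX_IMG_DIM = 1024      # largest image side after resizing
    MAX_GT_OBJECTS = 100    # maximum ground-truth objects per image

    @property
    def batch_size(self):
        return self.IMAGES_PER_GPU * self.GPU_COUNT

print(TrainingConfig().batch_size)  # 8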
            num_vow += 1
        else:
            num_con += 1
        if (num_con > 0 and num_vow > 0) and (num_con > 1 or num_vow > 1):
            break
        j += 1
    part2 = name2[j:len(name2)]
    new_name = part1 + part2
    return new_name


# Formatting to allow printing of an entire dataframe in PyCharm
pd.options.display.width = None
pd.options.display.max_columns = None
pd.set_option('display.max_rows', 42)
pd.set_option('display.max_columns', 42)

# Creatures the player will select from
lion = Creature("Lion", 240, 4, False, True, False, False)
python = Creature("Python", 30, 0, False, True, True, False)
dog = Creature("Dog", 90, 4, False, True, False, False)
human = Creature("Human", 160, 2, False, False, False, False)
trout = Creature("Trout", 3, 0, False, True, True, False)
eagle = Creature("Eagle", 10, 2, True, True, False, False)
dragon = Creature("Dragon", 2700, 4, True, True, True, True)
ant = Creature("Ant", 0.0000022046, 6, False, False, False, False)
octopus = Creature("Octopus", 80, 8, False, False, True, False)

creatures = [lion, python, dog, human, trout, eagle, dragon, ant, octopus]