def main():
    parser = ArgumentParser(
        'analyse_data.py', description="Analyze Jenkins build logs"
    )
    parser.add_argument(
        '--since', type=dateutil.parser.parse,
        help="Only consider builds since this date"
    )
    opts = parser.parse_args()
    builds = load_build_data(since=opts.since)

    pandas.set_option('expand_frame_repr', False)
    print("Showing data since: ", opts.since)
    print("")
    print_summary_results(builds)
    print("")
    print("")
    build_data = make_subbuild_data_frame(builds)
    print_top_failing_jobs(build_data)
    print("")
    print("")
    classified_failure_data = get_classified_failures(build_data)
    print_common_failure_reasons(classified_failure_data)
    print("")
    print("")
    print_common_failure_daily(classified_failure_data)
    print("")
    print("")
    print_commonly_failing_tests(build_data)
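A minimal sketch of the imports and option handling this snippet assumes (argparse's ArgumentParser, dateutil, pandas); the date string is only an illustration:

# Sketch of the assumed imports; '2024-01-01' is an illustrative value.
from argparse import ArgumentParser
import dateutil.parser
import pandas

pandas.set_option('expand_frame_repr', False)  # same option the snippet sets
parser = ArgumentParser('analyse_data.py', description="Analyze Jenkins build logs")
parser.add_argument('--since', type=dateutil.parser.parse,
                    help="Only consider builds since this date")
opts = parser.parse_args(['--since', '2024-01-01'])
print(opts.since)  # datetime.datetime(2024, 1, 1, 0, 0)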
Example 2
def main(argv):

    pd.set_option('display.width', 200)
    pd.set_option('display.height', 500)

    warnings.filterwarnings("ignore")

    global file_path, RMSLE_scorer

    # RMSLE_scorer
    RMSLE_scorer = metrics.make_scorer(RMSLE, greater_is_better = False)

    if(platform.system() == "Windows"):
        file_path = 'C:/Python/Others/data/Kaggle/Caterpillar_Tube_Pricing/'
    else:
        file_path = '/home/roshan/Desktop/DS/Others/data/Kaggle/Caterpillar_Tube_Pricing/'

########################################################################################################################
#Read the input file , munging and splitting the data to train and test
########################################################################################################################
    Train_DS      = pd.read_csv(file_path+'competition_data/train_set.csv',sep=',')
    Actual_DS     = pd.read_csv(file_path+'competition_data/test_set.csv',sep=',')
    Tube_DS       = pd.read_csv(file_path+'competition_data/tube.csv',sep=',')
    Bill_DS       = pd.read_csv(file_path+'competition_data/bill_of_materials.csv',sep=',')
    Spec_DS       = pd.read_csv(file_path+'competition_data/specs.csv',sep=',')
    Tube_End_DS   = pd.read_csv(file_path+'competition_data/tube_end_form.csv',sep=',')
    Comp_DS       = pd.read_csv(file_path+'competition_data/components_2.csv',sep=',')
    Sample_DS     = pd.read_csv(file_path+'sample_submission.csv',sep=',')


    Train_DS, Actual_DS, y =  Data_Munging(Train_DS,Actual_DS,Tube_DS,Bill_DS,Spec_DS,Tube_End_DS, Comp_DS)

    pred_Actual = RFR_Regressor(Train_DS, y, Actual_DS, Sample_DS, grid=False)
Example 3
def manlab_p(df_pkl, rx,  col):
    """
    Takes in pkl file, regex query, and column. Reads in pkl file
    as a pandas df. Subsets the regex query, and manually changes labels
    to df['col'] based on the post. Resaves the pkl file to update labels.

    Inputs:
        - pkl file
        - regex query
        - column

    Outputs:
        - pandas df

    """

    pd.set_option('max_colwidth' , 200)
    df = pd.read_pickle(df_pkl)
    df_res = df[df['post_type']== "responder"]
    df_lab = df_res[df_res['post'].str.contains(rx , regex = True)]
    proceed = input("Proceed: ")
    n = 1
    while proceed  == 'y':
        if n%10 == 0:
            proceed = input("Proceed: ")
        ind = int(np.random.choice(df_lab.index, size = 1))
        label = (input("Personal(0) {} / {}: ".format(n, df['post'].loc[ind])))
        if label == 'x':
            break
        label = int(label)
        df.loc[ind, col] = label
        n += 1
    #safety saving
    df.to_pickle('df_man.pkl')
    return df
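A hypothetical call of the labelling helper above; the pickle name 'df_labels.pkl', the regex, and the column name 'personal' are placeholders, and the pickled DataFrame must contain 'post', 'post_type' and the target label column:

# Hypothetical invocation; file name, regex and column are placeholders.
labelled_df = manlab_p('df_labels.pkl', r'\bthank(s| you)\b', 'personal')
print(labelled_df['personal'].value_counts())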
Example 4
def main(directory, filename, max_rows, plots, stacked, style_line, style_dots):
    directory = os.path.expanduser(directory)
    filename = os.path.join(directory, filename)
    print("Reading %r" % filename)
    pd.set_option('display.max_rows', max_rows)

    df = pd.read_csv(filename)
    df[COL_T] = pd.to_datetime(df[COL_T])
    df = df.set_index(COL_T)
    print(df)

    if plots == '':
        plots = df.columns
    else:
        plots = plots.split(',')

    if stacked:
        fig, axs = plt.subplots(nrows=len(plots))
    
        for i, plot in enumerate(plots):
            ax = axs[i]
            ax.plot(df.index, df[plot], style_line)
            ax.plot(df.index, df[plot], style_dots)
            ax.set_xlabel(COL_T) #, fontdict=font)
            ax.set_ylabel(plot)
            #ax.set_title(plot)
    else:
        ax = df[plots].plot(style=style_line)

    plt.show()
Example 5
def print_table(table, name=None, fmt=None):
    """Pretty print a pandas DataFrame.

    Uses HTML output if running inside Jupyter Notebook, otherwise
    formatted text output.

    Parameters
    ----------
    table : pandas.Series or pandas.DataFrame
        Table to pretty-print.
    name : str, optional
        Table name to display in upper left corner.
    fmt : str, optional
        Formatter to use for displaying table elements.
        E.g. '{0:.2f}%' for displaying 100 as '100.00%'.
        Restores original setting after displaying.

    """
    if isinstance(table, pd.Series):
        table = pd.DataFrame(table)

    if fmt is not None:
        prev_option = pd.get_option('display.float_format')
        pd.set_option('display.float_format', lambda x: fmt.format(x))

    if name is not None:
        table.columns.name = name

    display(table)

    if fmt is not None:
        pd.set_option('display.float_format', prev_option)
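A minimal usage sketch for print_table, assuming it runs inside a Jupyter notebook where display() is available (as the docstring implies); the numbers are made up:

# Usage sketch (made-up data); fmt follows the docstring's '{0:.2f}%' example.
import pandas as pd

returns = pd.Series([12.3456, 7.8901], index=['2021', '2022'])
print_table(returns, name='Annual return', fmt='{0:.2f}%')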
def print_full(x):
    '''
    Helper function to plot the *full* dataframe.
    '''
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')
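The same effect can also be had with pandas' option_context context manager, which restores the option automatically; a small sketch:

# Equivalent sketch using pd.option_context (restores the option on exit).
import pandas as pd

x = pd.DataFrame({'value': range(300)})
with pd.option_context('display.max_rows', len(x)):
    print(x)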
Example 7
def buildModel(df):
	train_y = df['arr_del15'][:train_len]
	train_x = df[cols][:train_len]

	# transform categorical features
	train_x['unique_carrier'] = pd.factorize(train_x['unique_carrier'])[0]
	train_x['dep_conditions'] = pd.factorize(train_x['dep_conditions'])[0]
	train_x['arr_conditions'] = pd.factorize(train_x['arr_conditions'])[0]
	
	pd.set_option('display.max_rows', 500)
	print(train_x)

	# train_x['origin'] = pd.factorize(train_x['origin'])[0]
	#	train_x['dest'] = pd.factorize(train_x['dest'])[0]
	# print(train_x)
	train_x = enc.fit_transform(train_x)
	print(train_x.shape)

	# Create Random Forest classifier with 50 trees
	clf_rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
	clf_rf.fit(train_x.toarray(), train_y)

	del train_x, train_y
	print("Model built")
	return clf_rf
Example 8
def scrape():
    successful_scrapes = 0  # keep track of scrapes that went through
    for i in range(len(advertisers)):  # per advertiser
        raw_advertiser = advertisers[i].encode(
            "ascii", "xmlcharrefreplace").lower()
        print "advertiser raw:", raw_advertiser
        advertiser = raw_advertiser
        if "and" in advertiser:
            advertiser = advertiser.split(
                "and")[0].strip(" ")  # separate multiple
            # advertisers and search for only the first listed
        if ".com" in advertiser:
            advertiser = advertiser.replace(
                ".com", "")  # could prevent search result
        # get rid of spaces for file name
        stripped_advertiser = advertiser.replace(" ", "_")
        advertiser = urllib2.quote(advertiser)  # url safe version
        print "advertiser (url safe):", advertiser
        print "Working on advertiser {} out of {} ({})".format(
            i + 1, len(advertisers), raw_advertiser)
        try:
            short_url = rstyle_link(advertiser)  # get shortlink
            short_resp = requests.get(short_url)
            short_data = short_resp.content.encode("utf-8")  # generate unicode
        except Exception as error:
            print "Error while scraping. No results found for advertiser"
            continue  # nothing else to do without the result
        try:
            follow_on_url = re.search('<!-- (.*) -->', short_data).group(1)
            print "follow on url:", follow_on_url
            follow_on_resp = requests.get(follow_on_url)
            print "final follow on url:", follow_on_resp.url
            follow_on_data = str(follow_on_resp.content)  # final html
            text_cleaner(stripped_advertiser, follow_on_data.decode("utf-8"))
            print "advertiser {} scraped".format(i + 1)
            successful_scrapes += 1  # great work - scrape successful
        except Exception as error:
            print error

    print "Done with collecting the job postings!"
    print "There were", successful_scrapes, "scrapes performed successfully."

    intermediate_total_skills = fashion_dict
    overall_total_skills = {}
    for key in intermediate_total_skills:
        if intermediate_total_skills[key] > 0:
            overall_total_skills[key] = intermediate_total_skills[key]

    final_frame = pd.DataFrame(overall_total_skills.items(), columns=[
        "Term", "NumPostings"])
    pd.set_option('display.height', 500000)  # this is just an arbitrary max
    pd.set_option('display.max_rows', 500000)
    final_frame.NumPostings = (
        (final_frame.NumPostings)*100)/successful_scrapes
    final_frame.sort_values(by="NumPostings", ascending=False, inplace=True)
    import time
    with open('sites/word_freq_{}.txt'.format(int(time.time())), 'w') as f:
        f.write(str(final_frame))  # why not write out the analytics to file?

    return final_frame  # End of the function
Example 9
def main(argv):

    pd.set_option("display.width", 200)
    pd.set_option("display.height", 500)

    warnings.filterwarnings("ignore")

    global file_path, gini_scorer

    # Normalized Gini Scorer
    gini_scorer = metrics.make_scorer(normalized_gini, greater_is_better=True)

    if platform.system() == "Windows":
        file_path = "C:/Python/Others/data/Kaggle/Liberty_Mutual_Group/"
    else:
        file_path = "/home/roshan/Desktop/DS/Others/data/Kaggle/Liberty_Mutual_Group/"

    ########################################################################################################################
    # Read the input file , munging and splitting the data to train and test
    ########################################################################################################################
    Train_DS = pd.read_csv(file_path + "train.csv", sep=",", index_col=0)
    Actual_DS = pd.read_csv(file_path + "test.csv", sep=",", index_col=0)
    Sample_DS = pd.read_csv(file_path + "sample_submission.csv", sep=",")
    Parms_XGB_DS = pd.read_csv(file_path + "Parms_DS_XGB_1001.csv", sep=",")
    Parms_RF_DS = pd.read_csv(file_path + "Parms_DS_RF2.csv", sep=",")

    Train_DS, Actual_DS, y = Data_Munging(Train_DS, Actual_DS)

    pred_Actual = RFR_Regressor(Train_DS, y, Actual_DS, Sample_DS, Parms_RF_DS, Grid=False, Ensemble=False)
Example 10
def summarize_results_basic_size(sim_num, file_name):
	pd.set_option('display.width', 99999)
	pd.set_option('display.max_rows', 400)

	data = pd.read_csv(file_name, sep = '\t')
	data_summarize = pd.DataFrame()

	rest_wait_avgs = []
	guest_wait_avgs = []

	for i in range(sim_num):
		rest_case_avg = data.iloc[:,5+i*8].sum()/len(data.index)
		rest_wait_avgs.append(rest_case_avg)
		guest_case_avg = data.iloc[:,7+i*8].sum()/len(data.index)
		guest_wait_avgs.append(guest_case_avg)

	data_summarize['rest_wait_avgs'] = rest_wait_avgs
	data_summarize['guest_wait_avgs'] = guest_wait_avgs

	summarize_file_name = "sim_summarize.txt"

	data_summarize.to_csv(summarize_file_name, sep='\t', header=True, index=False)

	print rest_wait_avgs
	print guest_wait_avgs
Example 11
def main(argv):

    pd.set_option('display.width', 200)
    pd.set_option('display.height', 500)

    warnings.filterwarnings("ignore")

    global file_path, Train_DS1, Featimp_DS

    #random.seed(1)

    if(platform.system() == "Windows"):

        file_path = 'C:/Python/Others/data/Kaggle/Springleaf_Marketing_Response/'
    else:
        file_path = '/home/roshan/Desktop/DS/Others/data/Kaggle/Springleaf_Marketing_Response/'

########################################################################################################################
#Read the input file , munging and splitting the data to train and test
########################################################################################################################
    #Train_DS      = pd.read_csv(file_path+'train.csv',sep=',')
    #Actual_DS     = pd.read_csv(file_path+'test.csv',sep=',')

    Train_DS      = pd.read_csv(file_path+'train_25000.csv',sep=',', index_col=0,nrows = 5000 ).reset_index(drop=True)
    Actual_DS     = pd.read_csv(file_path+'test_25000.csv',sep=',', index_col=0,nrows = 5000).reset_index(drop=True)

    Sample_DS     = pd.read_csv(file_path+'sample_submission.csv',sep=',')
    Filter_DS     = pd.read_csv(file_path+'Min_Max_DS_Analysis2.csv',sep=',')
    Featimp_DS    = pd.read_csv(file_path+'feature_imp.csv',sep=',')

    Train_DS, Actual_DS, y =  Data_Munging(Train_DS,Actual_DS, Filter_DS)

    #pred_Actual = XGB_Classifier(Train_DS, y, Actual_DS, Sample_DS, Grid=False)
    pred_Actual  = RFC_Classifier(Train_DS, y, Actual_DS, Sample_DS, Grid=False)
def parsefrcschedule():
        
    with open('currentsched.html') as file:
        
        htmldata = file.read()
        #print(htmldata)
        
        pandas.set_option('display.width', 1000)
             
        eventlist = get_dates(htmldata)
        print('Data found for', len(eventlist), 'Regional events\n')
        #pprint(eventlist)
        
        regionalLocs = formLocationList(eventlist)        
        print('\nPreparing mileage search for', len(regionalLocs), 'Regional events\n')
        maprequest = prepmaprequest(home, regionalLocs)
        
        dmatrix = getdistancematrix(maprequest)           
        #pprint(dmatrix)
        
        print('Merging distance and event information\n')
        eventlist = mergeEventMilage(eventlist, dmatrix)
        
                
        final = evaluatedates(eventlist)
        
        missingevents(eventlist)
Example 13
 def blackbox_method_int(self, output, func_name):
     """
     Helper method to reuse code for testing numpy array outputs from SIP model
     :param output: String; Pandas Series name (e.g. column name) without '_out'
     :return:
     """
     try:
         # display model output in scientific notation
         pd.set_option('display.float_format','{:.4E}'.format)
         logging.info('### blackbox out_' + output)
         logging.info(iec_calc.pd_obj_out)
         result = iec_calc.pd_obj_out["out_" + output]
         expected = iec_calc.pd_obj_exp["exp_" + output]
         tab = pd.concat([result, expected], axis=1)
         #print(" ")
         #print(tabulate(tab, headers='keys', tablefmt='fancy_grid'))
         #npt.assert_array_almost_equal(result, expected, 4, '', True)
         rtol = 1e-5
         npt.assert_allclose(result,expected,rtol,0,'',True)
     finally:
         tab = pd.concat([result, expected], axis=1)
         print("\n")
         print(func_name)
         print(tabulate(tab, headers='keys', tablefmt='rst'))
     return
Example 14
def response_to_dataframe(resp, reserved, **frame_params):
    expand_tags = frame_params.pop('expand_tags', True)
    enc_resp = []
    fields = ['tags', 'key']
    for el in resp:
        for field in fields:
            dictionary = el.pop(field, None)
            if dictionary is not None:
                for k, v in dictionary.items():
                    if (expand_tags and (k in reserved)) or not expand_tags:
                        k = '{}.{}'.format(field, k)
                    el[k] = v
        if 'date' in el:
            # Message or Property
            el['date'] = to_date(el['date'])
        else:
            # Entity
            el['createdDate'] = to_date(el['createdDate'])
            if 'lastInsertDate' in el:
                el['lastInsertDate'] = to_date(el['lastInsertDate'])
        enc_resp.append(el)
    import pandas as pd
    pd.set_option("display.expand_frame_repr", False)
    pd.set_option('max_colwidth', -1)
    return pd.DataFrame(enc_resp, **frame_params)
Example 15
def pandas_repr(mat, col_names=None, row_names=None, margin=2, **pd_options):
    import pandas as pd
    for k, v in pd_options.items():
        k = k.replace('__', '.')
        pd.set_option(k, v)
    df = pd.DataFrame(data=mat, index=row_names, columns=col_names)
    return repr(df).replace('\n', '\n' + ' ' * margin)
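A small usage sketch for pandas_repr; the double-underscore keys map to pandas option names as shown above (display__width becomes 'display.width'):

# Usage sketch; display__width is translated to the 'display.width' option.
print(pandas_repr([[1, 2], [3, 4]],
                  col_names=['a', 'b'],
                  row_names=['r1', 'r2'],
                  margin=4,
                  display__width=120))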
Example 16
def transform(db_table):
    df = db_table.fillna(0)
    df.loc[df.country == "Argentina", "country"] = "Argentina + Uruguay"
    df.loc[df.country == "Uruguay", "country"] = "Argentina + Uruguay"
    df.loc[df.country == "Malaysia", "country"] = "Malaysia + Singapore"
    df.loc[df.country == "Singapore", "country"] = "Malaysia + Singapore"

    combined_countries = df.groupby(["country","report_date"],as_index=False, sort=False).sum()
    combined_total = df.groupby(["report_date"],as_index=False, sort=False).sum()

    combined_total["country"] = "Total"

    daily_report = combined_total.append(combined_countries,True)

    daily_report["avg_pax/trip"] = daily_report["pax_transported"]/daily_report["trip_realized"]
    daily_report["Realize_Trip_Yield"] = 100*(daily_report["trip_realized"]/daily_report["trip_offered"])
    daily_report["Avg_Realized_Km"] = daily_report["seats_km"]/daily_report["pax_transported"]
    daily_report["Driver_Cancel_Rate"] = 100*daily_report["driver_cancelation"]/(daily_report["pax_transported"]+daily_report["pax_cancelation"]+daily_report["driver_rejection"]+ daily_report["driver_cancelation"])
    daily_report["Pax_Cancel_Rate"] = 100*daily_report["pax_cancelation"]/(daily_report["pax_transported"]+daily_report["pax_cancelation"]+daily_report["driver_rejection"]+ daily_report["driver_cancelation"])

    pd.set_option('precision', 2)
    daily_report = daily_report.fillna(0)

    from datetime import date, timedelta
    yesterday = date.today() - timedelta(1)
    df_sorting = daily_report.loc[daily_report.report_date == yesterday, ["country", "pax_transported"]]
    mapped_values = df_sorting.set_index('country')['pax_transported'].to_dict()
    daily_report['sort'] = daily_report['country'].map(lambda x: mapped_values[x])
    daily_report.sort(['sort', 'country', 'report_date'], ascending=[0,1,0], inplace=True)
    
    return daily_report
Example 17
def transform(db_table):
    db_table = db_table.convert_objects(convert_numeric=True)
    db_table['country'] = db_table['country'].astype(str)
    db_table['report_date'] = db_table['reference_date'].astype(datetime)
    db_table.drop('reference_date', axis=1, inplace=True)

    db_table = db_table.fillna(0)
    total = db_table.groupby(["report_date"],as_index=False, sort=False).sum()
    total["id"] = "Total"
    total["country"] = "Total"

    daily_report = total.append(db_table,True)

    daily_report["avg_pax/trip"] = daily_report["pax_transported"]/daily_report["trip_realized"]
    daily_report["Realize_Trip_Yield"] = 100*(daily_report["trip_realized"]/daily_report["trip_offered"])
    daily_report["Avg_Realized_Km"] = daily_report["seats_distance"]/daily_report["pax_transported"]
    daily_report["Driver_Cancel_Rate"] = 100*daily_report["trip_cancelation"]/(daily_report["pax_transported"])
    daily_report["Pax_Cancel_Rate"] = 100*daily_report["booking_cancelation"]/(daily_report["pax_transported"])

    pd.set_option('precision', 2)
    daily_report = daily_report.fillna(0)

    daily_report.drop('id', axis=1, inplace=True)

    from datetime import date, timedelta
    yesterday = date.today() - timedelta(1)
    df_sorting = daily_report.loc[daily_report.report_date == yesterday, ["country", "pax_transported"]]
    mapped_values = df_sorting.set_index('country')['pax_transported'].to_dict()
    daily_report['sort'] = daily_report['country'].map(lambda x: mapped_values[x])
    daily_report.sort(['sort', 'country', 'report_date'], ascending=[0,1,0], inplace=True)
    
    return daily_report
Example 18
def main(argv):

    pd.set_option('display.width', 500)
    pd.set_option('display.height', 500)
    pd.options.mode.chained_assignment = None  # default='warn'
    gc.enable()


########################################################################################################################
#Read the input file , munging and splitting the data to train and test
########################################################################################################################
    dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d')

    #train =  pd.read_csv('C:/Python/Others/data/Kaggle/Walmart_Recruiting/train.csv',sep=',')
    #actual =  pd.read_csv('C:/Python/Others/data/Kaggle/Walmart_Recruiting/test.csv',sep=',')
    #Sample_DS = pd.read_csv('C:/Python/Others/data/Kaggle/Walmart_Recruiting/sampleSubmission.csv',sep=',')
    Keys = pd.read_csv('C:/Python/Others/data/Kaggle/Walmart_Recruiting/key.csv',sep=',')
    Weather = pd.read_csv('C:/Python/Others/data/Kaggle/Walmart_Recruiting/weather 3.csv',sep=',')

    train =  pd.read_csv('C:/Python/Others/data/Kaggle/Walmart_Recruiting/X_train.csv',sep=',')
    CV =  pd.read_csv('C:/Python/Others/data/Kaggle/Walmart_Recruiting/X_CV.csv',sep=',')

    Y_train = train.units.values
    Y_CV    = CV.units.values
    X_train = train.drop(['units'], axis=1)
    X_CV    = CV.drop(['units'], axis=1)

    print(np.shape(X_train))
    print(np.shape(X_CV))

    p_cv_RFC = RanFst_Regressor(X_train, X_CV, Y_train,Y_CV)
def main(argv):

    pd.set_option('display.width', 200)
    pd.set_option('display.height', 500)

    warnings.filterwarnings("ignore")

    global file_path, Train_DS1, Featimp_DS

    #random.seed(1)

    if(platform.system() == "Windows"):

        file_path = 'C:/Python/Others/data/Kaggle/Walmart_Recruiting_TTC/'
    else:
        file_path = '/home/roshan/Desktop/DS/Others/data/Kaggle/Walmart_Recruiting_TTC/'

########################################################################################################################
#Read the input file , munging and splitting the data to train and test
########################################################################################################################
    #Train_DS      = pd.read_csv(file_path+'train.csv',sep=',')
    #Actual_DS     = pd.read_csv(file_path+'test.csv',sep=',')

    Train_DS    = pd.read_csv(file_path+'train_50000.csv',sep=',',index_col=0,nrows = 8000)
    Actual_DS   = pd.read_csv(file_path+'test_50000.csv',sep=',',index_col=0,nrows = 8000)

    Sample_DS     = pd.read_csv(file_path+'sample_submission.csv',sep=',')

    #For testing only
    # Train_DS      = pd.read_csv(file_path+'train_100000.csv',sep=',', index_col=0,nrows = 1000 ).reset_index(drop=True)
    # Actual_DS     = pd.read_csv(file_path+'test_100000.csv',sep=',', index_col=0,nrows = 1000).reset_index(drop=True)

    Data_Munging(Train_DS,Actual_DS)
Example 20
def show_shared_capsules():
    guids = 'A|B'.split("|")
    pandas.set_option('display.width', 5000)
    pandas.set_option('max_colwidth', 5000)
    for guid in guids:
        resp = show_capsules(guid)['status']
        header_list = ['vmid', 'role', 'vm_tou', 'type', 'full_access', 'users_full_access', 'roles']
        df = pandas.DataFrame(columns=header_list)
        for res in resp:
            roles = res['roles']
            roles_list = ''
            for role in roles:
                roles_list += ' [' + role['guid'] + ' ' + ' | ' +  role['role'] + ' | tou:'  +  str(role['tou']) \
                              + ' | fa:' + str(role['full_access']) + ' ] '
            df2 = pandas.DataFrame([[ res['vmid'],
                                      res['role'],
                                      str(res['vm_tou']),
                                      res['type'],
                                      str(res['full_access']) ,
                                      str(res['user_full_access']),
                                      roles_list]], columns=header_list)
            df = df.append(df2, ignore_index=True)
        if df.empty != True:
            print('------------------user: ****** -------------------------')
            print(df)
            print('-------------------------------------------------')
Example 21
def print_full(df): 
    '''
    print all rows of pd.DataFrame
    '''
    pd.set_option('display.max_rows', len(df))
    print(df)
    pd.reset_option('display.max_rows')
Example 22
def main(directory, filename_data_in, max_rows, max_frames, font):
    logging.basicConfig(level=logging.INFO)
    pd.set_option('display.max_rows', max_rows)
    
    if max_frames == -1:
        max_frames = None

    directory = os.path.expanduser(directory)

    overlay = MyVideoOverlay(directory, filename_data_in, font)
    
    overlay.fields = ['frame', 'pos']
    #overlay.fields = ['t0', 'pos']
    
    #overlay.data_formatter = DataFormatter()
    
    overlay.data_formatter.key_format_default = '%13s'
    #overlay.data_formatter.d_key_format = {
    #    'frame': '%s',
    #}

    #overlay.data_formatter.value_format_default = '%s'
    overlay.data_formatter.d_value_format = {
        'frame': '%06d',
        'pos': '%05.1f',
    #    't0': '%07.3f'
    }


    overlay.create_images(framenumber_max=max_frames)
Example 23
def transform(table):
	df = table.fillna(0)
	df.loc[df.country == "Argentina", "country"] = "Argentina + Uruguay"
	df.loc[df.country == "Uruguay", "country"] = "Argentina + Uruguay"
	df.loc[df.country == "Malaysia", "country"] = "Malaysia + Singapore"
	df.loc[df.country == "Singapore", "country"] = "Malaysia + Singapore"

	combined_countries = df.groupby(["country","report_reference","reference_number","report_year"],as_index=False, sort=False).sum()
	combined_total = df.groupby(["report_reference","reference_number","report_year"],as_index=False, sort=False).sum()

	combined_total["country"] = " Total"

	report = combined_total.append(combined_countries,True)

	report["avg_pax/trip"] = report["pax_transported"]/report["trip_realized"]
	report["Realize_Trip_Yield"] = 100*(report["trip_realized"]/report["trip_offered"])
	report["Avg_Realized_Km"] = report["seats_km"]/report["pax_transported"]
	report["Driver_Cancel_Rate"] = 100*report["driver_cancelation"]/(report["pax_transported"]+report["pax_cancelation"]+report["driver_rejection"]+ report["driver_cancelation"])
	report["Pax_Cancel_Rate"] = 100*report["pax_cancelation"]/(report["pax_transported"]+report["pax_cancelation"]+report["driver_rejection"]+ report["driver_cancelation"])

	pd.set_option('precision', 2)
	report = report.fillna(0)

	report.sort(['country','report_year','reference_number'], ascending=[1,0,0], inplace=True)

	columns = ['country','report_year','report_reference','new_users','new_drivers','new_passangers','new_trip_offered','new_bookings','ask',\
				'trip_realized', 'unique_trip_realized','pax_transported','unique_pax_driver','avg_pax/trip','Realize_Trip_Yield','Avg_Realized_Km','Driver_Cancel_Rate','Pax_Cancel_Rate']
	report = report[columns]
	
	return report
Example 24
def concprinter(df, kind = 'string', n = 100, window = 60, columns = 'all', **kwargs):
    """
    Print conc lines nicely, to string, latex or csv

    :param df: concordance lines from :class:``corpkit.corpus.Concordance``
    :type df: pd.DataFrame
    :param kind: output format
    :type kind: str ('string'/'latex'/'csv')
    :param n: Print first n lines only
    :type n: int/'all'
    :returns: None
    """
    import corpkit
    if isinstance(n, int) and n > len(df):
        n = len(df)
    if not (kind.startswith('l') or kind.startswith('c') or kind.startswith('s')):
        raise ValueError('kind argument must start with "l" (latex), "c" (csv) or "s" (string).')
    import pandas as pd
    
    # shitty thing to hardcode
    pd.set_option('display.max_colwidth', 100)

    if type(n) == int:
        to_show = df.head(n)
    elif n is False:
        to_show = df
    elif n == 'all':
        to_show = df
    else:
        raise ValueError('n argument "%s" not recognised.' % str(n))

    def resize_by_window_size(df, window):
        df['l'] = df['l'].str.slice(start=-window, stop=None)
        df['l'] = df['l'].str.rjust(window)
        df['r'] = df['r'].str.slice(start = 0, stop = window)
        df['r'] = df['r'].str.ljust(window)
        df['m'] = df['m'].str.ljust(df['m'].str.len().max())
        return df

    if window:
        to_show = resize_by_window_size(to_show, window)

    if columns != 'all':
        to_show = to_show[columns]

    if kind.startswith('s'):
        functi = pd.DataFrame.to_string
    if kind.startswith('l'):
        functi = pd.DataFrame.to_latex
    if kind.startswith('c'):
        functi = pd.DataFrame.to_csv

    return_it = kwargs.pop('return_it', False)

    if return_it:
        return functi(to_show, header = False, **kwargs)
    else:
        print('\n')
        print(functi(to_show, header = False, **kwargs))
        print('\n')
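A minimal sketch of how concprinter might be called, assuming corpkit is installed (the function imports it) and a concordance-style DataFrame with 'l', 'm' and 'r' columns; the lines below are invented:

# Sketch with invented concordance lines; requires corpkit to be importable.
import pandas as pd

conc = pd.DataFrame({
    'l': ['the quick brown', 'a very lazy'],
    'm': ['fox', 'dog'],
    'r': ['jumps over the fence', 'sleeps all day'],
})
concprinter(conc, kind='string', n=2, window=15)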
def fun1():

    import read_new_nepr_file
    import os
    import pandas
    pandas.set_option('display.width', 200)

    def get_files_in_folder(path):
        listing = os.listdir(path)
        return [os.path.join(path, l) for l in listing if os.path.isfile(os.path.join(path, l))]

    def get_folders_in_folder(path):
        listing = os.listdir(path)
        return [os.path.join(path, l) for l in listing if not os.path.isfile(os.path.join(path, l))]

    path = 'C:\\Users\\tech5\\Google Drive\\NEPR Actual'

    file_list = []
    for folder in get_folders_in_folder(path):
        for file in get_files_in_folder(folder):
            if '(' in file and '.xlsx' in file:
                file_list.append(file)


    df = read_new_nepr_file.read_nepr_file(file_list[0])
    for f in file_list[1:]:
        print(f)
        df = df.append(read_new_nepr_file.read_nepr_file(f), ignore_index=True)

    return df
Example 26
def dataset_bucket_analysis_by_field(field):
    # Set the dataset
    hotel_dataset_properties = {Constants.BUSINESS_TYPE_FIELD: 'fourcity_hotel'}
    Constants.update_properties(hotel_dataset_properties)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    print('Loaded %d records' % len(records))

    user_frequency_map = {}

    for record in records:

        user_id = record[field]
        if user_id not in user_frequency_map:
            user_frequency_map[user_id] = 0
        user_frequency_map[user_id] += 1

    print('There is a total of %d %ss' % (len(user_frequency_map), field))
    sorted_x = sorted(user_frequency_map.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_x[0])
    print(sorted_x[1])
    print(sorted_x[2])
    # print(user_frequency_map)

    # Number of reviews per user
    rda = ReviewsDatasetAnalyzer(records)
    users_summary = rda.summarize_reviews_by_field(field)
    print('Average number of reviews per %s: %f' % (field,
          float(rda.num_reviews) / rda.num_users))
    users_summary.plot(kind='line', rot=0)

    pandas.set_option('display.max_rows', len(users_summary))
    print(users_summary)
    pandas.reset_option('display.max_rows')
def process_uploaded_table(table_file, table_name, table_title, table_legend, associated_text):
    """takes an uploaded data table and associated metadata like legend and title and creates an html
        data table"""
    # convert file to pandas data frame
    pd.set_option('display.max_colwidth', 100)
    df = pd.read_csv(table_file, prefix = '', encoding = 'utf-8', index_col=False)


    num_cols = len(df.columns)
    table_html = df.to_html(index = False,na_rep = '', sparsify = False)

    # now use beautiful soup to append the table metadata

    table_soup = BeautifulSoup(table_html)
    table_tag = table_soup.table

    title_tag = table_soup.new_tag("caption")
    title_tag.string = "%s: %s" % (table_name, table_title)
    table_tag.insert(0, title_tag)

    table_body_tag = table_soup.find("tbody")

    legend_str = "%s: %s" % (table_legend, associated_text)
    footer_tag = BeautifulSoup('<tfoot><tr><td colspan="%d">%s</td></tr></tfoot>' % (num_cols, legend_str))
    table_body_tag.insert_after(footer_tag)

    # iterate through all th tags and check if they contain a string like "Unnamed: 0" and remove
    thTags = table_soup.findAll('th')
    for tag in thTags:
        if 'Unnamed' in tag.string:
            tag.string = ''
    return str(table_soup)
Example 28
    def main_report(self, day):

        '''
        dateline=%s" % day
        '''
        pandas.set_option('display.width', 200)
        d2 = self.mysql.getRecord("select s_code from s_stock_list where dateline=%s" % day)
        #print d2
        #sys.exit()
        for i in range(0, len(d2)):
            s_code = d2[i][0]
            #if s_code != 'sh600000':
            #    continue
            self._chQ = self.getChuQuan(s_code)
            #print self._chQ
            sql_data = "select s_code,code,dateline,chg_m,chg,open,close,high,low,last_close,name FROM s_stock_trade WHERE s_code ='%s' and dateline >20140101 " % s_code
            print sql_data
            tmpdf2 = pandas.read_sql(sql_data, self.mysql.db)
            tmpdf = tmpdf2.apply(self.format_chuquan_hanlder, axis=1)
            tmpdf.sort_values(by=('dateline'), ascending=False)

            ma_list = [5, 10, 20, 30, 60]
            for ma in ma_list:
                tmpdf['MA_' + str(ma)] = pandas.rolling_mean(tmpdf['close'], ma)

            last5 = tmpdf.tail(60)
            #print last5
            #sys.exit()
            for i5 in range(0, len(last5)):
                if str(last5.iloc[i5].dateline) != day:
                    continue

                word = s_code[2:] + str(last5.iloc[i5].dateline)
                if math.isnan(last5.iloc[i5].MA_5):
                    break
                if math.isnan(last5.iloc[i5].MA_10):
                    break

                _m60 = last5.iloc[i5].MA_60
                if math.isnan(last5.iloc[i5].MA_60):
                    _m60 = 0
                else:
                    _m60 = round(_m60, 2)
                _m30 = last5.iloc[i5].MA_30
                if math.isnan(last5.iloc[i5].MA_30):
                    _m30 = 0
                else:
                    _m30 = round(_m30, 2)

                item = {}
                item['s_code'] = s_code
                item['dateline'] = last5.iloc[i5].dateline
                item['hash'] = hashlib.md5(word).hexdigest()
                item['ma5'] = round(last5.iloc[i5].MA_5, 2)
                item['ma10'] = round(last5.iloc[i5].MA_10, 2)
                item['ma20'] = round(last5.iloc[i5].MA_20, 2)
                item['ma30'] = _m30
                item['ma60'] = _m60

                self.mysql.dbInsert('s_stock_average', item)
Example 29
def wrapper(args):

    try:
        npar = args.noiseparam.strip("[").strip("]").split(",")
    except:
        npar = []
    nbins = args.nbins
    # Run function
    if args.i:
        df = pd.io.parsers.read_csv(args.i, delim_whitespace=True, dtype={"seqs": str, "batch": int})
    else:
        df = pd.io.parsers.read_csv(sys.stdin, delim_whitespace=True, dtype={"seqs": str, "batch": int})
    if len(utils.get_column_headers(df)) > 0:
        raise SortSeqError("Library already sorted!")
    model_df = io.load_model(args.model)
    output_df = main(df, model_df, args.noisemodel, npar, nbins, start=args.start, end=args.end)

    if args.out:
        outloc = open(args.out, "w")
    else:
        outloc = sys.stdout
    pd.set_option("max_colwidth", int(1e8))

    # Validate dataframe for writing
    output_df = qc.validate_dataset(output_df, fix=True)
    io.write(output_df, outloc)
Example 30
def main(argv):

    pd.set_option('display.width', 200)
    pd.set_option('display.height', 500)
    pd.options.mode.chained_assignment = None  # default='warn'

########################################################################################################################
#Read the input file , munging and splitting the data to train and test
########################################################################################################################
    train =  pd.read_csv('C:/Python/Others/data/Kaggle/Restaurant_Revenue_Prediction/train.csv',sep=',')
    test =  pd.read_csv('C:/Python/Others/data/Kaggle/Restaurant_Revenue_Prediction/test.csv',sep=',')
    Sample_DS = pd.read_csv('C:/Python/Others/data/Kaggle/Restaurant_Revenue_Prediction/sampleSubmission.csv',sep=',')

    X,Xt,y = Data_Munging(train,test,Sample_DS)

    X,Xt,y = Feature_Selection(X,Xt,y)

    #scores = Kfold_Cross_Valid(X,Xt,y)

    clf = GridSrch_Modelfit(X,Xt,y,grid=False)

    #Predict test.csv & reverse the log transform
    yp=np.exp(clf.predict(Xt))

########################################################################################################################
#Get the predictions for actual data set
########################################################################################################################
    #Get the predictions for actual data set
    preds = pd.DataFrame(yp, index=Sample_DS.Id.values, columns=Sample_DS.columns[1:])
    preds.to_csv('C:/Python/Others/data/Kaggle/Restaurant_Revenue_Prediction/Submission_Roshan.csv', index_label='Id')
import os
import glob
import json

import click
import numpy as np
import pandas as pd
from loguru import logger

import mwrvr.constants
import mwrvr.misc
import mwrvr.textract


desired_width = 320
pd.set_option('display.width', desired_width)
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 100)

INT_COLUMNS = ["score", "kills", "deaths", "plants", "defuses", "top_fragger"]

COLUMNS = ["name", "map", "score", "kills", "deaths", "plants", "defuses",
                  "number_of_maps", "top_fragger", "zero_bomb"]

PER_MAP_COLUMNS = ["kills_per_map", "deaths_per_map", "plants_per_map",
                   "defuses_per_map", "top_fragger_per_map", "zero_bomb_per_map", "score_per_map"]


def find_misspellings(s: str, include_bradlx888_as_ntsfbrad=True):
    s = mwrvr.misc.find_jaantr(s)
    s = mwrvr.misc.find_ntsfbrad(s, include_bradlx888=include_bradlx888_as_ntsfbrad)
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB


data = pd.read_csv("nursery.csv", sep=';')
pd.set_option('display.expand_frame_repr', False)

print ("\n **********\n Stampa delle prime 5 righe del dataset: \n *********\n")
display(data.head()) 
print ("\n **********\n Stampa delle statistiche descrittive: \n *********\n")
display(data.describe())
print ("\n **********\n Numero totale di dati e di attributi: \n *********\n")
print(data.shape)


#PREPROCESSING
#data= data.convert_objects(convert_numeric=True)

# there are no missing values. However, there are many categorical
# features that must be converted to numeric via discretization.
def label_encode(df, columns):
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
print(cross_val_score(rf_classifier, features, label, cv=20, scoring ='accuracy').mean())

"""# Observation 2:

Accuracy acquired : 84.2 % for n_splits=20 \\

## Important Feature Calculation
"""

rf_classifier.fit(X_train,y_train)

df.head(3)

pd.set_option("display.max.rows", None)

# Important Features
feature_importance = pd.DataFrame(rf_classifier.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance',ascending=False)
feature_importance
# summarize feature importance
#for i,v in enumerate(importance):
 #   print('Feature: %0d, Score: %.5f' % (i,v))

"""Exporting the results into an excel file for better view"""

excel_data = feature_importance.copy()

excel_data.to_csv('important_features.csv', index=True)
Example 34
        for c in self.categoric_columns:
            if c != 'Name' and c != 'RescuerID' and c != 'Description' and c != 'PetID':
                self.boxplot_for_category_column(self.df, c)

    def pairplot_for_numeric(self):
        for c in self.numeric_columns:
            if c != COL_Y:
                self.pairplot_for_numeric_column(self.df, c)

    def boxplot_for_numeric(self):
        for c in self.numeric_columns:
            if c != COL_Y:
                self.boxplot_for_numeric_column(self.df, c)


pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 5000)
pd.set_option('display.width', 100000)

train = Pets(TRAIN_DATA, True, calculation_limit_rows=None)
# train.bag_of_words_clean()
# train.bag_of_words_prepare()
# train.catplot_for_categories()
# train.pairplot_for_numeric()
# train.boxplot_for_numeric()
# train.boxplot_for_categories()
# train.train_model()
train.roc_curve()

# predict = Pets(TEST_DATA, False, calculation_limit_rows=None)
# predict.bag_of_words_clean()
Example 35
def main(argv):
    args = parse_args(argv)

    # set up display and plotting options
    pd.set_option('display.max_columns', 100)
    pd.set_option('display.max_colwidth', 100)
    pd.set_option('display.width', get_terminal_size()[1])
    sns.set_style('whitegrid')
    sns.set_context('poster')
    #sns.set_palette('Set1')

    if args.out_prefix is None:
        args.out_prefix = os.path.splitext(args.expt_file)[0]

    seeds = args.seeds.split(',')
    folds = args.folds.split(',')

    # get all training output data from experiment
    job_files = args.job_script
    df = read_training_output_files(job_files, args.data_name, seeds, folds,
                                    args.iteration, True, args.gen_metrics)

    if args.test_data is not None:
        df = df[df['test_data'] == args.test_data]

    group_cols = ['job_file', 'model_name']

    if not args.avg_seeds:
        group_cols.append('seed')

    if not args.avg_folds:
        group_cols.append('fold')

    if args.avg_iters > 1:
        df['iteration'] = args.avg_iters * (df['iteration'] // args.avg_iters)
    group_cols.append('iteration')

    exclude_cols = [
        'job_file', 'model_name', 'gen_model_name', 'disc_model_name',
        'iteration', 'seed', 'fold', 'test_data'
    ]

    agg_df = aggregate_data(df, group_cols)
    #assert all(agg_df['seed'] == set(seeds))
    #assert all(agg_df['fold'] == set(folds))

    if not args.y:  # use all training output metrics
        args.y = [m for m in agg_df if m not in exclude_cols]
        if args.scaffold:
            args.y += [
                p + x for p in ['gen_', 'disc_']
                for x in ['n_params', 'n_activs', 'size', 'min_width']
            ]
        args.y = sorted(args.y, key=get_y_key, reverse=True)

    # parse model name to get model params and add columns
    job_params = add_param_columns(agg_df, scaffold=args.scaffold)

    print('\nAGGREGATED DATA')
    print(agg_df)

    # rename columns if necessary
    agg_df.reset_index(inplace=True)
    col_name_map = {col: col for col in agg_df}
    col_name_map.update(dict(r.split(':') for r in args.rename_col))
    agg_df.rename(columns=col_name_map, inplace=True)
    job_params = {col_name_map[c]: v for c, v in job_params.items()}

    for y in args.log_y:  # add log y columns
        log_y = 'log({})'.format(y)
        agg_df[log_y] = agg_df[y].apply(np.log)
        args.y.append(log_y)

    if len(args.hue) > 1:  # add column for hue tuple
        hue = add_group_column(agg_df, args.hue)
    elif len(args.hue) == 1:
        hue = args.hue[0]
    else:
        hue = None

    # by default, don't make plots for the hue variable or variables with 1 unique value
    if not args.x:
        args.x = [
            c for c in job_params
            if c not in exclude_cols and agg_df[c].nunique() > 1
        ]
        args.x = sorted(args.x, key=get_x_key, reverse=True)

    if args.grouped:  # add "all but one" group columns
        for col in args.x:
            all_but_col = [c for c in args.x if c not in {col, 'memory'}]
            add_group_column(agg_df, all_but_col)

    agg_df.to_csv('{}_agg_data.csv'.format(args.out_prefix))

    for y in args.y:
        z_bounds = get_z_bounds(agg_df[y], args.outlier_z)
        iqr_bounds = get_iqr_bounds(agg_df[y], args.outlier_iqr)
        print(y, z_bounds, iqr_bounds)
        agg_df[y] = remove_outliers(agg_df[y], z_bounds)
        agg_df[y] = remove_outliers(agg_df[y], iqr_bounds)

    if args.plot_lines:  # plot training progress

        line_plot_file = '{}_lines.{}'.format(args.out_prefix, args.plot_ext)
        plot_lines(line_plot_file,
                   agg_df,
                   x=col_name_map['iteration'],
                   y=args.y,
                   hue=None,
                   n_cols=args.n_cols,
                   outlier_z=args.outlier_z,
                   ylim=args.ylim)

        for hue in args.x + ['model_name']:
            line_plot_file = '{}_lines_{}.{}'.format(args.out_prefix, hue,
                                                     args.plot_ext)
            plot_lines(line_plot_file,
                       agg_df,
                       x=col_name_map['iteration'],
                       y=args.y,
                       hue=hue,
                       n_cols=args.n_cols,
                       outlier_z=args.outlier_z,
                       ylim=args.ylim)

    if args.iteration:
        final_df = agg_df.set_index(
            col_name_map['iteration']).loc[args.iteration]

        print('\nFINAL DATA')
        print(final_df)

        # display names of best models
        print('\nBEST MODELS')
        for y in args.y:
            print(
                final_df.sort_values(y).loc[:, (col_name_map['model_name'],
                                                y)])  #.head(5))

        if args.plot_strips:  # plot final loss distributions

            strip_plot_file = '{}_strips.{}'.format(args.out_prefix,
                                                    args.plot_ext)
            plot_strips(strip_plot_file,
                        final_df,
                        x=args.x,
                        y=args.y,
                        hue=None,
                        n_cols=args.n_cols,
                        outlier_z=args.outlier_z,
                        ylim=args.ylim)

            if args.grouped:
                strip_plot_file = '{}_grouped_strips.{}'.format(
                    args.out_prefix, args.plot_ext)
                plot_strips(strip_plot_file,
                            final_df,
                            x=args.x,
                            y=args.y,
                            hue=None,
                            grouped=True,
                            n_cols=args.n_cols,
                            outlier_z=args.outlier_z,
                            ylim=args.ylim)

            for hue in args.x + ['model_name']:
                strip_plot_file = '{}_strips_{}.{}'.format(
                    args.out_prefix, hue, args.plot_ext)
                plot_strips(strip_plot_file,
                            final_df,
                            x=args.x,
                            y=args.y,
                            hue=hue,
                            n_cols=args.n_cols,
                            outlier_z=args.outlier_z,
                            ylim=args.ylim)

        if args.plot_corr:

            corr_y = [y for y in args.y if final_df[y].nunique() > 1]

            corr_plot_file = '{}_corr.{}'.format(args.out_prefix,
                                                 args.plot_ext)
            plot_corr(corr_plot_file, final_df, x=corr_y, y=corr_y)

            for hue in args.x + ['model_name']:
                corr_plot_file = '{}_corr_{}.{}'.format(
                    args.out_prefix, hue, args.plot_ext)
                plot_corr(corr_plot_file,
                          final_df,
                          x=corr_y,
                          y=corr_y,
                          hue=hue)
# ## 1 - [Data cleaning](#ch1)

# <a id="ch0"></a>
# ## 0 - Import libraries and files

# In[1]:

import pandas as pd
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt

# In[2]:

data = pd.read_csv("energie_mensuel.csv", sep=';')
pd.set_option('display.max_columns', None)
data = data.replace('È', 'é',
                    regex=True)  # the "é" characters are written as "È" in the
# original file
data.head(3)

# <a id="ch1"></a>
# ## 1 - Data cleaning

# In[3]:

# Keep "France" rows
data = data.loc[data.Territoire == "France"]
data = data[["Mois", "Consommation totale"]]
data.columns = ["date", "consumption"]
Example 37
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Date    : 2018-10-13 14:33:49
# @Author  : Michael ([email protected])

import pandas as pd
pd.set_option('expand_frame_repr', False)
# pd.set_option('display.max_rows', 1000)


def transfer_to_period_data(df, rule_type='15T'):
    """
	将数据转换为其它周期的数据
	:param df:
	:param rule_type:
	:return:
	"""

    # === convert to a different minute-level period
    period_df = df.resample(rule=rule_type,
                            on='candle_begin_time',
                            label='left',
                            closed='left').agg({
                                'open': 'first',
                                'high': 'max',
                                'low': 'min',
                                'close': 'last',
                                'volume': 'sum'
                            })

    period_df.dropna(subset=['open'], inplace=True)
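A self-contained sketch of the same resample/agg pattern on made-up 1-minute candles (the column names follow the function above):

# Self-contained sketch of the resample/agg pattern with made-up data.
import numpy as np
import pandas as pd

minutes = pd.date_range('2021-01-01', periods=60, freq='1T')
candles = pd.DataFrame({
    'candle_begin_time': minutes,
    'open': np.random.rand(60),
    'high': np.random.rand(60) + 1.0,
    'low': np.random.rand(60) - 1.0,
    'close': np.random.rand(60),
    'volume': np.random.randint(1, 100, size=60),
})
period_df = candles.resample(rule='15T', on='candle_begin_time',
                             label='left', closed='left').agg({
    'open': 'first',
    'high': 'max',
    'low': 'min',
    'close': 'last',
    'volume': 'sum',
})
print(period_df)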
Example 38
#Convert the format so that they can be pushed to the SQL
people_tuple = [tuple(i) for i in people.to_numpy()]
friend_tuple = [
    tuple(map(int, i[0].split(" "))) for i in friend.values.tolist()
]

# Connecting to the database file
conn = sqlite3.connect('p2.db')
c = conn.cursor()

#Creating table for people/pushing the data into the table
c.execute("DROP TABLE IF EXISTS people")
c.execute('''CREATE TABLE people (personId INTEGER,name text)''')
c.executemany('INSERT INTO people VALUES (?,?)', people_tuple)

#Creating table for friends/pushing the data into the table
c.execute("DROP TABLE IF EXISTS friends")
c.execute('''CREATE TABLE friends (personId1 INTEGER,personId2 INTEGER)''')
c.executemany('INSERT INTO friends VALUES (?,?)', friend_tuple)

#SQL Query that querys NumOfFriends
pd.set_option('display.max_rows', None)
print(
    pd.read_sql_query(
        "Select Name as Name, Count(PersonId1) as NumOfFriends From (Select personId1 From friends UNION ALL Select PersonId2 From friends) As P JOIN people on P.personId1=people.PersonId Group by Name Order by NumOfFriends DESC",
        conn))
#Committing changes and closing the connection to the database file

conn.commit()
conn.close()
Example 39
# -*- coding: utf-8 -*-
"""
@Time : 2017/6/3 - 15:02
@Author : Hao Chen
"""
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


if __name__ == "__main__":
    pd.set_option('display.width', 300)

    data = pd.read_csv('../dataset/tel.csv', skipinitialspace=True, thousands=',')    # thousands: str, default None; the thousands separator, e.g. "," or "."
    print u'Raw data:\n', data.head(10)

    # print 'data.columns() = \n', data.columns

    # Label-encode each column by category, e.g. the two values Married and Unmarried are replaced with 0 and 1
    le = LabelEncoder()     # encodes label values between 0 and n, e.g. with 5 classes the labels are 0/1/2/3/4
    for col in data.columns:
        data[col] = le.fit_transform(data[col])    # fit the label encoder and return the encoded labels

    print u'Processed data 1:\n', data.head(10)

    # Age binning
    # Label the age column using the given half-open bins, e.g. ages in [-1,6) are labelled 0, [6,12) labelled 1 and [12,18) labelled 2; the labels can be chosen freely but must match the number of bins
Example 40
# -*- coding:utf-8 -*-
# @FileName  :floyd.py
# @Time      :2021/6/2 22:24
# @Author    :zyt

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)  # show all columns
pd.set_option('display.max_rows', None)  # show all rows


def read_data(filepath: str):
    """读取数据"""
    station_df = pd.read_excel(filepath, sheet_name='station', usecols='A: C', index_col='code')
    arcs_df = pd.read_excel(filepath, sheet_name='arcs', usecols='A: J')
    arcs_df = arcs_df[arcs_df['type'] == 'real'].set_index(['point_up_code', 'point_down_code'], drop=False)
    arcs_df['price/arc'] = arcs_df['mileage'] * arcs_df['price']

    return station_df, arcs_df


def build_matrix(station_df, arcs_df):
    """构建路径长度矩阵"""
    # 初始化里程矩阵mileage_matrix和路径矩阵path_matrix
    node_num = len(station_df)
    matrix_index = list(station_df.index)
    mileage_matrix = pd.DataFrame(np.full((node_num, node_num), np.inf), index=matrix_index, columns=matrix_index)
    path_matrix = pd.DataFrame(np.full((node_num, node_num), '-'), index=matrix_index, columns=matrix_index)

    # update the mileage matrix (mileage_matrix)
    for index, arc in arcs_df.iterrows():  # the index is the arc's (origin, destination) pair
Example 41
#!/usr/bin/env python3

#################################################################################################################
## This script creates HDF5 files from corresponding SQLite db files in one-to-one fashion. So, for each db file 
## present in the input folder, equivalent HDF5 file is created in the output folder. To combine db files into
## a single HDF5 file , use version 1 of script named db_hdf5_v1.py instead
#################################################################################################################

import sqlite3, sys, glob, os, argparse, errno
import pandas as pd
from glob import glob as g
pd.set_option('io.hdf.default_format','table')  # Commenting this line out will write HDF5 as a fixed format, and not as a table format
                                                # Writing as a fixed format is faster than writing as a table, but the file cannot be 'modified/appended to' later on 
DB_SUFFIX = '.db'
 
# Function to check for existing directories, and create a new one if not present 
def dir_check(d):
    if os.path.exists(d):
        reply = input("Specified output directory already exists!! Delete existing directory named <<"+os.path.basename(d)+">> and all its contents? [y/n] ")
        if reply in ['y', 'Y', 'yes']:
            try:
                os.system('rm -r '+ d)
                print("Directory named <<"+os.path.basename(d)+ ">> and all its contents deleted!!")
                # Make new output folder
                try:
                    os.makedirs(d)
                except OSError as exception:
                    if exception.errno != errno.EEXIST:
                        raise                
            except:
                error("- Could not delete directory <<" +os.path.basename(d)+">>. Directory may contain additional files, remove files manually and try again!")
Example 42
    def populate_indicators(self, dataframe: DataFrame,
                            metadata: dict) -> DataFrame:
        """
        Adds several different TA indicators to the given DataFrame

        Performance Note: For the best performance be frugal on the number of indicators
        you are using. Let uncomment only the indicator you are using in your strategies
        or your hyperopt configuration, otherwise you will waste your memory and CPU usage.
        :param dataframe: Dataframe with data from the exchange
        :param metadata: Additional information, like the currently traded pair
        :return: a Dataframe with all mandatory indicators for the strategies
        """
        informative_time_frame = '1d'
        informative = None

        if not self.dp:
            # Don't do anything if DataProvider is not available.
            return dataframe

        if self.dp:
            if self.dp.runmode.value in ('live', 'dry_run'):
                now = datetime.utcnow()
                time = pd.Timestamp(year=now.year,
                                    month=now.month,
                                    day=now.day,
                                    tz="GMT+0")

                ticker = self.dp.ticker(metadata['pair'])
                new_row = {
                    'date': time,
                    'open': 1,
                    'high': 1,
                    'low': 1,
                    'close': ticker['last'],
                    'volume': 1
                }

                # Get the informative pair
                informative = self.dp.get_pair_dataframe(
                    pair=metadata['pair'], timeframe=informative_time_frame)
                informative = informative.append(new_row, ignore_index=True)

        # if not informative:
        #     return dataframe

        # calculate the bollinger bands with 1d candles
        bollinger = qtpylib.bollinger_bands(informative['close'],
                                            window=3,
                                            stds=1)
        informative['bb_lowerband1'] = bollinger['lower']
        informative['bb_middleband1'] = bollinger['mid']
        informative['bb_upperband1'] = bollinger['upper']

        # Rename columns to be unique
        # Assuming inf_tf = '1d' - then the columns will now be:
        # date_1d, open_1d, high_1d, low_1d, close_1d
        informative.columns = [
            f"{col}_{informative_time_frame}" for col in informative.columns
        ]

        # sync up dates
        # informative[f'date_{informative_time_frame}'] = pd.to_datetime(informative[f'date_{informative_time_frame}'], utc=True)
        # dataframe['date'] = pd.to_datetime(dataframe['date'], utc=True)

        pd.set_option('display.max_columns', None)
        pd.set_option('display.width', 300)
        logger.info(
            f'---------Informative Pair: {metadata["pair"]}-------------------'
        )
        path = os.path.normpath(
            os.path.abspath(
                os.path.join(
                    os.getcwd(), 'user_data',
                    f"dataframe_{metadata['pair'].replace('/', '')}.csv")))
        logger.info(path)
        with open(path, "w") as file:
            file.write(dataframe.to_csv())

        logger.info(f'\n\n{informative.to_markdown()}')

        # Combine the 2 dataframes
        # all indicators on the informative sample MUST be calculated before this point
        dataframe = dataframe.merge(informative,
                                    left_on='date',
                                    right_on=f'date_{informative_time_frame}',
                                    how='left')

        # FFill to have the 1d value available in every row throughout the day.
        # Without this, comparisons would only work once per day.
        # dataframe = dataframe.ffill()

        pd.set_option('display.max_columns', None)
        pd.set_option('display.width', 300)
        logger.info(
            f'---------Dataframe Pair: {metadata["pair"]}-------------------')
        logger.info(f'\n\n{dataframe.to_markdown()}')

        return dataframe
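
# A minimal, self-contained sketch of the rename / merge / ffill pattern used above,
# with toy data in place of exchange data (all names and values here are illustrative only).
import pandas as pd

base = pd.DataFrame({
    'date': pd.date_range('2021-01-01', periods=6, freq='12H'),
    'close': [1, 2, 3, 4, 5, 6],
})
daily = pd.DataFrame({
    'date': pd.date_range('2021-01-01', periods=3, freq='1D'),
    'close': [10, 30, 50],
})

inf_tf = '1d'
daily.columns = [f"{col}_{inf_tf}" for col in daily.columns]        # date_1d, close_1d

merged = base.merge(daily, left_on='date', right_on=f'date_{inf_tf}', how='left')
merged = merged.ffill()   # propagate the daily value to every row of that day
print(merged)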
Esempio n. 43
0
    # mpl.rcParams['font.family'] = "serif"
    # mpl.rcParams['font.serif'] = "cm"
    mpl.rcParams[
        "text.latex.preamble"] = r"\usepackage{subdepth}, \usepackage{type1cm}"

# PANDAS
# ======
try:
    import pandas as pd

    _pandas_present = True
except ImportError:
    _pandas_present = False

if _pandas_present and ipython:
    pd.set_option("display.latex.repr", False)
    pd.set_option("display.latex.longtable", False)
    pd.set_option("display.latex.escape", False)

# SYMPY
# =====
try:
    import sympy as sym

    _sympy_present = True
except ImportError:
    _sympy_present = False
if _sympy_present:
    sym.init_printing(use_latex=True)

# IMAGE ARRANGEMENT with PIL
Esempio n. 44
0
"""

import json
from collections import OrderedDict, defaultdict

from django.http import JsonResponse
from django.shortcuts import get_object_or_404

from otree.views.admin import SessionData, SessionDataAjax
from otree import export
from otree.common import get_models_module
from otree.db.models import Model
from otree.models.participant import Participant
from otree.models.session import Session
import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 180)

#%% helper functions


def _rows_per_key_from_queryset(qs, key):
    """Make a dict with `row[key] -> [rows with same key]` mapping (rows is a list)."""
    res = defaultdict(list)

    for row in qs.values():
        res[row[key]].append(row)

    return res
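
# A quick illustration of the mapping this helper produces. A Django queryset's
# .values() yields dicts, so a tiny stand-in with the same interface is enough here
# (the field names below are purely illustrative).
class _FakeQuerySet:
    def __init__(self, rows):
        self._rows = rows

    def values(self):
        return self._rows

_rows = [
    {'session_id': 1, 'code': 'a'},
    {'session_id': 1, 'code': 'b'},
    {'session_id': 2, 'code': 'c'},
]
# -> {1: [{'session_id': 1, 'code': 'a'}, {'session_id': 1, 'code': 'b'}],
#     2: [{'session_id': 2, 'code': 'c'}]}
print(dict(_rows_per_key_from_queryset(_FakeQuerySet(_rows), 'session_id')))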

import altair as alt
import pandas as pd

from infra.constants import MAX_DATE, MIN_DATE
import infra.parsers
import infra.dask
import infra.pd


# Module specific format options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)
pd.set_option('display.max_rows', None)


def compute_user_currency_histories():
    """Compute the normalized ledger with running user currency balance.
    """
    # Extract data from the transactions file into a resolved pandas frame
    # Importantly, use the timezone adjusted log but NOT the trimmed log to
    # avoid clipping state from early users.
    transactions = infra.dask.read_parquet(
        "data/internal/transactions_TZ"
    ).compute()

    # Split transfers into positive components for the dest and negative for
    # the source
    transfers = transactions.loc[
        (transactions["kind"] == "user_transfer") |
        (transactions["kind"] == "admin_transfer")
# -*- coding: utf-8 -*-
# @Author  : AlwaysDazz
# @Time    : 2021/5/23 14:13
# @IDE:    : PyCharm
# @Project : pythonProject
# @Comment : Random forest vs. decision tree; Zhihu write-up: https://zhuanlan.zhihu.com/p/58945933

from sklearn.ensemble import RandomForestClassifier as rdclf  # sklearn's ensemble algorithms live in the ensemble module; import the random forest
from sklearn.tree import DecisionTreeClassifier as clf  # decision tree
from sklearn.datasets import load_wine  # load the wine dataset
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt  # plotting tools
from sklearn.model_selection import cross_val_score as cvs
import pandas as pd

pd.set_option('display.max_columns', 1000)  # maximum number of columns shown in output
pd.set_option('display.width', 1000)  # maximum display width
pd.set_option('display.max_colwidth', 1000)  # maximum column width shown
winedata = load_wine().data  # wine feature data
winetarget = load_wine().target  # wine class labels

# Xtrain,Xtest,Ytrain,Ytest=train_test_split(winedata,winetarget,test_size=0.3)  # 70/30 train/test split
# clf=clf(random_state=0)  # instantiate the decision tree and the random forest
# rdclf=rdclf(random_state=0)
# clf=clf.fit(Xtrain,Ytrain)  # train both models
# rdclf=rdclf.fit(Xtrain,Ytrain)
# clf_t=clf.score(Xtest,Ytest)  # score the trained models on the test set
# rdclf_t=rdclf.score(Xtest,Ytest)
# print('Single Tree:{}'.format(clf_t),'Random Forest Tree:{}'.format(rdclf_t))  # sample output: Single Tree:0.9259259259259259 Random Forest Tree:0.9814814814814815

# Use cross-validation instead: instantiate the decision tree and the random forest and let
# the train/test split be handled automatically, as sketched below.
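# A sketch of that cross-validated comparison, reusing the aliases imported above
# (clf, rdclf, cvs); the choice of 10 folds is an assumption.
clf_cv = clf(random_state=0)
rdclf_cv = rdclf(random_state=0)
clf_score = cvs(clf_cv, winedata, winetarget, cv=10).mean()
rdclf_score = cvs(rdclf_cv, winedata, winetarget, cv=10).mean()
print('Single Tree:{}'.format(clf_score), 'Random Forest Tree:{}'.format(rdclf_score))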
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import set_option
import os

set_option('display.width', 2000)
pd.set_option("display.max_rows", 500, "display.max_columns", 2000)
set_option('precision', 3)
#set_option("display.max_rows", 10)
pd.options.mode.chained_assignment = None

input_file = './raw data_edit/data_merge.csv'    # Path to the merged input CSV file
data_input_ori = pd.read_csv(input_file)
j = 0
# Recompute sumbox as the sum of its six component columns and fix any row
# where the stored value disagrees; j counts the corrected rows.
for i in range(data_input_ori.shape[0]):
    sumbox_clc = data_input_ori.commun[i] + data_input_ori.homehobb[i] + data_input_ori.judgment[i] + \
        data_input_ori.memory[i] + data_input_ori.orient[i] + data_input_ori.perscare[i]
    if sumbox_clc != data_input_ori.sumbox[i]:
        #print(sumbox_clc, data_input_ori.sumbox[i])
        data_input_ori.loc[i, 'sumbox'] = sumbox_clc
        j = j + 1
print(j)
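
# An equivalent vectorized version of the check above, shown for reference
# (column names are taken from the loop; the behaviour is the same).
component_cols = ['commun', 'homehobb', 'judgment', 'memory', 'orient', 'perscare']
sumbox_recalc = data_input_ori[component_cols].sum(axis=1)
mismatch = sumbox_recalc != data_input_ori['sumbox']
data_input_ori.loc[mismatch, 'sumbox'] = sumbox_recalc[mismatch]
print(mismatch.sum())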

#data_input = data_input_ori[(data_input_ori.cdr < 1.0) | (data_input_ori.sumbox > 2.0)]
data_input = data_input_ori

M, N = data_input.shape
print(M, N)
keys = data_input.keys()
print(keys)
Esempio n. 48
0
def HorseForm(SSOID, BestOrWorst, placeBets, SelIndex):
    Rating = float(0)
    Index = float(0)
    if (BestOrWorst == "Best"):
        FormRatingAvg = float(100)
    else:
        FormRatingAvg = float(0)
    FormRatingList = []
    FormRatingListSort = []
    FormRatingEndList = []
    FormList = []
    FormEndList = []
    horsename = []
    selectionID = []

    eventTypeID = '["7"]'  #ID for Horse Racing
    countryCode = '["GB","IE"]'  #Country codes. Betfair uses ISO 3166-1 alpha-2 codes
    marketTypeCode = '["WIN"]'  #Market Type
    MarketStartTime = datetime.datetime.now().strftime(
        '%Y-%m-%dT%H:%M:%SZ')  #Event Start and End times
    MarketEndTime = (datetime.datetime.now() + datetime.timedelta(hours=24))
    MarketEndTime = MarketEndTime.strftime('%Y-%m-%dT%H:%M:%SZ')
    maxResults = str(1000)
    sortType = 'FIRST_TO_START'  #Sorts the Output
    Metadata = 'RUNNER_METADATA'  #Provides metadata
    inplay = 'false'  #still to run
    priceProjection = '["EX_BEST_OFFERS"]'  #Best odds

    #Create an empty dataframe
    d = {
        'Horse Name': [],
        'Horse Id': [],
        'Form': [],
        'Race': [],
        'Time': [],
        'Venue': [],
        'MarketId': [],
        'Odds': [],
        'Bet Placed': []
    }
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_colwidth', None)
    pd.set_option('expand_frame_repr', False)
    Results = pd.DataFrame(data=d)

    headers = {
        'X-Application': my_app_key,
        'X-Authentication': SSOID,
        'content-type': 'application/json'
    }

    user_req = '{"jsonrpc": "2.0", "method": "SportsAPING/v1.0/listMarketCatalogue",\
           "params": {"filter":{"eventTypeIds":' + eventTypeID + ',"marketTypeCodes":' + marketTypeCode + ',\
           "inPlayOnly":' + inplay + ', "marketCountries":' + countryCode + ',\
           "marketStartTime":{"from":"' + MarketStartTime + '", "to":"' + MarketEndTime + '"}},\
           "sort":"' + sortType + '", "maxResults":"' + maxResults + '", "marketProjection":["' + Metadata + '","MARKET_START_TIME","EVENT"]}, "id": 1}'

    #print (user_req)
    req = urllib.request.Request(bet_url,
                                 data=user_req.encode('utf-8'),
                                 headers=headers)
    response = urllib.request.urlopen(req)
    jsonResponse = response.read()
    pkg = jsonResponse.decode('utf-8')
    result = json.loads(pkg)
    marketCatelogue = result['result']

    for x in range(len(marketCatelogue)):
        for w in range(len(marketCatelogue[x]['runners'])):
            runnerform = marketCatelogue[x]['runners'][w]['metadata']['FORM']
            if runnerform is None:
                runnerform = 'e'

            runnerformrev = runnerform[::-1]

            runnerformList = list(runnerformrev)

            Index = float(0)
            Rating = float(0)
            factor = 4
            for Entry in runnerformList:
                if (factor > 1):
                    factor = factor - 1
                if Entry == 'R':  #refusal to jump hurdle
                    Rating = float(Rating) + (float(5) * float(factor))
                    Index = Index + factor
                elif Entry == 'e':  #First Race
                    Rating = float(Rating) + (float(10) * float(factor))
                    Index = Index + factor
                elif Entry == '0':  #finished higher than 9th
                    Rating = float(Rating) + (float(10) * float(factor))
                    Index = Index + factor
                elif Entry == 'F':  #fell
                    Rating = float(Rating) + (float(5) * float(factor))
                    Index = Index + factor
                elif Entry == 'U':  #unseated rider
                    Rating = float(Rating) + (float(3) * float(factor))
                    Index = Index + factor
                elif Entry == 'x':  #horse has not started in a race for 3 months or more
                    Rating = float(Rating) + (float(3) * float(factor))
                    Index = Index + factor
                elif Entry == 'C':  #horse has won before at this same race distance and track.
                    Rating = float(Rating) + (float(.5) * float(factor))
                    Index = Index + factor
                elif Entry == 'B':  #horse started favorite at its last start, but it did not win
                    Rating = float(Rating) + (float(3.5) * float(factor))
                    Index = Index + factor
                elif Entry == '/':  #represents two seasons ago
                    #Rating = float(Rating) + (float(8) * float(factor))
                    #Index = Index + factor
                    Index = Index
                elif Entry == '-':  #represents one season ago
                    #Rating = float(Rating) + (float(4) * float(factor))
                    Index = Index
                elif Entry == 'P':  #pulled up by jockey
                    Rating = float(Rating) + (float(4) * float(factor))
                    Index = Index + factor
                elif Entry == 'S':  #horse slipped up
                    Rating = float(Rating) + (float(4) * float(factor))
                    Index = Index + factor
                elif Entry == 'C':  #horse carried offcourse (unreachable: 'C' is already handled above)
                    Rating = float(Rating) + (float(4) * float(factor))
                    Index = Index + factor
                elif Entry == 'O':  #horse ran offcourse
                    Rating = float(Rating) + (float(10) * float(factor))
                    Index = Index + factor
                elif Entry == 'D':  #horse disqualified
                    Rating = float(Rating) + (float(7) * float(factor))
                    Index = Index + factor
                else:
                    try:
                        Rating = float(Rating) + (float(Entry) * float(factor))
                    except ValueError:  # non-numeric, unrecognised form character
                        Rating = float(Rating) + (float(5) * float(factor))
                    Index = Index + factor

            rating = float(Rating) / float(Index)
            FormList.append(runnerform)
            FormRatingList.append(rating)
            FormRatingListSort.append(rating)

        FormRatingListSort.sort()

        for zz in range(len(FormRatingListSort)):
            for t in range(len(FormRatingList)):
                if FormRatingList[t] == FormRatingListSort[zz]:
                    horsename.append(
                        marketCatelogue[x]['runners'][t]['runnerName'])
                    selectionID.append(
                        marketCatelogue[x]['runners'][t]['selectionId'])
                    FormRatingEndList.append(str(FormRatingList[t]))
                    FormEndList.append(FormList[t])

        try:
            price_req = '{"jsonrpc": "2.0", "method": "SportsAPING/v1.0/listRunnerBook", "params": {"locale":"en", \
                    "marketId":"' + str(marketCatelogue[x]['marketId']) + '",\
                    "selectionId":"' + str(selectionID[SelIndex]) + '",\
                    "priceProjection":{"priceData":' + priceProjection + '},"orderProjection":"ALL"},"id":1}'

            #print (price_req)
            req = urllib.request.Request(bet_url,
                                         data=price_req.encode('utf-8'),
                                         headers=headers)
            price_response = urllib.request.urlopen(req)
            price_jsonResponse = price_response.read()
            price_pkg = price_jsonResponse.decode('utf-8')
            price_result = json.loads(price_pkg)
            #print (price_result)

            #print (horsename)
            start_time = marketCatelogue[x]['marketStartTime']
            my_datetime = datetime.datetime.strptime(start_time,
                                                     '%Y-%m-%dT%H:%M:%S.000Z')
            StartTime = my_datetime.strftime('%H:%M')
            venue = marketCatelogue[x]['event']['venue']
            price = float(price_result['result'][0]['runners'][0]['ex']
                          ['availableToLay'][0]['price'])
            marketId = str(marketCatelogue[x]['marketId'])
            horseId = str(selectionID[SelIndex])
            if ((price < 10.0) and (placeBets == "y")):
                PlaceBet(SSOID, marketId, horseId, str(price), "2")
            betPlaced = CheckBet(SSOID, marketId)

            Results = Results.append(
                {
                    'Horse Name':
                    str(horsename[SelIndex]),
                    'Horse Id':
                    str(selectionID[SelIndex]),
                    'Form':
                    str(FormEndList[SelIndex]),
                    'Race':
                    str(marketCatelogue[x]['marketName']),
                    'Time':
                    str(StartTime),
                    'Venue':
                    str(venue),
                    'MarketId':
                    str(marketCatelogue[x]['marketId']),
                    'Odds':
                    str(price_result['result'][0]['runners'][0]['ex']
                        ['availableToLay'][0]['price']),
                    'Bet Placed':
                    betPlaced
                },
                ignore_index=True)
        except Exception:
            # Skip this market if the price lookup or bet placement fails
            pass

        Rating = float(0)
        Index = float(0)
        if (BestOrWorst == "Best"):
            FormRatingAvg = float(100)
        else:
            FormRatingAvg = float(0)
        FormRatingList.clear()
        FormRatingListSort.clear()
        FormList.clear()
        FormRatingEndList.clear()
        FormEndList.clear()
        horsename.clear()
        selectionID.clear()

    return Results
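
# A condensed sketch of the form-string weighting applied in HorseForm above, with the
# per-character penalties collected into a lookup table (values mirror the reachable
# branches above; '/' and '-' separate seasons and carry no weight). The function name
# and structure are illustrative, not part of the original code.
def form_rating(form):
    """Weighted average penalty over a runner's recent form, most recent run weighted highest."""
    penalties = {'R': 5, 'e': 10, '0': 10, 'F': 5, 'U': 3, 'x': 3,
                 'C': 0.5, 'B': 3.5, 'P': 4, 'S': 4, 'O': 10, 'D': 7}
    rating, index, factor = 0.0, 0.0, 4
    for entry in reversed(form or 'e'):   # most recent character first
        if factor > 1:
            factor -= 1
        if entry in ('/', '-'):
            continue
        if entry in penalties:
            rating += penalties[entry] * factor
        else:
            try:
                rating += float(entry) * factor   # a finishing position 1-9
            except ValueError:
                rating += 5 * factor
        index += factor
    return rating / index if index else float('nan')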
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm, tree, decomposition, ensemble, preprocessing
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.random_projection import sparse_random_matrix
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2,mutual_info_classif,RFE
from cross_validation import KFoldCrossValidation
from naive_bayes import BernoulliNaiveBayes

# nltk.download('wordnet')
pd.set_option('display.max_columns', None)
pd.set_option("display.precision", 3)

data_train = pd.read_csv('./data/reddit_train.csv')

train_com = data_train.comments
label = data_train.subreddits


stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

# ##############################################################################################
# #                               Noise Reduction
# ##############################################################################################
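# A sketch of one possible noise-reduction step built from the objects defined above
# (regex tokenization and this exact pipeline are assumptions, not the original code).
import re

def clean_comment(text):
    tokens = re.findall(r'[a-z]+', text.lower())          # keep alphabetic tokens only
    tokens = [t for t in tokens if t not in stopwords]    # drop English stopwords
    tokens = [lemmatizer.lemmatize(t) for t in tokens]    # normalise word forms
    return ' '.join(tokens)

# e.g. cleaned = train_com.apply(clean_comment)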
Esempio n. 50
0
import pickle as pk
from sklearn.metrics import confusion_matrix, accuracy_score,average_precision_score,classification_report,f1_score
#import urllib.parse
from werkzeug.utils import secure_filename
import matplotlib.pyplot as plt
import time
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import seaborn as sns
from flask_restful import Api,Resource
from flask import jsonify
import json

UPLOAD_FOLDER = './uploads_f/'#'/uploads_f'
ALLOWED_EXTENSIONS = set(['tsv','csv'])
pd.set_option('display.max_colwidth', -1)
app = Flask(__name__)
api=Api(app)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER




@app.route('/predictsap', methods=["GET","POST","PUT"])
def summary():
    if request.method == "POST":
        if request.get_json():
            req = request.get_json()
            dataframe = pd.DataFrame.from_dict(req, orient="index")
            print(dataframe)
            dataframe['Debit_CreditInd.']=dataframe['Debit_CreditInd.'].apply(lambda x: 1 if x == 'S' else 0)
Esempio n. 51
0
# coding: utf-8

# # Data types and missing data reference
# 
# This is the reference section of the "Data types and missing data" section of the tutorial. For the workbook, click [here](https://www.kaggle.com/residentmario/data-types-and-missing-data-workbook).
# 
# In this short section we will look at two inter-related concepts, data types and missing data. This section draws from the [Intro to data structures](https://pandas.pydata.org/pandas-docs/stable/dsintro.html) and [Working with missing data](https://pandas.pydata.org/pandas-docs/stable/missing_data.html) sections of the comprehensive official tutorial.

# In[ ]:


import pandas as pd
reviews = pd.read_csv("../input/winemag-data-130k-v2.csv", index_col=0)
pd.set_option('max_rows', 5)


# ## Data types
# 
# The data type for a column in a `DataFrame` or a `Series` is known as the `dtype`.
# 
# You can use the `dtype` property to grab the type of a specific column:

# In[ ]:


reviews.price.dtype


# Alternatively, the `dtypes` property returns the `dtype` of _every_ column in the dataset:
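
# For example:

# In[ ]:


reviews.dtypes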
Esempio n. 52
0
import random

import os
import pandas as pd
from logbook import TestHandler
from pandas.util.testing import assert_frame_equal

from catalyst import get_calendar
from catalyst.exchange.exchange_asset_finder import ExchangeAssetFinder
from catalyst.exchange.exchange_data_portal import DataPortalExchangeBacktest
from catalyst.exchange.utils.exchange_utils import get_candles_df
from catalyst.exchange.utils.factory import get_exchange
from catalyst.exchange.utils.test_utils import output_df, \
    select_random_assets

pd.set_option('display.expand_frame_repr', False)
pd.set_option('precision', 8)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)


class TestSuiteBundle:
    @staticmethod
    def get_data_portal(exchanges):
        open_calendar = get_calendar('OPEN')
        asset_finder = ExchangeAssetFinder(exchanges)

        exchange_names = [exchange.name for exchange in exchanges]
        data_portal = DataPortalExchangeBacktest(
            exchange_names=exchange_names,
            asset_finder=asset_finder,
import pandas as pd
from scipy.spatial.distance import euclidean
import numpy as np
from src.data_labels import APARTMENT_WALKS_BAD, APARTMENT_WALKS_IBO, APARTMENT_WALKS_RICCI
from src.fast_dtw import dtw_plot
from fastdtw import fastdtw
from src.sensor import SensorData
import matplotlib.pyplot as plt
import os
PANDAS_WIDTH = 150
pd.set_option('display.width', PANDAS_WIDTH)

if __name__ == '__main__':
    data_sets = [
        APARTMENT_WALKS_BAD, APARTMENT_WALKS_RICCI, APARTMENT_WALKS_IBO
    ]
    trainings = []
    tests = []

    # populate trainings and tests with our files
    for chosen_folder in data_sets:
        files = os.listdir(chosen_folder)
        get_test = True
        for csv_file in files:
            name, ext = os.path.splitext(csv_file)
            if ext != '.csv':
                continue
            csv_file = os.path.join(chosen_folder, csv_file)
            sensordata = SensorData(csv_file)
            if get_test:
                print('Added to Test', name)
Esempio n. 54
0
                       between='fb_type',
                       padjust='fdr_bh')

fig, axes = plt.subplots(2, 2, figsize=(9, 4))
metric_types = ['magnitude', 'n_spindles', 'amplitude', 'duration']

p_all = np.zeros((4, 4))
for j_metric_type, metric_type in enumerate(metric_types):

    df_metric_type = stats_df_all.query(
        'metric_type=="{}"'.format(metric_type))
    for j_fb_type, fb_type in enumerate(fb_types):
        ax = axes[j_metric_type // 2, j_metric_type % 2]
        df = df_metric_type.query('fb_type=="{}"'.format(fb_type))

        pd.set_option('display.max_columns', 500)
        res = ttest(df.query('baseline=="After"')['metric'],
                    df.query('baseline=="Before"')['metric'],
                    paired=True)
        # res = pairwise_ttests(df, dv='metric', within='baseline', subject='subj_id')
        p = res['p-val'].values[0]
        p_all[j_fb_type, j_metric_type] = p
        res_str = '$p_u$={:.3f}\n'.format(
            p) + r'$Diff_{CI95}$=' + '[{}, {}]'.format(*res['CI95%'].values[0])

        x_before = df.query('baseline=="Before"')['metric'].values
        x_after = df.query('baseline=="After"')['metric'].values
        for j in range(len(x_before)):
            pair = np.array([x_before[j], x_after[j]])
            ax.plot(np.array([0, 2]) + 3 * j_fb_type,
                    pair,
 def train_predict(self, data, time_budget,n_class,schema):
     s1 = time.time()
     seed = SEED
     fix_seed(seed)
     LOGGER.info(f'time_budget:{time_budget}')
     LOGGER.info(f'n_class:{n_class}')
     LOGGER.info(f'node:{data["fea_table"].shape[0]}')
     LOGGER.info(f'edge:{data["edge_file"].shape[0]}')
     
     #pre-process data
     process_data = ProcessData(data)
     table = process_data.pre_process(time_budget,n_class,schema)
     
     # Feature Dimension Reduction
     feat = Feat()
     
     process_data.drop_unique_columns(table)
     drop_sum_columns = process_data.drop_excessive_columns(table)
     
     feat.fit_transform(table,drop_sum_columns)
     LOGGER.info(f'train:test={(table.df["is_test"]!=1).sum()}:{(table.df["is_test"]==1).sum()}')
     
     # this doesn't seem to be used anywhere
     table.large_features = False
     if table.ori_columns.shape[0]>500:
         table.large_features = True
     
     model_type_list = ['sage','gat','tagc','gcn']
     
     repeat = 5
     model_name_list = [f'{model_type_list[i]}{i+len(model_type_list)*j}' for j in range(repeat) for i in range(len(model_type_list))]
     model_type_list = model_type_list*repeat
     
     LOGGER.info('use node embedding')
     categories = ['node_index','degree_bins','bin_2-neighbor_mean_degree_bins']        
     
     for model in set(model_type_list):
         LOGGER.info(f"""{model} feature num:{eval(f'table.{model}_columns.shape[0]')}""")
         exec(f'table.{model}_data = process_data.process_gnn_data(table,table.{model}_columns,categories)') 
     
     allmodel = AllModel()
     
     table.lr_epoch = 16
     
     table.lr_list = [0.05,0.03,0.01,0.0075,0.005,0.003,0.001,0.0005]
     
     train_valid_idx_list,valid_idx_list = split_train_and_valid(table,train_rate=0.8,seed=SEED,mode=split_mode)
     train_idx,test_idx = split_train_and_test(table)
     
     test_idx = test_idx.sort_values()
     run_model = []
     run_type = []
     run_time = {}
     for i in range(len(model_type_list)):
         seed = SEED*(i+1)
         fix_seed(seed)
         model_type = model_type_list[i]
         model_name = model_name_list[i]
         if model_type not in run_time:
             init_time, one_epoch_time, early_stopping_rounds = allmodel.get_run_time(table, model_type, model_name, train_idx,test_idx, seed=seed)
             run_lr_time = len(table.lr_list)*(init_time+table.lr_epoch*one_epoch_time)
             run_time500 = init_time*(2)+one_epoch_time*(500+early_stopping_rounds)*2+run_lr_time
             run_time300 = init_time*(2)+one_epoch_time*(300+early_stopping_rounds)*2+run_lr_time
             run_time150 = init_time*(2)+one_epoch_time*(150+early_stopping_rounds)*2+run_lr_time
             run_time[model_type] = (run_time500-run_lr_time,run_time300-run_lr_time,run_time150-run_lr_time,early_stopping_rounds,init_time,one_epoch_time,run_lr_time)
         else:
             run_time500,run_time300,run_time150,early_stopping_rounds,init_time,one_epoch_time,run_lr_time = run_time[model_type]
         s2 = time.time()
         LOGGER.info(f"time_budget:{time_budget}s,used time:{s2-s1:.2f}s,{model_name} model will use {run_time500:.2f}s|{run_time300:.2f}s|{run_time150:.2f}s")
         if s2-s1+run_time500+5<time_budget:
             LOGGER.info('train 500 epoch')
             allmodel.V37_fit_transform(table, model_type, model_name,train_valid_idx_list,valid_idx_list,train_idx,test_idx,mode=split_mode,num_boost_round=500,seed=seed)
             run_model.append(model_name)
             run_type.append(model_type)
         elif s2-s1+run_time300+5<time_budget:
             LOGGER.info('train 300 epoch')
             allmodel.V37_fit_transform(table, model_type, model_name,train_valid_idx_list,valid_idx_list,train_idx,test_idx,mode=split_mode,num_boost_round=300,seed=seed)
             run_model.append(model_name)
             run_type.append(model_type)
         elif s2-s1+run_time150+5<time_budget:
             LOGGER.info('train 150 epoch')
             allmodel.V37_fit_transform(table, model_type, model_name,train_valid_idx_list,valid_idx_list,train_idx,test_idx,mode=split_mode,num_boost_round=150,seed=seed)
             run_model.append(model_name)
             run_type.append(model_type)
         elif len(allmodel.valid_models[0])==0:
             this_epoch = int(((time_budget-(s2-s1+5)-run_lr_time)/2-init_time)/(one_epoch_time)-early_stopping_rounds)
             LOGGER.info(f'short time train {this_epoch} epoch')
             allmodel.V37_fit_transform(table, model_type, model_name,train_valid_idx_list,valid_idx_list,train_idx,test_idx,mode=split_mode,num_boost_round=this_epoch,seed=seed)
             run_model.append(model_name)
             run_type.append(model_type)
         elif time_budget-(s2-s1)<5:
             LOGGER.info('never train; break')
             break
         else:
             LOGGER.info('no train this model; continue')
             continue
     
         
     if offline:
         if table.especial:
             df = table.df[['node_index','is_test']]
             df = df.merge(data['test_label'],how='left',on='node_index')
             test_label = df.loc[(df['is_test']==1)&(table.directed_mask.tolist()),'label'].astype('int').values     
         else:
             test_label = data['test_label']['label'].values
     else:
         test_label = None
         
     
     preds1,valid_acc1 = get_preds(0,run_model,run_type,allmodel,model_name_list,table,test_label,valid_idx_list)
     preds2,valid_acc2 = get_preds(1,run_model,run_type,allmodel,model_name_list,table,test_label,valid_idx_list)
     preds = (preds1+preds2)/2
     
     preds = preds.argmax(axis=1).flatten()
     
         
     if table.especial:
         LOGGER.info(f'preds\n{preds}')
         df = table.df[['label','is_test']]
         df['preds'] = int(df.loc[[not i for i in table.directed_mask.tolist()],'label'].value_counts().index[0])
         df.loc[(df['is_test']==1)&(table.directed_mask.tolist()),'preds'] = preds
         preds = df.loc[df['is_test']==1,'preds'].values
     
     LOGGER.info(f"train label\n{data['train_label']['label'].value_counts()/data['train_label'].shape[0]}")
     df_preds = pd.Series(preds,name='preds')
     LOGGER.info(f"preds label\n{df_preds.value_counts()/df_preds.shape[0]}")
     
     if offline:
         preds1 = preds1.argmax(axis=1).flatten()
         preds2 = preds2.argmax(axis=1).flatten()
         if table.especial:
             LOGGER.info(f'preds1\n{preds1}')
             df = table.df[['label','is_test']]
             df['preds'] = int(df.loc[[not i for i in table.directed_mask.tolist()],'label'].value_counts().index[0])
             df.loc[(df['is_test']==1)&(table.directed_mask.tolist()),'preds'] = preds1
             preds1 = df.loc[df['is_test']==1,'preds'].values
             
             LOGGER.info(f'preds2\n{preds2}')
             df = table.df[['label','is_test']]
             df['preds'] = int(df.loc[[not i for i in table.directed_mask.tolist()],'label'].value_counts().index[0])
             df.loc[(df['is_test']==1)&(table.directed_mask.tolist()),'preds'] = preds2
             preds2 = df.loc[df['is_test']==1,'preds'].values
         
         df_test = table.df[['degree','label','is_test']]
         df_test = df_test.loc[df_test['is_test']==1]
         df_test['preds'] = preds
         df_test['label'] = data['test_label']['label'].values
         df_test['acc'] = df_test['preds']==df_test['label']
         
         pd.set_option('display.max_rows', 1000)
         print(df_test.groupby('degree')['acc'].mean())
         
         return preds,valid_acc1,valid_acc2,preds1,preds2
     else:
         return preds
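
# A condensed sketch of the epoch-budget rule used in the training loop above: pick the
# largest boosting-round count whose estimated runtime, plus a 5-second safety margin,
# still fits in the remaining time budget. The helper name and return convention are
# illustrative, not part of the original code.
def choose_num_boost_round(remaining_seconds, run_time500, run_time300, run_time150):
    if run_time500 + 5 < remaining_seconds:
        return 500
    if run_time300 + 5 < remaining_seconds:
        return 300
    if run_time150 + 5 < remaining_seconds:
        return 150
    return None  # fall back to the short-train / skip logic above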
Esempio n. 56
0
                                     key=lambda x: x not in mysubset)]

    def load_data(self, path: str):
        self.data = pd.read_csv(path, sep=';', encoding='cp1251', skiprows=1)
        self._formatting_data()

    def call_stat_by_day(self) -> pd.DataFrame:
        """
        Per-day call statistics for each manager.
        :return: grouped DataFrame with call count, total duration and mean duration
        """
        data = self.data.groupby([self.data['Дата'], self.data['Кто звонил']])['Длительность, сек'].\
            agg(['count', 'sum', 'mean']).\
            rename(columns={'count': 'Кол-во звонков', 'sum': 'Общая длит. звонков', 'mean': 'Сред. длит. звонков'})
        # Round the mean call duration to two decimal places
        data['Сред. длит. звонков'] = data['Сред. длит. звонков'].round(2)
        return data


if __name__ == '__main__':
    pd.set_option("display.max.columns", None)
    dframe = PandasDataFrame()
    dframe.load_data('static.csv')
    df = dframe.call_stat_by_day().reset_index()

    print(df)

    df.plot(x=['Дата', 'Кто звонил'], y="Кол-во звонков")
    plt.show()
import nltk
import pandas as pd

# Remove Punctuation
# Read dataset
pd.set_option('display.max_colwidth', 100)  # so longer messages display in full
data = pd.read_csv('../data/smsspamcollection/SMSSpamCollection', \
    sep='\t', header=None)

data.columns = ['label', 'msg']
print('set columns => data.head():')
print(data.head())

import string
print('\nstring.punctuation:', string.punctuation)


def remove_punctuation(txt):
    # keep every character in txt that is not punctuation
    txt_nopunct = "".join([c for c in txt if c not in string.punctuation])
    return txt_nopunct


data['msg_clean'] = data['msg'].apply(lambda x: remove_punctuation(x))
print('\nremove punctuation => data.head():')
print(data.head())

# Tokenization
import re
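
# A sketch of the tokenization step this section sets up: split the cleaned text on
# non-word characters (the exact tokenizer is an assumption, not the original code).
def tokenize(txt):
    return re.split(r'\W+', txt.lower())

data['msg_tokenized'] = data['msg_clean'].apply(lambda x: tokenize(x))
print('\ntokenize => data.head():')
print(data.head())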
Esempio n. 58
0
def labelquery():
    query = "select ?s ?p ?o where { ?s ?p ?o. } limit 10"
    namespace = request.cookies.get('namespace',
                                    os.environ.get("ANNOTATION_NAMESPACE"))
    template = ''
    slots = ''
    results = {}

    current_labelquery_type = request.args.get("showtype",
                                               list(labelquery_types)[0],
                                               type=str)

    modeldir = os.environ.get("MODELDIR")
    labelqueries, templates = load_labelqueries_templates(
        modeldir, filter_disabled=False)

    if request.method == 'POST':
        form = dict(request.form)
        print(form)
        action = request.form.get('action')

        if action in [
                'query', 'count', 'setquery', 'save-template',
                'save-labelquery'
        ]:
            template = request.form.get('template')
            slots = request.form.get('slots')
            label = current_labelquery_type
            value = 'http://karmaresearch.net/' + request.form.get('value', '')
            query = request.form.get('query')
            if not query:
                query = templates[request.form.get('template')]

        if action in ['query', 'count']:
            fmt_query = fill_template(query, slots)
            print(fmt_query)
            response = do_sparql(os.environ.get("KB"), fmt_query, namespace)
            if response.ok:
                results = response.json()
                if action == 'count':
                    qid = request.form.get('qid')
                    var_path_template = annotate.labelquery_types[label][
                        'path']
                    n = len(
                        LabelQuery.transformations_from_results(
                            response.json(), var_path_template, value))
                    labelqueries[qid]['scores']['n_results'] = n
                    save_labelqueries_templates(modeldir, labelqueries,
                                                templates)
            else:
                print(response.text)

        if action == 'save-template':
            templates[template] = query

            # (re-)calculate all scores for labelqueries with this template
            basedir = os.environ.get("BASEDIR")
            annotationdir = os.environ.get("ANNOTATIONDIR")
            namespace = os.environ.get("ANNOTATION_NAMESPACE")
            for qid, lq in labelqueries.items():
                if lq['template'] == template:
                    scores = eval_labelquery(templates[lq['template']],
                                             lq['slots'],
                                             lq['label'],
                                             lq['value'],
                                             basedir,
                                             annotationdir,
                                             namespace,
                                             name='')
                    print(qid, scores)
                    labelqueries[qid]['scores'] = scores

        if action == 'build-cache':
            # (re-)calculate all scores for labelqueries of this labeltype
            basedir = os.environ.get("BASEDIR")
            annotationdir = os.environ.get("ANNOTATIONDIR")
            namespace = os.environ.get("ANNOTATION_NAMESPACE")
            for qid, lq in labelqueries.items():
                if lq['label'] == current_labelquery_type and lq[
                        'template'] in templates:
                    scores = eval_labelquery(templates[lq['template']],
                                             lq['slots'],
                                             lq['label'],
                                             lq['value'],
                                             basedir,
                                             annotationdir,
                                             namespace,
                                             name='')
                    print(qid, scores)
                    labelqueries[qid]['scores'] = scores

        if action == 'delete-labelquery':
            qid = request.form.get('qid')
            del labelqueries[qid]

        if action == 'save-labelquery':
            qid = request.form.get('qid')
            enabled = request.form.get('enabled')
            labelqueries[qid] = {
                'template': template,
                'slots': slots,
                'label': label,
                'value': value,
                'enabled': enabled,
            }
            if request.form.get('template') not in templates:
                templates[request.form.get('template')] = ''

            # (re-)calculate labelquery scores
            basedir = os.environ.get("BASEDIR")
            annotationdir = os.environ.get("ANNOTATIONDIR")
            namespace = os.environ.get("ANNOTATION_NAMESPACE")
            scores = eval_labelquery(templates[template],
                                     slots,
                                     label,
                                     value,
                                     basedir,
                                     annotationdir,
                                     namespace,
                                     name='')
            print('re-calculated', scores)
            labelqueries[qid]['scores'] = scores

        if action in [
                'save-template', 'delete-labelquery', 'save-labelquery',
                'build-cache'
        ]:
            # Save the queries themselves, including scores that have possibly been updated
            save_labelqueries_templates(modeldir, labelqueries, templates)

            # Run the updated queries on the gold namespace and save the results
            os.makedirs(os.path.join(modeldir, 'labelqueries', 'cache'),
                        exist_ok=True)
            results_fname = os.path.join(modeldir, 'labelqueries', 'cache',
                                         'gold.json')
            namespace = os.environ.get("ANNOTATION_NAMESPACE")
            kbdomain = os.environ.get("KB")
            if action == 'save-template':
                selected_queries = [
                    str(qid) for qid, lq in labelqueries.items()
                    if lq['template'] == template
                ]
            elif action == 'build-cache':
                selected_queries = [
                    str(qid) for qid, lq in labelqueries.items()
                    if lq['label'] == current_labelquery_type
                    and lq['template'] in templates
                ]
            else:
                selected_queries = [str(qid)]
            import supervise
            labelquery_results = supervise.cache_labelquery_results(
                modeldir,
                namespace,
                kbdomain,
                selected_queries=selected_queries,
                results_fname=results_fname,
                verbose=True)

            # Save the entire labeling matrix using the query results
            basedir = os.environ.get("BASEDIR")
            annotationdir = os.environ.get("ANNOTATIONDIR")
            labeled_metas = list(annotate.get_metadata(basedir, annotationdir))
            supervise.save_query_analysis(modeldir, labeled_metas,
                                          current_labelquery_type)

    view = request.args.get("view", None, type=str)
    if view:
        os.makedirs(os.path.join(modeldir, 'labelqueries', 'cache'),
                    exist_ok=True)
        view_fname = os.path.join(modeldir, 'labelqueries', 'cache',
                                  f'{current_labelquery_type}-{view}.csv')
        if os.path.exists(view_fname):
            pd.set_option('display.max_colwidth', -1)
            view = pd.read_csv(view_fname)
            view.columns = [
                c + '<a class="sort_btn" href="#"/>' for c in view.columns
            ]

            def make_url(c):
                c = 'http://karmaresearch.net/' + c if c != 'None' else c
                return url_for('view',
                               label=c,
                               showtype=current_labelquery_type)

            view[view.columns[0]] = [
                f'<a href="{make_url(c)}">{c}</a>'
                for c in view[view.columns[0]]
            ]
            view = view.to_html(index=False,
                                float_format='%.2f',
                                border=0,
                                escape=False,
                                classes=['sortable'])
        else:
            view = f'{view} file does not exist'

    return render_template(
        'labelquery.html',
        query=query,
        template=template,
        slots=slots,
        view=view,
        results=results,
        labelqueries=labelqueries,
        labelquery_types=labelquery_types,
        current_labelquery_type=current_labelquery_type,
        templates=templates,
        classes=sorted([
            c.replace('http://karmaresearch.net/', '') for c in get_classes()
        ]),
        new_qid=str(max([int(i) for i in labelqueries], default=0) + 1),
        namespace=request.cookies.get('namespace',
                                      os.environ.get("ANNOTATION_NAMESPACE")),
        all_namespaces=get_namespaces(os.environ.get("KB")),
    )
Esempio n. 59
0
import numpy as np
import tensorflow as tf
from datetime import datetime
import os 
import sys
import random
import math
import numpy as np
import cv2
import matplotlib.pyplot as plt
plt.rcParams["font.size"] = 15
#import seaborn as sns
import json
from tqdm import tqdm
import pandas as pd
pd.set_option("display.max_rows", 101)
import glob
from collections import Counter
from PIL import Image

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"


#input_dir = "./advancedML/data/"
'''
One dataset is around 200 MB or more.
The size of one batch is mainly determined by the batch size, the image size and the maximum object number:
batch size = IMAGES_PER_GPU * GPU_COUNT in config.py
image size is controlled by MAX_IMG_DIM, MIN_IMG_DIM in config.py
maximum object number = MAX_GT_OBJECTS in config.py
Esempio n. 60
0
                num_vow += 1
            else:
                num_con += 1
            if (num_con > 0 and num_vow > 0) and (num_con > 1 or num_vow > 1):
                break
            j += 1
        part2 = name2[j:len(name2)]

    new_name = part1 + part2
    return new_name


# Formatting to allow printing an entire dataframe in PyCharm
pd.options.display.width = None
pd.options.display.max_columns = None
pd.set_option('display.max_rows', 42)
pd.set_option('display.max_columns', 42)

# Creatures the player will select from
lion = Creature("Lion", 240, 4, False, True, False, False)
python = Creature("Python", 30, 0, False, True, True, False)
dog = Creature("Dog", 90, 4, False, True, False, False)
human = Creature("Human", 160, 2, False, False, False, False)
trout = Creature("Trout", 3, 0, False, True, True, False)
eagle = Creature("Eagle", 10, 2, True, True, False, False)
dragon = Creature("Dragon", 2700, 4, True, True, True, True)
ant = Creature("Ant", 0.0000022046, 6, False, False, False, False)
octopus = Creature("Octopus", 80, 8, False, False, True, False)

creatures = [lion, python, dog, human, trout, eagle, dragon, ant, octopus]