def main():
    """Train the digit neural network on the preprocessed digits CSV.

    Loads 'data/processed_digits.csv' through ``create_dataframe``, splits it
    with ``process_data`` into train/validation/test features and labels, and
    fits a ``DigitNeuralNetwork`` (100 epochs, batch size 32) on those splits.
    """
    digits_frame = create_dataframe('data/processed_digits.csv')

    # process_data is expected to yield exactly six pieces, in this order.
    (train_x, train_y,
     valid_x, valid_y,
     test_x, test_y) = process_data(digits_frame)

    network = DigitNeuralNetwork(epochs=100, batch_size=32)
    network.fit(train_x, train_y, valid_x, valid_y, test_x, test_y)
def main():
    """Check every bearing of a user-selected test set with the saved GMM.

    Prompts for the dataset directory and folder name, computes the five
    maximum-frequency components with ``cal_max_freq``, loads the model stored
    in "GMM_all.npy" and, for each bearing, predicts a label per sample.  The
    share of label ``2`` among the last 100 predictions is printed; a bearing
    is reported as suspected to fail when that share is at least 50.  The last
    100 labels of every bearing are finally passed to ``plotlabels`` and shown.

    If the directory cannot be read the error message is printed and the
    function returns without doing any prediction.
    """
    try:
        # user input for the path of the dataset
        filedir = input("enter the complete directory path ")
        filepath = input("enter the folder name")
        # load the files
        all_files = os.listdir(filedir)
        (freq_max1, freq_max2, freq_max3, freq_max4,
         freq_max5) = cal_max_freq(all_files, filepath)
    except IOError:
        print("you have entered either the wrong data directory path or filepath")
        # Without the frequency data nothing below can run; continuing would
        # only fail later with a NameError on the unbound variables.
        return

    # load the model
    # The file holds a pickled Python object wrapped in a 0-d array, which
    # NumPy >= 1.16.3 refuses to load unless allow_pickle is enabled.
    # NOTE: unpickling executes arbitrary code - only load trusted model files.
    filename = "GMM_all.npy"
    model = np.load(filename, allow_pickle=True).item()

    # checking the iteration: the "1st_test/" set is scanned over 8 indices,
    # every other set over 4.
    rhigh = 8 if filepath == "1st_test/" else 4

    print("for the testset", filepath)
    prediction_last_100 = []
    for i in range(rhigh):
        # making the dataframe
        X = create_dataframe(freq_max1, freq_max2, freq_max3, freq_max4,
                             freq_max5, i)
        print("checking for the bearing", i + 1)
        label = model.predict(X)
        last_100 = label[-100:]
        check_two = list(last_100).count(2)
        ratio = (check_two / 100) * 100
        print("prediction", ratio)
        if ratio >= 50:
            print("bearing is suspected to fail")
        else:
            print("bearing is working in normal condition")
        prediction_last_100.append(last_100)

    plotlabels(prediction_last_100)
    plt.show()
def main():
    """
    Build the descriptor dataset and split it into train/validation/test sets.

    The function joins the SMILES and InChI compound tables, cleans the
    AID_743255 activity table, merges both on 'CID', shuffles the rows,
    appends the descriptors computed by ``utils.extract_all_descriptors`` and
    writes the result to 'data/descriptor_data.csv'.  It then validates
    ``TARGET_COLUMN`` and separates features from the target for each split.

    @:param: None
    :raises ValueError: if the dataframe or TARGET_COLUMN is None, or if
        TARGET_COLUMN is not a column of the dataframe
    :raises TypeError: if the dataframe is not a pandas DataFrame or
        TARGET_COLUMN is not a string

    NOTE(review): the splits are only bound to local names in the code shown
    here; confirm how they are meant to be consumed.
    """
    # The SMILES and InChI logs of the same material have identical indices
    # Creating and joining the SMILES and InChI dataframes along the same index
    df_compounds_smiles = utils.create_dataframe(
        'data/chemical_notation_data/'
        'compounds_smiles.txt', 'smiles')
    df_compounds_inchi = utils.create_dataframe(
        'data/chemical_notation_data/'
        'compounds_inchi.txt', 'inchi')
    df_compounds = pd.concat(
        [df_compounds_smiles, df_compounds_inchi['INCHI']],
        axis=1).rename(columns={'ID': 'CID'})

    activity = pd.read_csv('data/activity_data/AID_743255_datatable.csv')
    # Rows labelled 0-4 are discarded before any further processing.
    activity = activity.drop(list(range(5)), axis=0)
    activity = activity.drop([
        'PUBCHEM_ACTIVITY_URL', 'PUBCHEM_RESULT_TAG',
        'PUBCHEM_ACTIVITY_SCORE', 'PUBCHEM_SID', 'PUBCHEM_ASSAYDATA_COMMENT',
        'Potency', 'Efficacy', 'Analysis Comment', 'Curve_Description',
        'Fit_LogAC50', 'Fit_HillSlope', 'Fit_R2', 'Fit_InfiniteActivity',
        'Fit_ZeroActivity', 'Fit_CurveClass', 'Excluded_Points', 'Compound QC',
        'Max_Response', 'Activity at 0.457 uM', 'Activity at 2.290 uM',
        'Activity at 11.40 uM', 'Activity at 57.10 uM',
        'PUBCHEM_ACTIVITY_OUTCOME'
    ], axis=1)
    activity.rename(columns={'PUBCHEM_CID': 'CID'}, inplace=True)
    # Keep only the first record of every compound id.
    activity = activity.drop_duplicates('CID')

    df_compounds = df_compounds.sort_values(by='CID')
    activity = activity.sort_values(by='CID')
    df = activity.merge(df_compounds)
    df = df.sort_values(by='CID')
    # Shuffle the rows and rebuild a clean 0..n-1 index so the positional
    # join with the descriptor frame below lines up.
    df = df.sample(frac=1).reset_index(drop=True)
    df_descriptor = utils.extract_all_descriptors(df, 'SMILES')
    df = df.join(df_descriptor)
    df.to_csv('data/descriptor_data.csv')

    # Type check inputs for sanity
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3 has no basestring
    if df is None:
        raise ValueError('df is None')
    if not isinstance(df, pd.DataFrame):
        raise TypeError('df is not a dataframe')
    if TARGET_COLUMN is None:
        raise ValueError('target_column is None')
    if not isinstance(TARGET_COLUMN, string_types):
        raise TypeError('target_column is not a string')
    if TARGET_COLUMN not in df.columns:
        raise ValueError('target_column (%s) is not a valid column name'
                         % TARGET_COLUMN)

    # Train, validation and test split
    # 25% is held out for testing; a third of the remainder (25% of the
    # whole) becomes the validation set, leaving about 50% for training.
    df_train, df_test = train_test_split(df, test_size=0.25)
    df_train, df_val = train_test_split(df_train, test_size=0.333333)

    # Remove the classification column from the dataframe
    # (axis passed by keyword: pandas >= 2.0 rejects it positionally)
    x_train = df_train.drop(TARGET_COLUMN, axis=1)
    x_val = df_val.drop(TARGET_COLUMN, axis=1)
    x_test = df_test.drop(TARGET_COLUMN, axis=1)
    y_train = pd.DataFrame(df_train[TARGET_COLUMN])
    y_val = pd.DataFrame(df_val[TARGET_COLUMN])
    y_test = pd.DataFrame(df_test[TARGET_COLUMN])
for i in range(no_of_bearings): bearing_list.append(i) testing_bearings = list(np.setdiff1d(bearing_list, training_bearings)) # Load the model filename = "GMM_all.npy" model = np.load(filename).item() print("Testing ", dbname, "...") prediction_last_100 = [] for k in testing_bearings: x = create_dataframe(frequency_component1, frequency_component2, frequency_component3, frequency_component4, frequency_component5, k) print("\nTesting bearing ", k + 1) label = model.predict(x) check_two = list(label[-100:]).count(2) ratio = (check_two / 100) * 100 json_body = [{ "measurement": "GMM", "tags": { "type": "B" + str(k + 1), }, "fields": { "failure_ratio": ratio,
testset2_freq_dataframes = [ 'testset2_freq_comp1', 'testset2_freq_comp2', 'testset2_freq_comp3', 'testset2_freq_comp4', 'testset2_freq_comp5' ] testset2_length = int(len(testset2_temporary_df) / 5) for i, c in enumerate(testset2_freq_dataframes, 0): exec( '{} = testset2_temporary_df.iloc[i*testset2_length:i*testset2_length+testset2_length, : ]' .format(c, i)) # Labelling the bearings which are failed testset1_labelF = cal_Labels(testset1_length) testset2_labelF = cal_Labels(testset2_length) result1 = create_dataframe(testset1_freq_comp1, testset1_freq_comp2, testset1_freq_comp3, testset1_freq_comp4, testset1_freq_comp5, 7) result1['labels'] = testset1_labelF result2 = create_dataframe(testset2_freq_comp1, testset2_freq_comp2, testset2_freq_comp3, testset2_freq_comp4, testset2_freq_comp5, 0) result2['labels'] = testset2_labelF # Labelling the bearings which are passed testset1_labelP = np.array([0] * 1800) testset2_labelP = np.array([0] * 800) result3 = create_dataframe(testset1_freq_comp1, testset1_freq_comp2, testset1_freq_comp3, testset1_freq_comp4, testset1_freq_comp5, 2)
except IOError: print( "you have entered either the wrong data directory path for either testset1 or testset2" ) testset1_freq_max1, testset1_freq_max2, testset1_freq_max3, testset1_freq_max4, testset1_freq_max5 = cal_max_freq( all_files_testset1, path_testset1) testset2_freq_max1, testset2_freq_max2, testset2_freq_max3, testset2_freq_max4, testset2_freq_max5 = cal_max_freq( all_files_testset2, path_testset2) # calculating the labels for the bearing which is failed testset1_labelF = cal_Labels(len(all_files_testset1)) testset2_labelF = cal_Labels(len(all_files_testset2)) # creatine a datafRame for which bearing has failed result1 = create_dataframe(testset1_freq_max1, testset1_freq_max2, testset1_freq_max3, testset1_freq_max4, testset1_freq_max5, 1) result1['labels'] = testset1_labelF result2 = create_dataframe(testset2_freq_max1, testset2_freq_max2, testset2_freq_max3, testset2_freq_max4, testset2_freq_max5, 0) result2['labels'] = testset2_labelF # calculating the labels for the testset1,testet2, which is not failed testset1_labelP = np.array([0] * 1800) testset2_labelP = np.array([0] * 800) # creating a dataframe for which bearing is passed result3 = create_dataframe(testset1_freq_max1, testset1_freq_max2, testset1_freq_max3, testset1_freq_max4,
def histogram():
    """Draw a histogram of the data returned by ``ul.create_dataframe``.

    The plot is only drawn on the current matplotlib figure; it is neither
    shown nor saved here.
    """
    plt.hist(ul.create_dataframe())
def main():
    """
    Build the descriptor dataset and split it into train/validation/test sets.

    The function joins the SMILES and InChI compound tables, cleans the
    AID_743255 activity table, merges both on 'CID', shuffles the rows,
    appends the descriptors computed by ``utils.extract_all_descriptors`` and
    writes the result to 'data/descriptor_data.csv'.  It then validates
    ``TARGET_COLUMN`` and separates features from the target for each split.

    @:param: None
    :raises ValueError: if the dataframe or TARGET_COLUMN is None, or if
        TARGET_COLUMN is not a column of the dataframe
    :raises TypeError: if the dataframe is not a pandas DataFrame or
        TARGET_COLUMN is not a string

    NOTE(review): the splits are only bound to local names in the code shown
    here; confirm how they are meant to be consumed.
    """
    # The SMILES and InChI logs of the same material have identical indices
    # Creating and joining the SMILES and InChI dataframes along the same index
    df_compounds_smiles = utils.create_dataframe('data/chemical_notation_data/'
                                                 'compounds_smiles.txt',
                                                 'smiles')
    df_compounds_inchi = utils.create_dataframe('data/chemical_notation_data/'
                                                'compounds_inchi.txt',
                                                'inchi')
    df_compounds = pd.concat([df_compounds_smiles,
                              df_compounds_inchi['INCHI']],
                             axis=1).rename(columns={'ID': 'CID'})

    activity = pd.read_csv('data/activity_data/AID_743255_datatable.csv')
    # Rows labelled 0-4 are discarded before any further processing.
    activity = activity.drop(list(range(5)), axis=0)
    activity = activity.drop(['PUBCHEM_ACTIVITY_URL', 'PUBCHEM_RESULT_TAG',
                              'PUBCHEM_ACTIVITY_SCORE', 'PUBCHEM_SID',
                              'PUBCHEM_ASSAYDATA_COMMENT', 'Potency',
                              'Efficacy', 'Analysis Comment',
                              'Curve_Description', 'Fit_LogAC50',
                              'Fit_HillSlope', 'Fit_R2',
                              'Fit_InfiniteActivity', 'Fit_ZeroActivity',
                              'Fit_CurveClass', 'Excluded_Points',
                              'Compound QC', 'Max_Response',
                              'Activity at 0.457 uM', 'Activity at 2.290 uM',
                              'Activity at 11.40 uM', 'Activity at 57.10 uM',
                              'PUBCHEM_ACTIVITY_OUTCOME'], axis=1)
    activity.rename(columns={'PUBCHEM_CID': 'CID'}, inplace=True)
    # Keep only the first record of every compound id.
    activity = activity.drop_duplicates('CID')

    df_compounds = df_compounds.sort_values(by='CID')
    activity = activity.sort_values(by='CID')
    df = activity.merge(df_compounds)
    df = df.sort_values(by='CID')
    # Shuffle the rows and rebuild a clean 0..n-1 index so the positional
    # join with the descriptor frame below lines up.
    df = df.sample(frac=1).reset_index(drop=True)
    df_descriptor = utils.extract_all_descriptors(df, 'SMILES')
    df = df.join(df_descriptor)
    df.to_csv('data/descriptor_data.csv')

    # Type check inputs for sanity
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3 has no basestring
    if df is None:
        raise ValueError('df is None')
    if not isinstance(df, pd.DataFrame):
        raise TypeError('df is not a dataframe')
    if TARGET_COLUMN is None:
        raise ValueError('target_column is None')
    if not isinstance(TARGET_COLUMN, string_types):
        raise TypeError('target_column is not a string')
    if TARGET_COLUMN not in df.columns:
        raise ValueError('target_column (%s) is not a valid column name'
                         % TARGET_COLUMN)

    # Train, validation and test split
    # 25% is held out for testing; a third of the remainder (25% of the
    # whole) becomes the validation set, leaving about 50% for training.
    df_train, df_test = train_test_split(df, test_size=0.25)
    df_train, df_val = train_test_split(df_train, test_size=0.333333)

    # Remove the classification column from the dataframe
    # (axis passed by keyword: pandas >= 2.0 rejects it positionally)
    x_train = df_train.drop(TARGET_COLUMN, axis=1)
    x_val = df_val.drop(TARGET_COLUMN, axis=1)
    x_test = df_test.drop(TARGET_COLUMN, axis=1)
    y_train = pd.DataFrame(df_train[TARGET_COLUMN])
    y_val = pd.DataFrame(df_val[TARGET_COLUMN])
    y_test = pd.DataFrame(df_test[TARGET_COLUMN])
74: "CDLTASUKIGAP", 75: "CDLTHRUSTING", 76: "CDLTRISTAR", 77: "CDLUNIQUE3RIVER", 78: "CDLUPSIDEGAP2CROWS", 79: "CDLXSIDEGAP3METHODS", 80: '12ema', 81: '26ema', 82: 'upper_band', 83: 'lower_band', 84: '%K', 85: '%D', 86: "label" } SQL = "SELECT * FROM master_record_staging" df_staging = utils.create_dataframe(MYDB, SQL, NAME_DICT, FEATURES_COLS) SQL = "SELECT * FROM master_record" df_master = utils.create_dataframe(MYDB, SQL, NAME_DICT, FEATURES_COLS) continuous_columns = [ 'volume', 'numberOfTrades', 'var_ema', 'var_bollinger', 'var_stoch', 'RSI' ] categorical_columns = [ 'rsi_indicator', 'stoch_indicator', 'CDL2CROWS', 'CDL3BLACKCROWS', 'CDL3INSIDE', 'CDL3LINESTRIKE', 'CDL3OUTSIDE', 'CDL3STARSINSOUTH', 'CDL3WHITESOLDIERS', 'CDLABANDONEDBABY', 'CDLADVANCEBLOCK', 'CDLBELTHOLD', 'CDLBREAKAWAY', 'CDLCLOSINGMARUBOZU', 'CDLCONCEALBABYSWALL', 'CDLCOUNTERATTACK', 'CDLDARKCLOUDCOVER', 'CDLDOJI', 'CDLDOJISTAR', 'CDLDRAGONFLYDOJI', 'CDLENGULFING', 'CDLEVENINGDOJISTAR', 'CDLEVENINGSTAR', 'CDLGAPSIDESIDEWHITE', 'CDLGRAVESTONEDOJI', 'CDLHAMMER', 'CDLHANGINGMAN', 'CDLHARAMI',
def main():
    """
    Module to execute the entire package from data retrieval to model
    performance metrics

    Merges the compound notation and activity data, loads the descriptor
    feature sets from CSV, cleans, scales and reduces them, splits the result
    into train and test sets, pickles the four splits to ``XY_PICKLE``, runs
    the models and finally calls ``post_process.results()``.

    @:param: None
    :raises ValueError: if the dataframe or TARGET_COLUMN is None, or if
        TARGET_COLUMN is not a column of the dataframe
    :raises TypeError: if the dataframe is not a pandas DataFrame or
        TARGET_COLUMN is not a string
    """
    # Importing inhibitor notation data
    # The SMILES and InChI logs of the same material have identical indices
    # Creating and joining the SMILES and InChI dataframes along the same index
    utils.check_files()
    df_compounds_smiles = utils.create_dataframe(
        'data/chemical_notation_'
        'data/compounds_smiles.txt', 'smiles')
    df_compounds_smiles.rename(columns={'ID': 'CID'}, inplace=True)
    df_compounds_smiles.sort_values(by='CID', inplace=True)

    # Importing inhibitor activity data
    activity = pd.read_csv('data/activity_data/AID_743255_datatable.csv')
    activity = utils.clean_activity_dataframe(activity)

    # Merging activity data and compound notation data
    df = activity.merge(df_compounds_smiles)
    df.sort_values(by='CID', inplace=True)
    df.reset_index(drop=True, inplace=True)

    # Drop non-descriptor columns before feature space reduction
    df_target = df.drop(['SMILES', 'CID', 'Phenotype'], axis=1)

    # Importing feature sets
    # NOTE(review): descriptor extraction is not run here; the CSV files are
    # presumably its precomputed output - confirm.
    # Listed in the order in which they are joined.  read_csv with
    # index_col=0 and parse_dates=True reproduces the defaults of
    # DataFrame.from_csv, which was removed in pandas 1.0.
    feature_files = (
        'data/df_kappa.csv',
        'data/df_moe.csv',
        'data/df_constitution.csv',
        'data/df_property.csv',
        'data/df_charge.csv',
        'data/df_estate.csv',
        'data/df_con.csv',
        'data/df_basak.csv',
    )
    feature_frames = [pd.read_csv(path, index_col=0, parse_dates=True)
                      for path in feature_files]

    print("Joining dataframes")
    df_descriptor = feature_frames[0]
    for frame in feature_frames[1:]:
        df_descriptor = df_descriptor.join(frame)
    print("Joining dataframes done")

    print("Checking dataframe for NaN, infinite or too large values")
    df_descriptor = utils.remove_nan_infinite(df_descriptor)

    # Transform all column values to mean 0 and unit variance
    print("Transforming dataframe using mean and variance")
    df_descriptor = utils.transform_dataframe(df_descriptor)
    print("Transforming dataframe using mean and variance done")

    # Feature selection and space reduction
    print("Selecting best features in dataframe")
    df_features = utils.select_features(df_descriptor, df_target)
    print("Selecting best features in dataframe done")

    df = df_features.join(df_target)

    # Data to training task
    # Type check inputs for sanity
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3 has no basestring
    if df is None:
        raise ValueError('df is None')
    if not isinstance(df, pd.DataFrame):
        raise TypeError('df is not a dataframe')
    if TARGET_COLUMN is None:
        raise ValueError('target_column is None')
    if not isinstance(TARGET_COLUMN, string_types):
        raise TypeError('target_column is not a string')
    if TARGET_COLUMN not in df.columns:
        raise ValueError('target_column (%s) is not a valid column name'
                         % TARGET_COLUMN)

    # Train, validation and test split
    # sklearn.cross_validation was removed in scikit-learn 0.20; fall back to
    # it only on releases that predate sklearn.model_selection.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split
    df_train, df_test = train_test_split(df, test_size=0.25)

    # Remove the classification column from the dataframe
    # (axis passed by keyword: pandas >= 2.0 rejects it positionally)
    x_train = df_train.drop(TARGET_COLUMN, axis=1)
    x_test = df_test.drop(TARGET_COLUMN, axis=1)
    y_train = pd.DataFrame(df_train[TARGET_COLUMN])
    y_test = pd.DataFrame(df_test[TARGET_COLUMN])

    # The four splits are dumped back to back into one file; a reader must
    # call pickle.load four times in this same order.
    with open(XY_PICKLE, 'wb') as results:
        for split in (x_train, x_test, y_train, y_test):
            pickle.dump(split, results, pickle.HIGHEST_PROTOCOL)

    models.run_models(x_train, y_train, x_test, y_test)
    post_process.results()