Python create_dataframe Examples, utils.create_dataframe Python Examples

Example #1

0

Show file

def main():
    """Build the digits dataframe, split it, and fit a DigitNeuralNetwork.

    The dataframe is read from ``data/processed_digits.csv`` through
    ``create_dataframe`` and split by ``process_data`` into train,
    validation and test parts, all of which are handed to ``fit``.
    """
    csv_path = 'data/processed_digits.csv'
    splits = process_data(create_dataframe(csv_path))
    X_train, y_train, X_valid, y_valid, X_test, y_test = splits

    network = DigitNeuralNetwork(epochs=100, batch_size=32)
    network.fit(X_train, y_train, X_valid, y_valid, X_test, y_test)

Example #2

0

Show file

File: GMM_test.py Project: xe1gyq/motor-defect-detector

def main():
    """Score every bearing of a user-selected test set with the saved GMM.

    Prompts for the dataset directory and folder name, computes the five
    frequency components with ``cal_max_freq``, loads the model stored in
    ``GMM_all.npy`` and, for each bearing, prints the percentage of the last
    100 predicted labels that equal 2. A bearing is reported as suspected to
    fail when that percentage is at least 50. The last 100 labels of every
    bearing are finally passed to ``plotlabels`` and shown.

    If reading the dataset raises ``IOError`` the error message is printed
    and the function returns without loading the model.
    """
    try:
        # user input for the path of the dataset
        filedir = input("enter the complete directory path ")
        filepath = input("enter the folder name")

        # load the files
        all_files = os.listdir(filedir)
        freq_max1, freq_max2, freq_max3, freq_max4, freq_max5 = cal_max_freq(
            all_files, filepath)
    except IOError:
        print("you have entered either the wrong data directory path or filepath")
        # Without the frequency data nothing below can run; falling through
        # would only fail later on the unbound freq_max*/filepath names.
        return

    # load the model
    filename = "GMM_all.npy"
    # The file holds a pickled Python object wrapped in a 0-d array, so
    # NumPy >= 1.16.3 refuses to load it unless allow_pickle is enabled.
    # NOTE(security): unpickling executes arbitrary code -- only load model
    # files that come from a trusted source.
    model = np.load(filename, allow_pickle=True).item()

    # checking the iteration: the "1st_test/" folder is scored for 8
    # bearings, every other folder for 4
    rhigh = 8 if filepath == "1st_test/" else 4

    print("for the testset", filepath)
    prediction_last_100 = []
    for i in range(rhigh):
        # making the dataframe
        X = create_dataframe(freq_max1, freq_max2, freq_max3, freq_max4,
                             freq_max5, i)
        print("checking for the bearing", i + 1)
        label = model.predict(X)
        last_100 = label[-100:]
        check_two = list(last_100).count(2)
        # Percentage of label 2 over a fixed window of 100 samples
        ratio = (check_two / 100) * 100
        print("prediction", ratio)
        if ratio >= 50:
            print("bearing is suspected to fail")
        else:
            print("bearing is working in normal condition")

        prediction_last_100.append(last_100)

    plotlabels(prediction_last_100)
    plt.show()

Example #3

0

Show file

File: driver.py Project: gitter-badger/USP-inhibition

def main():
    """
    Build the descriptor dataset and split it into train, validation and
    test sets.

    Steps performed:

    1. Read the SMILES and InChI notation files through
       ``utils.create_dataframe`` and join them column-wise, renaming
       ``ID`` to ``CID``.
    2. Read the activity CSV, drop the rows labelled 0-4 and the unused
       PubChem columns, rename ``PUBCHEM_CID`` to ``CID`` and keep only the
       first row of every ``CID``.
    3. Merge activity and notation data, shuffle the rows, append the
       descriptors returned by ``utils.extract_all_descriptors`` and write
       the result to ``data/descriptor_data.csv``.
    4. Validate ``TARGET_COLUMN`` and split the data: 25% test, then one
       third of the remainder as validation.

    :raises ValueError: if the dataframe or ``TARGET_COLUMN`` is None, or
        ``TARGET_COLUMN`` is not a column of the dataframe
    :raises TypeError: if the dataframe is not a ``pd.DataFrame`` or
        ``TARGET_COLUMN`` is not a string
    """
    # The SMILES and InChI logs of the same material have identical indices
    # Creating and joining the SMILES and InChI dataframes along the same index

    df_compounds_smiles = utils.create_dataframe(
        'data/chemical_notation_data/'
        'compounds_smiles.txt', 'smiles')
    df_compounds_inchi = utils.create_dataframe(
        'data/chemical_notation_data/'
        'compounds_inchi.txt', 'inchi')

    df_compounds = pd.concat(
        [df_compounds_smiles, df_compounds_inchi['INCHI']],
        axis=1).rename(columns={'ID': 'CID'})

    activity = pd.read_csv('data/activity_data/AID_743255_datatable.csv')
    # Drop the rows labelled 0-4 in a single call.
    # NOTE(review): presumably these are non-data header rows of the
    # PubChem export -- confirm against the CSV.
    activity = activity.drop(list(range(5)), axis=0)
    activity = activity.drop([
        'PUBCHEM_ACTIVITY_URL', 'PUBCHEM_RESULT_TAG', 'PUBCHEM_ACTIVITY_SCORE',
        'PUBCHEM_SID', 'PUBCHEM_ASSAYDATA_COMMENT', 'Potency', 'Efficacy',
        'Analysis Comment', 'Curve_Description', 'Fit_LogAC50',
        'Fit_HillSlope', 'Fit_R2', 'Fit_InfiniteActivity', 'Fit_ZeroActivity',
        'Fit_CurveClass', 'Excluded_Points', 'Compound QC', 'Max_Response',
        'Activity at 0.457 uM', 'Activity at 2.290 uM', 'Activity at 11.40 uM',
        'Activity at 57.10 uM', 'PUBCHEM_ACTIVITY_OUTCOME'
    ], axis=1)
    activity.rename(columns={'PUBCHEM_CID': 'CID'}, inplace=True)
    # Keep only the first occurrence of every CID
    activity = activity[~activity.duplicated('CID')]
    df_compounds = df_compounds.sort_values(by='CID')
    activity = activity.sort_values(by='CID')
    df = activity.merge(df_compounds)
    df = df.sort_values(by='CID')
    # Shuffle the rows and renumber them from 0
    df = df.sample(frac=1).reset_index(drop=True)
    df_descriptor = utils.extract_all_descriptors(df, 'SMILES')
    df = df.join(df_descriptor)
    df.to_csv('data/descriptor_data.csv')

    # ``basestring`` only exists on Python 2; fall back to ``str`` so the
    # type check below also works on Python 3.
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str

    # Type check inputs for sanity
    if df is None:
        raise ValueError('df is None')
    if not isinstance(df, pd.DataFrame):
        raise TypeError('df is not a dataframe')
    if TARGET_COLUMN is None:
        raise ValueError('target_column is None')
    if not isinstance(TARGET_COLUMN, string_types):
        raise TypeError('target_column is not a string')
    if TARGET_COLUMN not in df.columns:
        raise ValueError('target_column (%s) is not a valid column name' %
                         TARGET_COLUMN)

    # Train, validation and test split
    df_train, df_test = train_test_split(df, test_size=0.25)
    df_train, df_val = train_test_split(df_train, test_size=0.333333)

    # Remove the classification column from the dataframe.
    # The axis is passed by keyword: pandas 2.0 removed the positional form.
    x_train = df_train.drop(TARGET_COLUMN, axis=1)
    x_val = df_val.drop(TARGET_COLUMN, axis=1)
    x_test = df_test.drop(TARGET_COLUMN, axis=1)
    y_train = pd.DataFrame(df_train[TARGET_COLUMN])
    y_val = pd.DataFrame(df_val[TARGET_COLUMN])
    y_test = pd.DataFrame(df_test[TARGET_COLUMN])

Example #4

0

Show file

File: Test_GMM.py Project: intel-iot-devkit/motor-defect-detector-cpp

    for i in range(no_of_bearings):
        bearing_list.append(i)

    testing_bearings = list(np.setdiff1d(bearing_list, training_bearings))

    # Load the model
    filename = "GMM_all.npy"
    model = np.load(filename).item()

    print("Testing ", dbname, "...")

    prediction_last_100 = []

    for k in testing_bearings:
        x = create_dataframe(frequency_component1, frequency_component2,
                             frequency_component3, frequency_component4,
                             frequency_component5, k)

        print("\nTesting bearing ", k + 1)
        label = model.predict(x)

        check_two = list(label[-100:]).count(2)
        ratio = (check_two / 100) * 100

        json_body = [{
            "measurement": "GMM",
            "tags": {
                "type": "B" + str(k + 1),
            },
            "fields": {
                "failure_ratio": ratio,

Example #5

0

Show file

File: Train_logistic_regression.py Project: intel-iot-devkit/motor-defect-detector-cpp

# Names of the five per-component dataframes of test set 2
testset2_freq_dataframes = [
    'testset2_freq_comp1', 'testset2_freq_comp2', 'testset2_freq_comp3',
    'testset2_freq_comp4', 'testset2_freq_comp5'
]
testset2_length = len(testset2_temporary_df) // 5
# Split the temporary dataframe into five consecutive row slices of
# testset2_length rows each and bind them directly, instead of building the
# assignments as strings and running them through exec().
(testset2_freq_comp1, testset2_freq_comp2, testset2_freq_comp3,
 testset2_freq_comp4, testset2_freq_comp5) = [
    testset2_temporary_df.iloc[
        part * testset2_length:(part + 1) * testset2_length, :]
    for part in range(len(testset2_freq_dataframes))
]

# Labelling the bearings which are failed
testset1_labelF = cal_Labels(testset1_length)
testset2_labelF = cal_Labels(testset2_length)

# NOTE(review): the last argument of create_dataframe presumably selects the
# bearing (zero-based) -- confirm against its definition.
result1 = create_dataframe(testset1_freq_comp1, testset1_freq_comp2,
                           testset1_freq_comp3, testset1_freq_comp4,
                           testset1_freq_comp5, 7)
result1['labels'] = testset1_labelF

result2 = create_dataframe(testset2_freq_comp1, testset2_freq_comp2,
                           testset2_freq_comp3, testset2_freq_comp4,
                           testset2_freq_comp5, 0)
result2['labels'] = testset2_labelF

# Labelling the bearings which are passed: all-zero label vectors.
# NOTE(review): 1800 and 800 are hard-coded lengths, presumably the row
# counts of the two test sets -- confirm.
testset1_labelP = np.array([0] * 1800)
testset2_labelP = np.array([0] * 800)

result3 = create_dataframe(testset1_freq_comp1, testset1_freq_comp2,
                           testset1_freq_comp3, testset1_freq_comp4,
                           testset1_freq_comp5, 2)

Example #6

0

Show file

File: LogisticRegressionTraining.py Project: riddhikt/PredictiveMaintenance

except IOError:
    print(
        "you have entered either the wrong data directory path for either testset1 or testset2"
    )
# Compute the five frequency results of each test set with cal_max_freq.
# NOTE(review): all_files_testset* and path_testset* are assigned outside
# this excerpt (presumably in the try block above); if that block took the
# IOError branch they may be unbound here -- confirm.
testset1_freq_max1, testset1_freq_max2, testset1_freq_max3, testset1_freq_max4, testset1_freq_max5 = cal_max_freq(
    all_files_testset1, path_testset1)
testset2_freq_max1, testset2_freq_max2, testset2_freq_max3, testset2_freq_max4, testset2_freq_max5 = cal_max_freq(
    all_files_testset2, path_testset2)
# Calculating the labels for the failed bearing of each test set; the label
# vector is sized by the number of files in that test set.
testset1_labelF = cal_Labels(len(all_files_testset1))
testset2_labelF = cal_Labels(len(all_files_testset2))

# Creating a dataframe for the bearing that has failed in each test set.
# NOTE(review): the last argument presumably selects the bearing
# (zero-based) -- confirm against create_dataframe.
result1 = create_dataframe(testset1_freq_max1, testset1_freq_max2,
                           testset1_freq_max3, testset1_freq_max4,
                           testset1_freq_max5, 1)
result1['labels'] = testset1_labelF

result2 = create_dataframe(testset2_freq_max1, testset2_freq_max2,
                           testset2_freq_max3, testset2_freq_max4,
                           testset2_freq_max5, 0)
result2['labels'] = testset2_labelF

# Calculating the labels for testset1 and testset2 bearings that have not
# failed: all-zero vectors.
# NOTE(review): 1800 and 800 are hard-coded lengths, presumably the row
# counts of the two test sets -- confirm.
testset1_labelP = np.array([0] * 1800)
testset2_labelP = np.array([0] * 800)

# creating a dataframe for which bearing is passed
result3 = create_dataframe(testset1_freq_max1, testset1_freq_max2,
                           testset1_freq_max3, testset1_freq_max4,

Example #7

0

Show file

File: histogram.py Project: ViktorMalesevic/Ecole42_logistic_regression

def histogram():
    """Draw a histogram of the data returned by ``ul.create_dataframe``."""
    plt.hist(ul.create_dataframe())

Example #8

0

Show file

File: driver.py Project: gitter-badger/USP-inhibition

def main():
    """
    Build the descriptor dataset and split it into train, validation and
    test sets.

    The SMILES and InChI notation files are read through
    ``utils.create_dataframe`` and joined column-wise. The activity CSV is
    cleaned (rows labelled 0-4 and unused PubChem columns dropped, duplicate
    ``CID`` rows removed) and merged with the notation data. The merged
    rows are shuffled, extended with the descriptors returned by
    ``utils.extract_all_descriptors`` and written to
    ``data/descriptor_data.csv``. After ``TARGET_COLUMN`` is validated the
    data is split: 25% test, then one third of the remainder as validation.

    :raises ValueError: if the dataframe or ``TARGET_COLUMN`` is None, or
        ``TARGET_COLUMN`` is not a column of the dataframe
    :raises TypeError: if the dataframe is not a ``pd.DataFrame`` or
        ``TARGET_COLUMN`` is not a string
    """
    # The SMILES and InChI logs of the same material have identical indices
    # Creating and joining the SMILES and InChI dataframes along the same index

    df_compounds_smiles = utils.create_dataframe('data/chemical_notation_data/'
                                                 'compounds_smiles.txt', 'smiles')
    df_compounds_inchi = utils.create_dataframe('data/chemical_notation_data/'
                                                'compounds_inchi.txt', 'inchi')

    df_compounds = pd.concat([df_compounds_smiles, df_compounds_inchi['INCHI']],
                             axis=1).rename(columns={'ID': 'CID'})

    activity = pd.read_csv('data/activity_data/AID_743255_datatable.csv')
    # Drop the rows labelled 0-4 in a single call.
    # NOTE(review): presumably these are non-data header rows of the
    # PubChem export -- confirm against the CSV.
    activity = activity.drop(list(range(5)), axis=0)
    unused_columns = ['PUBCHEM_ACTIVITY_URL', 'PUBCHEM_RESULT_TAG',
                      'PUBCHEM_ACTIVITY_SCORE', 'PUBCHEM_SID',
                      'PUBCHEM_ASSAYDATA_COMMENT', 'Potency',
                      'Efficacy', 'Analysis Comment',
                      'Curve_Description', 'Fit_LogAC50',
                      'Fit_HillSlope', 'Fit_R2', 'Fit_InfiniteActivity',
                      'Fit_ZeroActivity', 'Fit_CurveClass',
                      'Excluded_Points', 'Compound QC', 'Max_Response',
                      'Activity at 0.457 uM', 'Activity at 2.290 uM',
                      'Activity at 11.40 uM', 'Activity at 57.10 uM',
                      'PUBCHEM_ACTIVITY_OUTCOME']
    activity = activity.drop(unused_columns, axis=1)
    activity.rename(columns={'PUBCHEM_CID': 'CID'}, inplace=True)
    # Keep only the first occurrence of every CID
    activity = activity[~activity.duplicated('CID')]
    df_compounds = df_compounds.sort_values(by='CID')
    activity = activity.sort_values(by='CID')
    df = activity.merge(df_compounds)
    df = df.sort_values(by='CID')
    # Shuffle the rows and renumber them from 0
    df = df.sample(frac=1).reset_index(drop=True)
    df_descriptor = utils.extract_all_descriptors(df, 'SMILES')
    df = df.join(df_descriptor)
    df.to_csv('data/descriptor_data.csv')

    # ``basestring`` only exists on Python 2; fall back to ``str`` so the
    # type check below also works on Python 3.
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str

    # Type check inputs for sanity
    if df is None:
        raise ValueError('df is None')
    if not isinstance(df, pd.DataFrame):
        raise TypeError('df is not a dataframe')
    if TARGET_COLUMN is None:
        raise ValueError('target_column is None')
    if not isinstance(TARGET_COLUMN, string_types):
        raise TypeError('target_column is not a string')
    if TARGET_COLUMN not in df.columns:
        raise ValueError('target_column (%s) is not a valid column name'
                         % TARGET_COLUMN)

    # Train, validation and test split
    df_train, df_test = train_test_split(df, test_size=0.25)
    df_train, df_val = train_test_split(df_train, test_size=0.333333)

    # Remove the classification column from the dataframe.
    # The axis is passed by keyword: pandas 2.0 removed the positional form.
    x_train = df_train.drop(TARGET_COLUMN, axis=1)
    x_val = df_val.drop(TARGET_COLUMN, axis=1)
    x_test = df_test.drop(TARGET_COLUMN, axis=1)
    y_train = pd.DataFrame(df_train[TARGET_COLUMN])
    y_val = pd.DataFrame(df_val[TARGET_COLUMN])
    y_test = pd.DataFrame(df_test[TARGET_COLUMN])

Example #9

0

Show file

File: retraining.py Project: elBichon/midas_project.github.io

     74: "CDLTASUKIGAP",
     75: "CDLTHRUSTING",
     76: "CDLTRISTAR",
     77: "CDLUNIQUE3RIVER",
     78: "CDLUPSIDEGAP2CROWS",
     79: "CDLXSIDEGAP3METHODS",
     80: '12ema',
     81: '26ema',
     82: 'upper_band',
     83: 'lower_band',
     84: '%K',
     85: '%D',
     86: "label"
 }
 SQL = "SELECT * FROM master_record_staging"
 df_staging = utils.create_dataframe(MYDB, SQL, NAME_DICT, FEATURES_COLS)
 SQL = "SELECT * FROM master_record"
 df_master = utils.create_dataframe(MYDB, SQL, NAME_DICT, FEATURES_COLS)
 continuous_columns = [
     'volume', 'numberOfTrades', 'var_ema', 'var_bollinger', 'var_stoch',
     'RSI'
 ]
 categorical_columns = [
     'rsi_indicator', 'stoch_indicator', 'CDL2CROWS', 'CDL3BLACKCROWS',
     'CDL3INSIDE', 'CDL3LINESTRIKE', 'CDL3OUTSIDE', 'CDL3STARSINSOUTH',
     'CDL3WHITESOLDIERS', 'CDLABANDONEDBABY', 'CDLADVANCEBLOCK',
     'CDLBELTHOLD', 'CDLBREAKAWAY', 'CDLCLOSINGMARUBOZU',
     'CDLCONCEALBABYSWALL', 'CDLCOUNTERATTACK', 'CDLDARKCLOUDCOVER',
     'CDLDOJI', 'CDLDOJISTAR', 'CDLDRAGONFLYDOJI', 'CDLENGULFING',
     'CDLEVENINGDOJISTAR', 'CDLEVENINGSTAR', 'CDLGAPSIDESIDEWHITE',
     'CDLGRAVESTONEDOJI', 'CDLHAMMER', 'CDLHANGINGMAN', 'CDLHARAMI',

Example #10

0

Show file

def main():
    """
    Execute the package from data retrieval to model performance metrics.

    Reads the SMILES notation data and the activity data, merges them on
    ``CID``, loads the pre-computed descriptor feature sets from
    ``data/df_*.csv``, cleans, scales and reduces them through the ``utils``
    helpers, splits the result 75/25 into train and test sets, pickles the
    four splits to ``XY_PICKLE``, runs ``models.run_models`` and finally
    calls ``post_process.results``.

    :raises ValueError: if the dataframe or ``TARGET_COLUMN`` is None, or
        ``TARGET_COLUMN`` is not a column of the dataframe
    :raises TypeError: if the dataframe is not a ``pd.DataFrame`` or
        ``TARGET_COLUMN`` is not a string
    """
    # scikit-learn 0.20 removed ``sklearn.cross_validation``; prefer its
    # replacement and fall back for installations older than 0.18.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split

    # ``basestring`` only exists on Python 2; fall back to ``str`` so the
    # type check further down also works on Python 3.
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str

    def _read_feature_set(name):
        # pandas 1.0 removed DataFrame.from_csv; this read_csv call is the
        # replacement given in its deprecation notice.
        return pd.read_csv('data/df_%s.csv' % name, index_col=0,
                           parse_dates=True)

    # Importing inhibitor notation data
    # The SMILES and InChI logs of the same material have identical indices
    # Creating and joining the SMILES and InChI dataframes along the same index

    utils.check_files()
    df_compounds_smiles = utils.create_dataframe(
        'data/chemical_notation_'
        'data/compounds_smiles.txt', 'smiles')
    df_compounds_smiles.rename(columns={'ID': 'CID'}, inplace=True)
    df_compounds_smiles.sort_values(by='CID', inplace=True)

    # Importing inhibitor activity data
    activity = pd.read_csv('data/activity_data/AID_743255_datatable.csv')
    activity = utils.clean_activity_dataframe(activity)

    # Merging activity data and compound notation data
    df = activity.merge(df_compounds_smiles)
    df.sort_values(by='CID', inplace=True)
    df.reset_index(drop=True, inplace=True)

    # Drop non-descriptor columns before feature space reduction
    df_target = df.drop(['SMILES', 'CID', 'Phenotype'], axis=1)

    # Extracting molecular descriptors for all compounds
    # print("Sending data for descriptor calculation")
    # utils.extract_all_descriptors(df, 'SMILES')

    # Importing feature sets (all files are read before any join starts)
    feature_sets = {
        name: _read_feature_set(name)
        for name in ('charge', 'basak', 'con', 'estate', 'constitution',
                     'property', 'kappa', 'moe')
    }

    print("Joining dataframes")
    df_descriptor = feature_sets['kappa']
    for name in ('moe', 'constitution', 'property', 'charge', 'estate',
                 'con', 'basak'):
        df_descriptor = df_descriptor.join(feature_sets[name])
    print("Joining dataframes done")

    print("Checking dataframe for NaN, infinite or too large values")
    df_descriptor = utils.remove_nan_infinite(df_descriptor)

    # Transform all column values to mean 0 and unit variance
    print("Transforming dataframe using mean and variance")
    df_descriptor = utils.transform_dataframe(df_descriptor)
    print("Transforming dataframe using mean and variance done")

    # Feature selection and space reduction
    print("Selecting best features in dataframe")
    df_features = utils.select_features(df_descriptor, df_target)
    print("Selecting best features in dataframe done")

    df = df_features.join(df_target)

    # Data to training task
    # Type check inputs for sanity
    if df is None:
        raise ValueError('df is None')
    if not isinstance(df, pd.DataFrame):
        raise TypeError('df is not a dataframe')
    if TARGET_COLUMN is None:
        raise ValueError('target_column is None')
    if not isinstance(TARGET_COLUMN, string_types):
        raise TypeError('target_column is not a string')
    if TARGET_COLUMN not in df.columns:
        raise ValueError('target_column (%s) is not a valid column name' %
                         TARGET_COLUMN)

    # Train and test split
    df_train, df_test = train_test_split(df, test_size=0.25)

    # Remove the classification column from the dataframe.
    # The axis is passed by keyword: pandas 2.0 removed the positional form.
    x_train = df_train.drop(TARGET_COLUMN, axis=1)
    x_test = df_test.drop(TARGET_COLUMN, axis=1)
    y_train = pd.DataFrame(df_train[TARGET_COLUMN])
    y_test = pd.DataFrame(df_test[TARGET_COLUMN])

    # The four splits are written as consecutive pickles in one file, in
    # this order; readers must load them back in the same order.
    with open(XY_PICKLE, 'wb') as results:
        for split in (x_train, x_test, y_train, y_test):
            pickle.dump(split, results, pickle.HIGHEST_PROTOCOL)

    models.run_models(x_train, y_train, x_test, y_test)

    post_process.results()