Example #1
 def getValidationDataset(self):
     print "Reading the valid pairs"
     valid = data_io.read_valid_pairs()
     valid2 = data_io.read_valid_info()
     valid["A type"] = valid2["A type"]
     valid["B type"] = valid2["B type"]
     return valid
Example #2
def ext():
    # Read the pairs
    print "Read Pairs"
    print "Read Train"
    train = d.read_train_pairs()
    print "Read Valid"
    valid = d.read_valid_pairs()
    print "Read Sup1"
    sup1 = d.read_sup1_train_pairs()
    print "Read Sup2"
    sup2 = d.read_sup2_train_pairs()
    print "Read Sup3"
    sup3 = d.read_sup3_train_pairs()

    # Get the feature extractor
    combined = feat.feature_extractor()

    # Extract the features
    print('Extract the features')
    print("Extract Train")
    train_att = combined.fit_transform(train)
    print("Extract Valid")
    valid_att = combined.fit_transform(valid)
    print("Extract Sup1")
    sup1_att = combined.fit_transform(sup1)
    print("Extract Sup2")
    sup2_att = combined.fit_transform(sup2)
    print("Extract Sup3")
    sup3_att = combined.fit_transform(sup3)

    print("Join")
    total_new_att = np.vstack((train_att, valid_att, sup1_att, sup2_att, sup3_att))

    # Save extracted data
    np.save('total_new_att.npy', total_new_att)
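
A quick usage note for the file written above: the stacked matrix can be reloaded with np.load (the row-order comment simply restates how ext() built it).

import numpy as np

# Reload the combined feature matrix written by ext().
total_new_att = np.load('total_new_att.npy')
# Rows stack train, valid, sup1, sup2 and sup3, in that order.
print(total_new_att.shape)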
Example #3
def extrair_tudo():
    combined = new_features1()

    print "Train"
    train = d.read_train_pairs()
    train_att = combined.fit_transform(train)
    np.save(train_att, open("train_att.npy", "wb"))


    print "Train1"
    valid = d.read_valid_pairs()
    valid_att = combined.fit_transform(valid)
    np.save(valid_att, open("valid_att.npy", "wb"))

    print "Train2"
    sup1 = d.read_sup1_train_pairs()
    sup1_att = combined.fit_transform(sup1)
    np.save(sup1_att, open("sup1_att.npy", "wb"))

    print "Train3"
    sup2 = d.read_sup2_train_pairs()
    sup2_att = combined.fit_transform(sup2)
    np.save(sup1_att, open("sup2_att.npy", "wb"))

    print "Train4"
    sup3 = d.read_sup3_train_pairs()
    sup3_att = combined.fit_transform(sup3)
    np.save(sup1_att, open("sup3_att.npy", "wb"))
Example #4
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'add.noise.csv')):
        print('Feature file already exists - not overwriting')
        return

    features = [('Additive noise model AB', ['A','B'], f.add_noise_model_AB),
                ('Additive noise model BA', ['A','B'], f.add_noise_model_BA)]
                
    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('add_noise', all_features, feature_names)
Example #5
 def getDataset(self):
     if self.getTrain:
         readData = data_io.read_train_pairs()
         readData2 = data_io.read_train_info()
     else:
         readData = data_io.read_valid_pairs()
         readData2 = data_io.read_valid_info()
     readData["A type"] = readData2["A type"]
     readData["B type"] = readData2["B type"]
     return readData
Example #6
 def getDataset(self):
     if self.getTrain:
         readData = data_io.read_train_pairs()
         readData2 = data_io.read_train_info()
     else:
         readData = data_io.read_valid_pairs()
         readData2 = data_io.read_valid_info()
     readData["A type"] = readData2["A type"]
     readData["B type"] = readData2["B type"]
     return readData
Example #7
def main():
    print("Reading the valid pairs")
    valid = data_io.read_valid_pairs()

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = classifier.predict(valid)
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)
Example #8
def main():
    print("Reading the valid pairs") 
    valid = data_io.read_valid_pairs()

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions") 
    predictions = classifier.predict(valid)
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)
Example #9
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'reasonable_features.csv')):
        print('Feature file already exists - not overwriting')
        return

    features = [('A: Normalized Entropy', 'A', f.normalized_entropy),
                ('B: Normalized Entropy', 'B', f.normalized_entropy),
                ('Pearson R', ['A','B'], f.correlation),
                ('Pearson R Magnitude', 'derived', 'abs(output[key][2])'),  # Apologies for this weird feature definition mechanism - it is a quick hack to prevent duplicated computation
                ('Entropy Difference', 'derived', 'output[key][0] - output[key][1]'),
                ('Entropy Ratio', 'derived', 'output[key][0] / output[key][1] if not output[key][1] == 0 else output[key][0] / 0.000001'),
                ('Spearman rank correlation', ['A','B'], f.rcorrelation),
                ('Spearman rank magnitude', 'derived', 'abs(output[key][6])'),
                ('Kurtosis A', 'A', f.fkurtosis),
                ('Kurtosis B', 'B', f.fkurtosis),
                ('Kurtosis difference', 'derived', 'output[key][8] - output[key][9]'),
                ('Kurtosis ratio', 'derived', 'output[key][8] / output[key][9] if not output[key][9] == 0 else output[key][8] / 0.000001'),
                ('Unique ratio A', 'A', f.unique_ratio),
                ('Unique ratio B', 'B', f.unique_ratio),
                ('Skew A', 'A', f.fskew),
                ('Skew B', 'B', f.fskew),
                ('Skew difference', 'derived', 'output[key][14] - output[key][15]'),
                ('Skew ratio', 'derived', 'output[key][14] / output[key][15] if not output[key][15] == 0 else output[key][14] / 0.000001'),
                ('Pearson - Spearman', 'derived', 'output[key][2] - output[key][6]'),
                ('Abs Pearson - Spearman', 'derived', 'output[key][3] - output[key][7]'),
                ('Pearson / Spearman', 'derived', 'output[key][2] / output[key][6] if not output[key][6] == 0 else output[key][2] / 0.000001')]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('reasonable_features', all_features, feature_names)
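
The 'derived' entries are string expressions that reuse values already computed for the same row, which is what the inline comment apologizes for. f.apply_features itself is not shown on this page; a minimal sketch of the mechanism the expressions imply (the function name, the 'A'/'B' column access, and the dict layout are assumptions):

def apply_features_sketch(frame, features):
    # output[key] collects this row's feature values in list order, so a
    # 'derived' expression such as 'output[key][0] - output[key][1]' can
    # reuse earlier columns instead of recomputing them.
    output = {}
    for key, row in frame.iterrows():
        output[key] = []
        for name, source, extractor in features:
            if source == 'derived':
                value = eval(extractor)  # expression sees `output` and `key`
            elif isinstance(source, list):
                value = extractor(row['A'], row['B'])
            else:
                value = extractor(row[source])
            output[key].append(value)
    return output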
Example #10
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'high_order_moments.csv')):
        print('Feature file already exists - not overwriting')
        return

    features = [('Moment 5 A', 'A', f.standard_moment_5),
                ('Moment 5 B', 'B', f.standard_moment_5),
                ('Moment 5 diff', 'derived', 'output[key][0] - output[key][1]'),
                ('Moment 5 ratio', 'derived', 'output[key][0] / output[key][1] if not output[key][1] == 0 else output[key][0] / 0.000001'),
                ('Moment 6 A', 'A', f.standard_moment_6),
                ('Moment 6 B', 'B', f.standard_moment_6),
                ('Moment 6 diff', 'derived', 'output[key][4] - output[key][5]'),
                ('Moment 6 ratio', 'derived', 'output[key][4] / output[key][5] if not output[key][5] == 0 else output[key][4] / 0.000001'),
                ('Moment 7 A', 'A', f.standard_moment_7),
                ('Moment 7 B', 'B', f.standard_moment_7),
                ('Moment 7 diff', 'derived', 'output[key][8] - output[key][9]'),
                ('Moment 7 ratio', 'derived', 'output[key][8] / output[key][9] if not output[key][9] == 0 else output[key][8] / 0.000001'),
                ('Moment 8 A', 'A', f.standard_moment_8),
                ('Moment 8 B', 'B', f.standard_moment_8),
                ('Moment 8 diff', 'derived', 'output[key][12] - output[key][13]'),
                ('Moment 8 ratio', 'derived', 'output[key][12] / output[key][13] if not output[key][13] == 0 else output[key][12] / 0.000001'),
                ('Moment 9 A', 'A', f.standard_moment_9),
                ('Moment 9 B', 'B', f.standard_moment_9),
                ('Moment 9 diff', 'derived', 'output[key][16] - output[key][17]'),
                ('Moment 9 ratio', 'derived', 'output[key][16] / output[key][17] if not output[key][17] == 0 else output[key][16] / 0.000001')]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('high_order_moments', all_features, feature_names)
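
f.standard_moment_5 through f.standard_moment_9 are not defined on this page. Assuming they compute the usual k-th standardized moment E[(x - mu)^k] / sigma^k, a generic stand-in could be:

import numpy as np

def standard_moment(x, k):
    # k-th standardized central moment; the zero-sigma guard mirrors the
    # small-denominator guards in the 'ratio' features above.
    x = np.asarray(x, dtype=float)
    mu = x.mean()
    sigma = x.std()
    return np.mean((x - mu) ** k) / sigma ** k if sigma > 0 else 0.0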
Example #11
def main():
    print("Reading the valid pairs") 
    valid = data_io.read_valid_pairs()
    valid_info = data_io.read_valid_info()
    valid = pd.concat([valid, valid_info], axis=1)
    valid = train.get_types(valid)

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions") 
    predictions = classifier.predict(valid)
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions, fn)
Example #12
def main():
    print("Reading the valid pairs")
    valid = data_io.read_valid_pairs()
    features = fe.feature_extractor()
    print("Transforming features")
    trans_valid = features.fit_transform(valid)
    trans_valid = np.nan_to_num(trans_valid)

    print("Saving Valid Features")
    data_io.save_valid_features(trans_valid)

    print("Loading the classifier")
    #(both_classifier, A_classifier, B_classifier, none_classifier) = data_io.load_model()
    classifier = data_io.load_model()

    print("Making predictions")
    valid_info = data_io.read_valid_info()
    predictions = list()
    curr_pred = None
    """
    for i in range(len(trans_valid)):
      
      if valid_info["A type"][i] == "Numerical" and valid_info["B type"][i] == "Numerical":
        curr_pred = both_classifier.predict_proba(trans_valid[i, :])
      
      elif valid_info["A type"][i] == "Numerical" and valid_info["B type"][i] != "Numerical":
        curr_pred = A_classifier.predict_proba(trans_valid[i, :])
      
      elif valid_info["A type"][i] != "Numerical" and valid_info["B type"][i] == "Numerical":
        curr_pred = B_classifier.predict_proba(trans_valid[i, :])
     
      else:
        curr_pred = none_classifier.predict_proba(trans_valid[i, :])

      predictions.append(curr_pred[0][2] - curr_pred[0][0])
    """

    orig_predictions = classifier.predict_proba(trans_valid)
    predictions = orig_predictions[:, 2] - orig_predictions[:, 0]
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)
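
The orig_predictions[:, 2] - orig_predictions[:, 0] arithmetic assumes three classes whose sorted order puts the reverse direction in column 0 and the forward direction in column 2. scikit-learn orders predict_proba columns by classifier.classes_, so the lookup can be made explicit; a sketch, assuming labels -1, 0, 1 and the classifier and trans_valid from the example:

import numpy as np

# predict_proba columns follow classifier.classes_ (sorted label order),
# so resolve the two directional labels instead of hard-coding 0 and 2.
proba = classifier.predict_proba(trans_valid)
col_fwd = int(np.where(classifier.classes_ == 1)[0][0])   # A causes B
col_rev = int(np.where(classifier.classes_ == -1)[0][0])  # B causes A
predictions = proba[:, col_fwd] - proba[:, col_rev]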
Example #13
def main():
    print("Reading the valid pairs") 
    valid = data_io.read_valid_pairs()
    features = fe.feature_extractor()
    print("Transforming features")
    trans_valid = features.fit_transform(valid)
    trans_valid = np.nan_to_num(trans_valid)

    print("Saving Valid Features")
    data_io.save_valid_features(trans_valid)

    print("Loading the classifier")
    #(both_classifier, A_classifier, B_classifier, none_classifier) = data_io.load_model()
    classifier = data_io.load_model()

    print("Making predictions")
    valid_info = data_io.read_valid_info() 
    predictions = list()
    curr_pred = None
    """
    for i in range(len(trans_valid)):
      
      if valid_info["A type"][i] == "Numerical" and valid_info["B type"][i] == "Numerical":
        curr_pred = both_classifier.predict_proba(trans_valid[i, :])
      
      elif valid_info["A type"][i] == "Numerical" and valid_info["B type"][i] != "Numerical":
        curr_pred = A_classifier.predict_proba(trans_valid[i, :])
      
      elif valid_info["A type"][i] != "Numerical" and valid_info["B type"][i] == "Numerical":
        curr_pred = B_classifier.predict_proba(trans_valid[i, :])
     
      else:
        curr_pred = none_classifier.predict_proba(trans_valid[i, :])

      predictions.append(curr_pred[0][2] - curr_pred[0][0])
    """

    orig_predictions = classifier.predict_proba(trans_valid)
    predictions = orig_predictions[:, 2] - orig_predictions[:, 0]
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)
Example #14
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'icgi.csv')):
        print('Feature file already exists - not overwriting')
        return

    features = [('ICGI entropy AB', ['A','B'], f.icgi_entropy_AB),
                ('ICGI entropy BA', ['A','B'], f.icgi_entropy_BA),
                ('ICGI entropy diff', 'derived', 'output[key][0] - output[key][1]'),
                ('ICGI slope AB', ['A','B'], f.icgi_slope_AB),
                ('ICGI slope BA', ['A','B'], f.icgi_slope_BA),
                ('ICGI slope diff', 'derived', 'output[key][3] - output[key][4]')]#,
                #('ICGI entropy AB PIT', ['A','B'], f.icgi_entropy_AB_PIT),
                #('ICGI entropy BA PIT', ['A','B'], f.icgi_entropy_BA_PIT),
                #('ICGI entropy diff PIT', 'derived', 'output[key][6] - output[key][7]'),
                #('ICGI slope AB PIT', ['A','B'], f.icgi_slope_AB_PIT),
                #('ICGI slope BA PIT', ['A','B'], f.icgi_slope_BA_PIT),
                #('ICGI slope diff PIT', 'derived', 'output[key][9] - output[key][10]')]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('icgi', all_features, feature_names)
Example #15
def main():
    print("Reading the valid pairs") 
    valid = data_io.read_valid_pairs()
    features = fe.feature_extractor()
    print("Transforming features")
    trans_valid = features.fit_transform(valid)
    trans_valid = np.nan_to_num(trans_valid)

    print("Saving Valid Features")
    data_io.save_features(trans_valid)

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions") 
    orig_predictions = classifier.predict_proba(trans_valid)
    predictions = orig_predictions[:, 2] - orig_predictions[:, 0]
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)
Example #16
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'unreasonable_features.csv')):
        print('Feature file already exists - not overwriting')
        return
        
    features = [('Number of Samples', 'A', len),
                ('Max A', 'A', max),
                ('Max B', 'B', max),
                ('Min A', 'A', min),
                ('Min B', 'B', min),
                ('Mean A', 'A', f.mean),
                ('Mean B', 'B', f.mean),
                ('Median A', 'A', f.median),
                ('Median B', 'B', f.median),
                ('Sd A', 'A', f.sd),
                ('Sd B', 'B', f.sd)]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('unreasonable_features', all_features, feature_names)
Example #17
def main():
    extractor = feature_extractor()
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = extractor.fit_transform(train[:])
    
    print("Reading in the ensemble training data")
    ensemble_train = data_io.read_ensemble_train_pairs()

    print("Extracting features from ensemble training data")
    ensemble_train_features = extractor.fit_transform(ensemble_train[:])
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = extractor.fit_transform(valid[:])
    
    all_features = np.concatenate((train_features, ensemble_train_features, valid_features))
    
    print("Concatenating names")
    train_names = [train.iloc[i].name for i in range(len(train))]
    ensemble_train_names = [ensemble_train.iloc[i].name for i in range(len(ensemble_train))]
    valid_names = [valid.iloc[i].name for i in range(len(valid))]
    all_names = train_names + ensemble_train_names + valid_names
    
    print("Writing feature file")
    feature_names = ['Number of Samples',
                     'A: Number of Unique Samples',
                     'B: Number of Unique Samples',
                     'A: Normalized Entropy',
                     'B: Normalized Entropy',
                     'Pearson R',
                     'Pearson R Magnitude',
                     'Entropy Difference']
    data_io.write_real_features('benchmark_features', all_names, all_features, feature_names)
Example #18
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'corrs.csv')):
        print('Feature file already exists - not overwriting')
        return

    features = [('Kendall tau', ['A','B'], f.kendall),
                ('Kendall tau p', ['A','B'], f.kendall_p),
                ('Mann Whitney', ['A','B'], f.mannwhitney),
                ('Mann Whitney p', ['A','B'], f.mannwhitney_p),
                #('Wilcoxon', ['A','B'], f.wilcoxon),
                #('Wilcoxon p', ['A','B'], f.wilcoxon_p),
                ('Kruskal', ['A','B'], f.kruskal),
                ('Kruskal p', ['A','B'], f.kruskal_p),
                ]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('corrs', all_features, feature_names)
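
The f.kendall, f.mannwhitney and f.kruskal helpers are not shown here. Plausible one-line wrappers over scipy.stats (an assumption, not the competition's actual code) would be:

from scipy import stats

def kendall(a, b):
    return stats.kendalltau(a, b)[0]    # tau statistic

def kendall_p(a, b):
    return stats.kendalltau(a, b)[1]    # p-value

def mannwhitney(a, b):
    return stats.mannwhitneyu(a, b)[0]  # U statistic

def mannwhitney_p(a, b):
    return stats.mannwhitneyu(a, b)[1]  # p-value

def kruskal(a, b):
    return stats.kruskal(a, b)[0]       # H statistic

def kruskal_p(a, b):
    return stats.kruskal(a, b)[1]       # p-value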
Example #19
def extract_valid_features():
    start = time.time()
    features = feature_extractor()
    header = []
    for h in features.features:
        header.append(h[0])


    print("Reading the valid pairs")
    X = data_io.read_valid_pairs()

    print("Extracting features")
    # well, no fit data, so y = None
    extracted = features.fit_transform(X,
                                       y=None,
                                       type_map=data_io.read_valid_info())

    elapsed = float(time.time() - start)
    print("Features extracted in " + str(elapsed / 60.0) + " Minutes")

    print("Saving features")
    X = pd.DataFrame(extracted, index=X.index)
    X.columns = header
    data_io.save_valid_features(X)
Example #20
def extract_valid_features():
    start = time.time()
    features = feature_extractor()
    header = []
    for h in features.features:
        header.append(h[0])

    print("Reading the valid pairs")
    X = data_io.read_valid_pairs()

    print("Extracting features")
    # well, no fit data, so y = None
    extracted = features.fit_transform(X,
                                       y=None,
                                       type_map=data_io.read_valid_info())

    elapsed = float(time.time() - start)
    print("Features extracted in " + str(elapsed / 60.0) + " Minutes")

    print("Saving features")
    X = pd.DataFrame(extracted, index=X.index)
    X.columns = header
    data_io.save_valid_features(X)
Example #21
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'injectivity.csv')):
        print('Feature file already exists - not overwriting')
        return

    features = [('Injectivity 10', ['A','B'], f.injectivity_10),
                ('Injectivity 15', ['A','B'], f.injectivity_15),
                ('Injectivity 20', ['A','B'], f.injectivity_20),
                ('Injectivity 25', ['A','B'], f.injectivity_25),
                ('Injectivity 30', ['A','B'], f.injectivity_30),
                ('Injectivity 35', ['A','B'], f.injectivity_35),
                ('Injectivity 40', ['A','B'], f.injectivity_40)]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('injectivity', all_features, feature_names)
Example #22
    jointp = np.outer(proba_nz, probb_nz)
    hpos = np.sum(np.log(jointp) * jointp)
    return -hpos


if __name__ == "__main__":

    print "Reading in {} data...".format(DATA)

    if DATA == "train":
        info = data_io.read_train_info()
        train = data_io.read_train_pairs()
    elif DATA == "valid":
        info = data_io.read_valid_info()
        train = data_io.read_valid_pairs()
    else:
        raise ValueError("DATA must be 'train' or 'valid'")

    print "Saving coded info matrix..."
    codes = np.zeros(info.values.shape)
    lookup = {"Numerical": 1, "Categorical": 2, "Binary": 3}
    for i, t in enumerate(info.values):
        a, b = t
        codes[i, :] = [lookup[a], lookup[b]]

    savemat("matlab/{}info.mat".format(DATA), {"codes": codes}, oned_as="column")

    print "Saving value matrices..."
    for i, t in enumerate(train.values):
        A, B = t
Example #23
    jointp = np.outer(proba_nz, probb_nz)
    hpos = np.sum(np.log(jointp) * jointp)
    return -hpos


if __name__ == '__main__':

    print('Reading in {} data...'.format(DATA))

    if DATA == 'train':
        info = data_io.read_train_info()
        train = data_io.read_train_pairs()
    elif DATA == 'valid':
        info = data_io.read_valid_info()
        train = data_io.read_valid_pairs()
    else:
        raise ValueError("DATA must be 'train' or 'valid'")

    print('Saving coded info matrix...')
    codes = np.zeros(info.values.shape)
    lookup = {'Numerical': 1, 'Categorical': 2, 'Binary': 3}
    for i, t in enumerate(info.values):
        a, b = t
        codes[i, :] = [lookup[a], lookup[b]]

    savemat('matlab/{}info.mat'.format(DATA), {'codes': codes},
            oned_as='column')

    print('Saving value matrices...')
    for i, t in enumerate(train.values):
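
Examples #22 and #23 open mid-function (and end mid-loop): their first three lines are the tail of an entropy helper. A self-contained reconstruction of that shared fragment, with the function name assumed and the argument names taken from the fragment itself:

import numpy as np

def joint_entropy(proba_nz, probb_nz):
    # Entropy of the product distribution built from two nonzero marginal
    # probability vectors, matching the fragment the two examples share.
    jointp = np.outer(proba_nz, probb_nz)
    hpos = np.sum(np.log(jointp) * jointp)
    return -hpos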