def getValidationDataset(self):
    print "Reading the valid pairs"
    valid = data_io.read_valid_pairs()
    valid2 = data_io.read_valid_info()
    valid["A type"] = valid2["A type"]
    valid["B type"] = valid2["B type"]
    return valid
def ext():
    # Read the pairs
    print "Read Pairs"
    print "Read Train"
    train = d.read_train_pairs()
    print "Read Valid"
    valid = d.read_valid_pairs()
    print "Read Sup1"
    sup1 = d.read_sup1_train_pairs()
    print "Read Sup2"
    sup2 = d.read_sup2_train_pairs()
    print "Read Sup3"
    sup3 = d.read_sup3_train_pairs()

    # Get the feature extractor
    combined = feat.feature_extractor()

    # Extract the features
    print 'Extract the features'
    print "Extract Train"
    train_att = combined.fit_transform(train)
    print "Extract Valid"
    valid_att = combined.fit_transform(valid)
    print "Extract Sup1"
    sup1_att = combined.fit_transform(sup1)
    print "Extract Sup2"
    sup2_att = combined.fit_transform(sup2)
    print "Extract Sup3"
    sup3_att = combined.fit_transform(sup3)

    print "Join"
    total_new_att = np.vstack((train_att, valid_att, sup1_att, sup2_att, sup3_att))

    # Save extracted data
    np.save('total_new_att.npy', total_new_att)
def extrair_tudo():  # "extract everything"
    combined = new_features1()

    print "Train"
    train = d.read_train_pairs()
    train_att = combined.fit_transform(train)
    np.save("train_att.npy", train_att)

    print "Valid"
    valid = d.read_valid_pairs()
    valid_att = combined.fit_transform(valid)
    np.save("valid_att.npy", valid_att)

    print "Sup1"
    sup1 = d.read_sup1_train_pairs()
    sup1_att = combined.fit_transform(sup1)
    np.save("sup1_att.npy", sup1_att)

    print "Sup2"
    sup2 = d.read_sup2_train_pairs()
    sup2_att = combined.fit_transform(sup2)
    np.save("sup2_att.npy", sup2_att)

    print "Sup3"
    sup3 = d.read_sup3_train_pairs()
    sup3_att = combined.fit_transform(sup3)
    np.save("sup3_att.npy", sup3_att)
def main(overwrite=False):
    #### TODO - sequential processing of data would significantly reduce memory demands
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'add.noise.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Additive noise model AB', ['A','B'], f.add_noise_model_AB),
                ('Additive noise model BA', ['A','B'], f.add_noise_model_BA)]

    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('add_noise', all_features, feature_names)
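# Note: f.add_noise_model_AB / f.add_noise_model_BA are defined elsewhere in the
# repo; the sketch below is only a rough, assumed illustration of the usual
# additive-noise-model idea (regress the effect on the cause, then check how
# dependent the residuals still are on the cause), not the authors' implementation.
import numpy as np
from scipy import stats

def additive_noise_score(a, b, degree=3):
    # fit a low-degree polynomial regression of b on a
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    coeffs = np.polyfit(a, b, degree)
    residuals = b - np.polyval(coeffs, a)
    # low residual dependence on a supports the A -> B direction;
    # the BA-direction feature would simply swap the arguments
    rho, _ = stats.spearmanr(a, residuals)
    return abs(rho)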
def getDataset(self):
    if self.getTrain:
        readData = data_io.read_train_pairs()
        readData2 = data_io.read_train_info()
    else:
        readData = data_io.read_valid_pairs()
        readData2 = data_io.read_valid_info()
    readData["A type"] = readData2["A type"]
    readData["B type"] = readData2["B type"]
    return readData
def main():
    print("Reading the valid pairs")
    valid = data_io.read_valid_pairs()

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = classifier.predict(valid)
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def main(overwrite=False):
    #### TODO - sequential processing of data would significantly reduce memory demands
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'reasonable_features.csv')):
        print 'Feature file already exists - not overwriting'
        return

    # Apologies for the 'derived' feature definition mechanism - it is a quick hack
    # to prevent duplicated computation: the string expressions reference earlier
    # feature values by position, e.g. output[key][2] is 'Pearson R'.
    features = [('A: Normalized Entropy', 'A', f.normalized_entropy),
                ('B: Normalized Entropy', 'B', f.normalized_entropy),
                ('Pearson R', ['A','B'], f.correlation),
                ('Pearson R Magnitude', 'derived', 'abs(output[key][2])'),
                ('Entropy Difference', 'derived', 'output[key][0] - output[key][1]'),
                ('Entropy Ratio', 'derived', 'output[key][0] / output[key][1] if not output[key][1] == 0 else output[key][0] / 0.000001'),
                ('Spearman rank correlation', ['A','B'], f.rcorrelation),
                ('Spearman rank magnitude', 'derived', 'abs(output[key][6])'),
                ('Kurtosis A', 'A', f.fkurtosis),
                ('Kurtosis B', 'B', f.fkurtosis),
                ('Kurtosis difference', 'derived', 'output[key][8] - output[key][9]'),
                ('Kurtosis ratio', 'derived', 'output[key][8] / output[key][9] if not output[key][9] == 0 else output[key][8] / 0.000001'),
                ('Unique ratio A', 'A', f.unique_ratio),
                ('Unique ratio B', 'B', f.unique_ratio),
                ('Skew A', 'A', f.fskew),
                ('Skew B', 'B', f.fskew),
                ('Skew difference', 'derived', 'output[key][14] - output[key][15]'),
                ('Skew ratio', 'derived', 'output[key][14] / output[key][15] if not output[key][15] == 0 else output[key][14] / 0.000001'),
                ('Pearson - Spearman', 'derived', 'output[key][2] - output[key][6]'),
                ('Abs Pearson - Spearman', 'derived', 'output[key][3] - output[key][7]'),
                ('Pearson / Spearman', 'derived', 'output[key][2] / output[key][6] if not output[key][6] == 0 else output[key][2] / 0.000001')]

    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('reasonable_features', all_features, feature_names)
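# Note: f.apply_features is defined elsewhere; the sketch below is an assumed
# illustration of how the 'derived' string features above could be evaluated,
# where `output[key]` is taken to be the list of feature values already computed
# for a given pair.
def evaluate_derived_features(output, features):
    for key in output:
        for i, (name, source, value) in enumerate(features):
            if source == 'derived':
                # the string expression references earlier columns by position,
                # e.g. 'abs(output[key][2])' for the Pearson R magnitude
                output[key][i] = eval(value)
    return output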
def main(overwrite=False):
    #### TODO - sequential processing of data would significantly reduce memory demands
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'high_order_moments.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Moment 5 A', 'A', f.standard_moment_5),
                ('Moment 5 B', 'B', f.standard_moment_5),
                ('Moment 5 diff', 'derived', 'output[key][0] - output[key][1]'),
                ('Moment 5 ratio', 'derived', 'output[key][0] / output[key][1] if not output[key][1] == 0 else output[key][0] / 0.000001'),
                ('Moment 6 A', 'A', f.standard_moment_6),
                ('Moment 6 B', 'B', f.standard_moment_6),
                ('Moment 6 diff', 'derived', 'output[key][4] - output[key][5]'),
                ('Moment 6 ratio', 'derived', 'output[key][4] / output[key][5] if not output[key][5] == 0 else output[key][4] / 0.000001'),
                ('Moment 7 A', 'A', f.standard_moment_7),
                ('Moment 7 B', 'B', f.standard_moment_7),
                ('Moment 7 diff', 'derived', 'output[key][8] - output[key][9]'),
                ('Moment 7 ratio', 'derived', 'output[key][8] / output[key][9] if not output[key][9] == 0 else output[key][8] / 0.000001'),
                ('Moment 8 A', 'A', f.standard_moment_8),
                ('Moment 8 B', 'B', f.standard_moment_8),
                ('Moment 8 diff', 'derived', 'output[key][12] - output[key][13]'),
                ('Moment 8 ratio', 'derived', 'output[key][12] / output[key][13] if not output[key][13] == 0 else output[key][12] / 0.000001'),
                ('Moment 9 A', 'A', f.standard_moment_9),
                ('Moment 9 B', 'B', f.standard_moment_9),
                ('Moment 9 diff', 'derived', 'output[key][16] - output[key][17]'),
                ('Moment 9 ratio', 'derived', 'output[key][16] / output[key][17] if not output[key][17] == 0 else output[key][16] / 0.000001')]

    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('high_order_moments', all_features, feature_names)
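# Note: the standard_moment_k helpers are defined elsewhere; a plausible (assumed)
# definition is the k-th standardized moment, i.e. the mean of the z-scored data
# raised to the k-th power.
def standard_moment(x, k):
    x = np.asarray(x, dtype=float)
    z = (x - x.mean()) / x.std()
    return (z ** k).mean()

# e.g. standard_moment(x, 3) is the skewness and standard_moment(x, 4) the
# (non-excess) kurtosis, so moments 5-9 above extend the same idea to higher orders.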
def main():
    print("Reading the valid pairs")
    valid = data_io.read_valid_pairs()
    valid_info = data_io.read_valid_info()
    valid = pd.concat([valid, valid_info], axis=1)
    # `train` (the module providing get_types) and `fn` (the submission filename)
    # are not defined in this function, so both must be available at module scope
    valid = train.get_types(valid)

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = classifier.predict(valid)
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions, fn)
def main():
    print("Reading the valid pairs")
    valid = data_io.read_valid_pairs()

    features = fe.feature_extractor()
    print("Transforming features")
    trans_valid = features.fit_transform(valid)
    trans_valid = np.nan_to_num(trans_valid)

    print("Saving Valid Features")
    data_io.save_valid_features(trans_valid)

    print("Loading the classifier")
    #(both_classifier, A_classifier, B_classifier, none_classifier) = data_io.load_model()
    classifier = data_io.load_model()

    print("Making predictions")
    valid_info = data_io.read_valid_info()
    predictions = list()
    curr_pred = None
    """
    for i in range(len(trans_valid)):
        if valid_info["A type"][i] == "Numerical" and valid_info["B type"][i] == "Numerical":
            curr_pred = both_classifier.predict_proba(trans_valid[i, :])
        elif valid_info["A type"][i] == "Numerical" and valid_info["B type"][i] != "Numerical":
            curr_pred = A_classifier.predict_proba(trans_valid[i, :])
        elif valid_info["A type"][i] != "Numerical" and valid_info["B type"][i] == "Numerical":
            curr_pred = B_classifier.predict_proba(trans_valid[i, :])
        else:
            curr_pred = none_classifier.predict_proba(trans_valid[i, :])
        predictions.append(curr_pred[0][2] - curr_pred[0][0])
    """
    orig_predictions = classifier.predict_proba(trans_valid)
    predictions = orig_predictions[:, 2] - orig_predictions[:, 0]
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def main(overwrite=False):
    #### TODO - sequential processing of data would significantly reduce memory demands
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'icgi.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('ICGI entropy AB', ['A','B'], f.icgi_entropy_AB),
                ('ICGI entropy BA', ['A','B'], f.icgi_entropy_BA),
                ('ICGI entropy diff', 'derived', 'output[key][0] - output[key][1]'),
                ('ICGI slope AB', ['A','B'], f.icgi_slope_AB),
                ('ICGI slope BA', ['A','B'], f.icgi_slope_BA),
                ('ICGI slope diff', 'derived', 'output[key][3] - output[key][4]')]#,
                #('ICGI entropy AB PIT', ['A','B'], f.icgi_entropy_AB_PIT),
                #('ICGI entropy BA PIT', ['A','B'], f.icgi_entropy_BA_PIT),
                #('ICGI entropy diff PIT', 'derived', 'output[key][6] - output[key][7]'),
                #('ICGI slope AB PIT', ['A','B'], f.icgi_slope_AB_PIT),
                #('ICGI slope BA PIT', ['A','B'], f.icgi_slope_BA_PIT),
                #('ICGI slope diff PIT', 'derived', 'output[key][9] - output[key][10]')]

    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('icgi', all_features, feature_names)
def main():
    print("Reading the valid pairs")
    valid = data_io.read_valid_pairs()

    features = fe.feature_extractor()
    print("Transforming features")
    trans_valid = features.fit_transform(valid)
    trans_valid = np.nan_to_num(trans_valid)

    print("Saving Valid Features")
    data_io.save_features(trans_valid)

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    orig_predictions = classifier.predict_proba(trans_valid)
    predictions = orig_predictions[:, 2] - orig_predictions[:, 0]
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)
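# Note: an assumed illustration (the class ordering is not confirmed in this
# section) of the score computed above: with target classes ordered
# (-1: B causes A, 0: no causal link, 1: A causes B), predict_proba returns
# columns [P(-1), P(0), P(1)], so column 2 minus column 0 is
# P(A -> B) - P(B -> A), a score in [-1, 1].
import numpy as np
example_proba = np.array([[0.1, 0.2, 0.7],   # fairly confident A -> B
                          [0.6, 0.3, 0.1]])  # fairly confident B -> A
example_scores = example_proba[:, 2] - example_proba[:, 0]  # -> [0.6, -0.5]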
def main(overwrite=False):
    #### TODO - sequential processing of data would significantly reduce memory demands
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'unreasonable_features.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Number of Samples', 'A', len),
                ('Max A', 'A', max),
                ('Max B', 'B', max),
                ('Min A', 'A', min),
                ('Min B', 'B', min),
                ('Mean A', 'A', f.mean),
                ('Mean B', 'B', f.mean),
                ('Median A', 'A', f.median),
                ('Median B', 'B', f.median),
                ('Sd A', 'A', f.sd),
                ('Sd B', 'B', f.sd)]

    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('unreasonable_features', all_features, feature_names)
def main():
    extractor = feature_extractor()

    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = extractor.fit_transform(train[:])

    print("Reading in the ensemble training data")
    ensemble_train = data_io.read_ensemble_train_pairs()

    print("Extracting features from ensemble training data")
    ensemble_train_features = extractor.fit_transform(ensemble_train[:])

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = extractor.fit_transform(valid[:])

    all_features = np.concatenate((train_features, ensemble_train_features, valid_features))

    print("Concatenating names")
    train_names = [train.irow(i).name for i in range(len(train))]
    ensemble_train_names = [ensemble_train.irow(i).name for i in range(len(ensemble_train))]
    valid_names = [valid.irow(i).name for i in range(len(valid))]
    all_names = train_names + ensemble_train_names + valid_names

    print("Writing feature file")
    feature_names = ['Number of Samples',
                     'A: Number of Unique Samples',
                     'B: Number of Unique Samples',
                     'A: Normalized Entropy',
                     'B: Normalized Entropy',
                     'Pearson R',
                     'Pearson R Magnitude',
                     'Entropy Difference']
    data_io.write_real_features('benchmark_features', all_names, all_features, feature_names)
def main(overwrite=False):
    #### TODO - sequential processing of data would significantly reduce memory demands
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'corrs.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Kendall tau', ['A','B'], f.kendall),
                ('Kendall tau p', ['A','B'], f.kendall_p),
                ('Mann Whitney', ['A','B'], f.mannwhitney),
                ('Mann Whitney p', ['A','B'], f.mannwhitney_p),
                #('Wilcoxon', ['A','B'], f.wilcoxon),
                #('Wilcoxon p', ['A','B'], f.wilcoxon_p),
                ('Kruskal', ['A','B'], f.kruskal),
                ('Kruskal p', ['A','B'], f.kruskal_p),
                ]

    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('corrs', all_features, feature_names)
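# Note: f.kendall, f.mannwhitney, f.kruskal and their *_p variants are defined
# elsewhere; a plausible (assumed) implementation is a thin wrapper around
# scipy.stats, with the plain feature returning the test statistic and the '_p'
# feature the p-value, e.g.:
from scipy import stats

def kendall(a, b):
    tau, _ = stats.kendalltau(a, b)
    return tau

def kendall_p(a, b):
    _, p = stats.kendalltau(a, b)
    return p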
def extract_valid_features():
    start = time.time()
    features = feature_extractor()
    header = []
    for h in features.features:
        header.append(h[0])

    print("Reading the valid pairs")
    X = data_io.read_valid_pairs()

    print("Extracting features")
    # well, no fit data, so y = None
    extracted = features.fit_transform(X, y=None, type_map=data_io.read_valid_info())
    elapsed = float(time.time() - start)
    print("Features extracted in " + str(elapsed / 60.0) + " Minutes")

    print("Saving features")
    X = pd.DataFrame(extracted, index=X.index)
    X.columns = header
    data_io.save_valid_features(X)
def main(overwrite=False):
    #### TODO - sequential processing of data would significantly reduce memory demands
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'injectivity.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Injectivity 10', ['A','B'], f.injectivity_10),
                ('Injectivity 15', ['A','B'], f.injectivity_15),
                ('Injectivity 20', ['A','B'], f.injectivity_20),
                ('Injectivity 25', ['A','B'], f.injectivity_25),
                ('Injectivity 30', ['A','B'], f.injectivity_30),
                ('Injectivity 35', ['A','B'], f.injectivity_35),
                ('Injectivity 40', ['A','B'], f.injectivity_40)]

    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('injectivity', all_features, feature_names)
    jointp = np.outer(proba_nz, probb_nz)
    # sum(p * log p) is the negative entropy, so -hpos is the entropy of the
    # joint distribution obtained by treating A and B as independent
    hpos = np.sum(np.log(jointp) * jointp)
    return -hpos


if __name__ == "__main__":
    print "Reading in {} data...".format(DATA)
    if DATA == "train":
        info = data_io.read_train_info()
        train = data_io.read_train_pairs()
    elif DATA == "valid":
        info = data_io.read_valid_info()
        train = data_io.read_valid_pairs()
    else:
        raise ValueError

    print "Saving coded info matrix..."
    codes = np.zeros(info.values.shape)
    lookup = {"Numerical": 1, "Categorical": 2, "Binary": 3}
    for i, t in enumerate(info.values):
        a, b = t
        codes[i, :] = [lookup[a], lookup[b]]
    savemat("matlab/{}info.mat".format(DATA), {"codes": codes}, oned_as="column")

    print "Saving value matrices..."
    for i, t in enumerate(train.values):
        A, B = t