Code Example #1
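Loads the validation pairs and attaches the "A type" and "B type" columns from the validation info table.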
def getValidationDataset(self):
    print("Reading the valid pairs")
    valid = data_io.read_valid_pairs()
    valid2 = data_io.read_valid_info()
    valid["A type"] = valid2["A type"]
    valid["B type"] = valid2["B type"]
    return valid
Code Example #2
File: newfeat.py  Project: sibelius/CauseEffect
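Reads the train, valid, and three supplementary pair sets, runs each through a combined feature extractor, stacks the feature matrices, and saves the result to total_new_att.npy.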
def ext():
    # Read the pairs
    print("Read Pairs")
    print("Read Train")
    train = d.read_train_pairs()
    print("Read Valid")
    valid = d.read_valid_pairs()
    print("Read Sup1")
    sup1 = d.read_sup1_train_pairs()
    print("Read Sup2")
    sup2 = d.read_sup2_train_pairs()
    print("Read Sup3")
    sup3 = d.read_sup3_train_pairs()

    # Get the feature extractor
    combined = feat.feature_extractor()

    # Extract the features
    print('Extract the features')
    print("Extract Train")
    train_att = combined.fit_transform(train)
    print("Extract Valid")
    valid_att = combined.fit_transform(valid)
    print("Extract Sup1")
    sup1_att = combined.fit_transform(sup1)
    print("Extract Sup2")
    sup2_att = combined.fit_transform(sup2)
    print("Extract Sup3")
    sup3_att = combined.fit_transform(sup3)

    print("Join")
    total_new_att = np.vstack((train_att, valid_att, sup1_att, sup2_att, sup3_att))

    # Save extracted data
    np.save('total_new_att.npy', total_new_att)
Code Example #3
File: newfeat1.py  Project: sibelius/CauseEffect
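A variant of the previous example that saves each feature matrix to its own .npy file (extrair_tudo is Portuguese for "extract everything").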
def extrair_tudo():
    combined = new_features1()

    print("Train")
    train = d.read_train_pairs()
    train_att = combined.fit_transform(train)
    np.save("train_att.npy", train_att)

    print("Valid")
    valid = d.read_valid_pairs()
    valid_att = combined.fit_transform(valid)
    np.save("valid_att.npy", valid_att)

    print("Sup1")
    sup1 = d.read_sup1_train_pairs()
    sup1_att = combined.fit_transform(sup1)
    np.save("sup1_att.npy", sup1_att)

    print("Sup2")
    sup2 = d.read_sup2_train_pairs()
    sup2_att = combined.fit_transform(sup2)
    np.save("sup2_att.npy", sup2_att)

    print("Sup3")
    sup3 = d.read_sup3_train_pairs()
    sup3_att = combined.fit_transform(sup3)
    np.save("sup3_att.npy", sup3_att)
Code Example #4
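Computes additive-noise-model features in both directions (A→B and B→A) for the train and validation pairs and writes them to a feature file, skipping the work if add.noise.csv already exists.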
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'add.noise.csv')):
        print('Feature file already exists - not overwriting')
        return

    features = [('Additive noise model AB', ['A','B'], f.add_noise_model_AB),
                ('Additive noise model BA', ['A','B'], f.add_noise_model_BA)]
                
    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('add_noise', all_features, feature_names)
Code Example #5
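A generalization of Example #1 that switches between the train and validation sets based on the getTrain flag.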
def getDataset(self):
    if self.getTrain:
        readData = data_io.read_train_pairs()
        readData2 = data_io.read_train_info()
    else:
        readData = data_io.read_valid_pairs()
        readData2 = data_io.read_valid_info()
    readData["A type"] = readData2["A type"]
    readData["B type"] = readData2["B type"]
    return readData
Code Example #6
File: predict.py  Project: kespindler/causeeffect
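A minimal prediction script: read the validation pairs, load a previously saved classifier, predict, and write the submission file.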
def main():
    print("Reading the valid pairs") 
    valid = data_io.read_valid_pairs()

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions") 
    predictions = classifier.predict(valid)
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)
Code Example #7
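Generates a broad set of statistical features (normalized entropy, Pearson and Spearman correlations, kurtosis, skew, unique-value ratios) plus 'derived' differences and ratios computed from earlier columns.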
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'reasonable_features.csv')):
        print('Feature file already exists - not overwriting')
        return

    features = [('A: Normalized Entropy', 'A', f.normalized_entropy),
                ('B: Normalized Entropy', 'B', f.normalized_entropy),
                ('Pearson R', ['A','B'], f.correlation),
                ('Pearson R Magnitude', 'derived', 'abs(output[key][2])'),# Apologies for this weird feature definition mechanism - it is a quick hack to prevent duplicated computation
                ('Entropy Difference', 'derived', 'output[key][0] - output[key][1]'),
                ('Entropy Ratio', 'derived', 'output[key][0] / output[key][1] if not output[key][1] == 0 else output[key][0] / 0.000001'),
                ('Spearman rank correlation', ['A','B'], f.rcorrelation),
                ('Spearman rank magnitude', 'derived', 'abs(output[key][6])'),
                ('Kurtosis A', 'A', f.fkurtosis),
                ('Kurtosis B', 'B', f.fkurtosis),
                ('Kurtosis difference', 'derived', 'output[key][8] - output[key][9]'),
                ('Kurtosis ratio', 'derived', 'output[key][8] / output[key][9] if not output[key][9] == 0 else output[key][8] / 0.000001'),
                ('Unique ratio A', 'A', f.unique_ratio),
                ('Unique ratio B', 'B', f.unique_ratio),
                ('Skew A', 'A', f.fskew),
                ('Skew B', 'B', f.fskew),
                ('Skew difference', 'derived', 'output[key][14] - output[key][15]'),
                ('Skew ratio', 'derived', 'output[key][14] / output[key][15] if not output[key][15] == 0 else output[key][14] / 0.000001'),
                ('Pearson - Spearman', 'derived', 'output[key][2] - output[key][6]'),
                ('Abs Pearson - Spearman', 'derived', 'output[key][3] - output[key][7]'),
                ('Pearson / Spearman', 'derived', 'output[key][2] / output[key][6] if not output[key][6] == 0 else output[key][2] / 0.000001')]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('reasonable_features', all_features, feature_names)
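The 'derived' entries above depend on a string-eval mechanism inside f.apply_features, whose source is not part of this listing. A minimal sketch of how such a dispatcher could work, assuming it builds an output dict keyed by pair id and evaluates 'derived' expressions with output and key in scope; all names and structure here are illustrative, not the project's actual implementation:

def apply_features(data, features):
    # Illustrative sketch only -- the real f.apply_features is not shown in
    # this listing.  `data` is assumed to be a DataFrame whose 'A' and 'B'
    # cells each hold one sample array.
    output = {}
    for key in data.index:
        output[key] = []
        for name, columns, extractor in features:
            if columns == 'derived':
                # 'derived' features are string expressions evaluated against
                # the values already computed for this pair, e.g.
                # 'output[key][0] - output[key][1]'
                value = eval(extractor)
            elif isinstance(columns, list):
                # pairwise features receive both samples
                value = extractor(data['A'][key], data['B'][key])
            else:
                # single-variable features receive one sample
                value = extractor(data[columns][key])
            output[key].append(value)
    return output

Evaluating the strings after the plain features lets each derived column reuse values already in output[key], which matches the author's comment about avoiding duplicated computation.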
Code Example #8
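The same pattern applied to standardized moments 5 through 9 of A and B, again with derived differences and ratios.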
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'high_order_moments.csv')):
        print('Feature file already exists - not overwriting')
        return

    features = [('Moment 5 A', 'A', f.standard_moment_5),
                ('Moment 5 B', 'B', f.standard_moment_5),
                ('Moment 5 diff', 'derived', 'output[key][0] - output[key][1]'),
                ('Moment 5 ratio', 'derived', 'output[key][0] / output[key][1] if not output[key][1] == 0 else output[key][0] / 0.000001'),
                ('Moment 6 A', 'A', f.standard_moment_6),
                ('Moment 6 B', 'B', f.standard_moment_6),
                ('Moment 6 diff', 'derived', 'output[key][4] - output[key][5]'),
                ('Moment 6 ratio', 'derived', 'output[key][4] / output[key][5] if not output[key][5] == 0 else output[key][4] / 0.000001'),
                ('Moment 7 A', 'A', f.standard_moment_7),
                ('Moment 7 B', 'B', f.standard_moment_7),
                ('Moment 7 diff', 'derived', 'output[key][8] - output[key][9]'),
                ('Moment 7 ratio', 'derived', 'output[key][8] / output[key][9] if not output[key][9] == 0 else output[key][8] / 0.000001'),
                ('Moment 8 A', 'A', f.standard_moment_8),
                ('Moment 8 B', 'B', f.standard_moment_8),
                ('Moment 8 diff', 'derived', 'output[key][12] - output[key][13]'),
                ('Moment 8 ratio', 'derived', 'output[key][12] / output[key][13] if not output[key][13] == 0 else output[key][12] / 0.000001'),
                ('Moment 9 A', 'A', f.standard_moment_9),
                ('Moment 9 B', 'B', f.standard_moment_9),
                ('Moment 9 diff', 'derived', 'output[key][16] - output[key][17]'),
                ('Moment 9 ratio', 'derived', 'output[key][16] / output[key][17] if not output[key][17] == 0 else output[key][16] / 0.000001')]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('high_order_moments', all_features, feature_names)
Code Example #9
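A prediction script that first concatenates the validation info (variable types) onto the pairs before predicting.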
def main():
    print("Reading the valid pairs")
    valid = data_io.read_valid_pairs()
    valid_info = data_io.read_valid_info()
    valid = pd.concat([valid, valid_info], axis=1)
    valid = train.get_types(valid)  # `train` is presumably the project's training module

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = classifier.predict(valid)
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions, fn)  # `fn` is presumably a filename defined elsewhere in the script
Code Example #10
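A prediction script that transforms the validation pairs into features before predicting; the triple-quoted block preserves an earlier, disabled per-variable-type classifier scheme.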
def main():
    print("Reading the valid pairs")
    valid = data_io.read_valid_pairs()
    features = fe.feature_extractor()
    print("Transforming features")
    trans_valid = features.fit_transform(valid)
    trans_valid = np.nan_to_num(trans_valid)

    print("Saving Valid Features")
    data_io.save_valid_features(trans_valid)

    print("Loading the classifier")
    #(both_classifier, A_classifier, B_classifier, none_classifier) = data_io.load_model()
    classifier = data_io.load_model()

    print("Making predictions")
    valid_info = data_io.read_valid_info()
    predictions = list()
    curr_pred = None
    """
    for i in range(len(trans_valid)):
      
      if valid_info["A type"][i] == "Numerical" and valid_info["B type"][i] == "Numerical":
        curr_pred = both_classifier.predict_proba(trans_valid[i, :])
      
      elif valid_info["A type"][i] == "Numerical" and valid_info["B type"][i] != "Numerical":
        curr_pred = A_classifier.predict_proba(trans_valid[i, :])
      
      elif valid_info["A type"][i] != "Numerical" and valid_info["B type"][i] == "Numerical":
        curr_pred = B_classifier.predict_proba(trans_valid[i, :])
     
      else:
        curr_pred = none_classifier.predict_proba(trans_valid[i, :])

      predictions.append(curr_pred[0][2] - curr_pred[0][0])
    """

    orig_predictions = classifier.predict_proba(trans_valid)
    predictions = orig_predictions[:, 2] - orig_predictions[:, 0]
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)
Code Example #11
File: generate_icgi.py  Project: evoup/cause-effect
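ICGI features (presumably information-geometric causal inference): entropy- and slope-based scores in both directions plus their differences; the PIT variants are commented out.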
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'icgi.csv')):
        print('Feature file already exists - not overwriting')
        return

    features = [('ICGI entropy AB', ['A','B'], f.icgi_entropy_AB),
                ('ICGI entropy BA', ['A','B'], f.icgi_entropy_BA),
                ('ICGI entropy diff', 'derived', 'output[key][0] - output[key][1]'),
                ('ICGI slope AB', ['A','B'], f.icgi_slope_AB),
                ('ICGI slope BA', ['A','B'], f.icgi_slope_BA),
                ('ICGI slope diff', 'derived', 'output[key][3] - output[key][4]')]#,
                #('ICGI entropy AB PIT', ['A','B'], f.icgi_entropy_AB_PIT),
                #('ICGI entropy BA PIT', ['A','B'], f.icgi_entropy_BA_PIT),
                #('ICGI entropy diff PIT', 'derived', 'output[key][6] - output[key][7]'),
                #('ICGI slope AB PIT', ['A','B'], f.icgi_slope_AB_PIT),
                #('ICGI slope BA PIT', ['A','B'], f.icgi_slope_BA_PIT),
                #('ICGI slope diff PIT', 'derived', 'output[key][9] - output[key][10]')]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('icgi', all_features, feature_names)
Code Example #12
File: predict.py  Project: sjuvekar/CauseEffectPairs
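Feature-based prediction where the submitted score is taken from predict_proba, presumably as P(A→B) minus P(B→A).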
def main():
    print("Reading the valid pairs") 
    valid = data_io.read_valid_pairs()
    features = fe.feature_extractor()
    print("Transforming features")
    trans_valid = features.fit_transform(valid)
    trans_valid = np.nan_to_num(trans_valid)

    print("Saving Valid Features")
    data_io.save_features(trans_valid)

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions") 
    orig_predictions = classifier.predict_proba(trans_valid)
    predictions = orig_predictions[:, 2] - orig_predictions[:, 0]
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)
Code Example #13
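The 'unreasonable' baseline features: simple summary statistics (sample count, max, min, mean, median, standard deviation) of A and B.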
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands

    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'unreasonable_features.csv')):
        print('Feature file already exists - not overwriting')
        return
        
    features = [('Number of Samples', 'A', len),
                ('Max A', 'A', max),
                ('Max B', 'B', max),
                ('Min A', 'A', min),
                ('Min B', 'B', min),
                ('Mean A', 'A', f.mean),
                ('Mean B', 'B', f.mean),
                ('Median A', 'A', f.median),
                ('Median B', 'B', f.median),
                ('Sd A', 'A', f.sd),
                ('Sd B', 'B', f.sd)]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('unreasonable_features', all_features, feature_names)
Code Example #14
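Extracts the benchmark feature set for the train, ensemble-train, and validation pairs and writes one combined feature file keyed by row name.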
def main():
    extractor = feature_extractor()
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = extractor.fit_transform(train[:])
    
    print("Reading in the ensemble training data")
    ensemble_train = data_io.read_ensemble_train_pairs()

    print("Extracting features from ensemble training data")
    ensemble_train_features = extractor.fit_transform(ensemble_train[:])
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = extractor.fit_transform(valid[:])
    
    all_features = np.concatenate((train_features, ensemble_train_features, valid_features))
    
    print("Concatenating names")
    train_names = list(train.index)  # row names are the DataFrame index labels
    ensemble_train_names = list(ensemble_train.index)
    valid_names = list(valid.index)
    all_names = train_names + ensemble_train_names + valid_names
    
    print("Writing feature file")
    feature_names = ['Number of Samples',
                     'A: Number of Unique Samples',
                     'B: Number of Unique Samples',
                     'A: Normalized Entropy',
                     'B: Normalized Entropy',
                     'Pearson R',
                     'Pearson R Magnitude',
                     'Entropy Difference']
    data_io.write_real_features('benchmark_features', all_names, all_features, feature_names)
Code Example #15
File: generate_corrs.py  Project: evoup/cause-effect
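Association statistics as features: Kendall tau, Mann-Whitney, and Kruskal tests with their p-values (the Wilcoxon pair is commented out).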
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'corrs.csv')):
        print('Feature file already exists - not overwriting')
        return

    features = [('Kendall tau', ['A','B'], f.kendall),
                ('Kendall tau p', ['A','B'], f.kendall_p),
                ('Mann Whitney', ['A','B'], f.mannwhitney),
                ('Mann Whitney p', ['A','B'], f.mannwhitney_p),
                #('Wilcoxon', ['A','B'], f.wilcoxon),
                #('Wilcoxon p', ['A','B'], f.wilcoxon_p),
                ('Kruskal', ['A','B'], f.kruskal),
                ('Kruskal p', ['A','B'], f.kruskal_p),
                ]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('corrs', all_features, feature_names)
Code Example #16
File: fe.py  Project: diogo149/causality
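Extracts validation features with a type map taken from the validation info, timing the run and saving the result as a DataFrame with named columns.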
def extract_valid_features():
    start = time.time()
    features = feature_extractor()
    header = []
    for h in features.features:
        header.append(h[0])

    print("Reading the valid pairs")
    X = data_io.read_valid_pairs()

    print("Extracting features")
    # well, no fit data, so y = None
    extracted = features.fit_transform(X, y=None, type_map=data_io.read_valid_info())

    elapsed = float(time.time() - start)
    print("Features extracted in " + str(elapsed / 60.0) + " Minutes")

    print("Saving features")
    X = pd.DataFrame(extracted, index=X.index)
    X.columns = header
    data_io.save_valid_features(X)
Code Example #17
File: fe.py  Project: yezhou-huang/causality
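The same extract_valid_features function, listed again from what appears to be a sibling fork of the project.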
def extract_valid_features():
    start = time.time()
    features = feature_extractor()
    header = []
    for h in features.features:
        header.append(h[0])

    print("Reading the valid pairs")
    X = data_io.read_valid_pairs()

    print("Extracting features")
    # well, no fit data, so y = None
    extracted = features.fit_transform(X,
                                       y=None,
                                       type_map=data_io.read_valid_info())

    elapsed = float(time.time() - start)
    print("Features extracted in " + str(elapsed / 60.0) + " Minutes")

    print("Saving features")
    X = pd.DataFrame(extracted, index=X.index)
    X.columns = header
    data_io.save_valid_features(X)
Code Example #18
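Injectivity features computed at scales 10 through 40.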
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'injectivity.csv')):
        print('Feature file already exists - not overwriting')
        return

    features = [('Injectivity 10', ['A','B'], f.injectivity_10),
                ('Injectivity 15', ['A','B'], f.injectivity_15),
                ('Injectivity 20', ['A','B'], f.injectivity_20),
                ('Injectivity 25', ['A','B'], f.injectivity_25),
                ('Injectivity 30', ['A','B'], f.injectivity_30),
                ('Injectivity 35', ['A','B'], f.injectivity_35),
                ('Injectivity 40', ['A','B'], f.injectivity_40)]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('injectivity', all_features, feature_names)
Code Example #19
File: create_mat.py  Project: kespindler/causeeffect
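Exports the coded variable-type info and the pair values to MATLAB .mat files for the chosen DATA split (Numerical=1, Categorical=2, Binary=3).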
    # (tail of an entropy helper: returns the Shannon entropy of the
    # outer-product joint distribution of the two marginals)
    jointp = np.outer(proba_nz, probb_nz)
    hpos = np.sum(np.log(jointp) * jointp)
    return -hpos


if __name__ == "__main__":

    print("Reading in {} data...".format(DATA))

    if DATA == "train":
        info = data_io.read_train_info()
        train = data_io.read_train_pairs()
    elif DATA == "valid":
        info = data_io.read_valid_info()
        train = data_io.read_valid_pairs()
    else:
        raise ValueError

    print("Saving coded info matrix...")
    codes = np.zeros(info.values.shape)
    lookup = {"Numerical": 1, "Categorical": 2, "Binary": 3}
    for i, t in enumerate(info.values):
        a, b = t
        codes[i, :] = [lookup[a], lookup[b]]

    savemat("matlab/{}info.mat".format(DATA), {"codes": codes}, oned_as="column")

    print("Saving value matrices...")
    for i, t in enumerate(train.values):
        A, B = t
        # (snippet truncated here in the source; presumably each A/B pair
        # is written out to its own .mat file via savemat)