Beispiel #1
0
def calcMetr(vn_df, msg=' '):
    """Print classification metrics for a frame of labelled predictions.

    Parameters
    ----------
    vn_df : pandas.DataFrame
        Expected to carry 'var_id', 'pred_labels' and 'status' columns;
        rows with a duplicated var_id are dropped (first occurrence kept).
    msg : str
        Free-text tag echoed to stdout before the metrics.

    Returns None; metrics are printed by tst.getMetrics().
    """
    tst = train.TrainTest('x',
                      '/mnt/xfs1/home/asalomatov/projects/variants/variants/ssc_wes_features_noINDEL_noDP.txt',
                      ['status'],
                      ['descr'])
    # Guard against an empty frame (mirrors the sibling calcMetr variant):
    # the dedup/astype steps below assume at least one row.
    if vn_df.empty:
        return None
    vn_d = vn_df[~vn_df.var_id.duplicated()]
    tst.pred_y = numpy.array(vn_d['pred_labels'].astype(int))
    # Ground truth: 1 where status is 'Y', else 0 (same shape as pred_y).
    tst.test_set_y = tst.pred_y * 0
    tst.test_set_y[numpy.array(vn_d.status.isin(['Y']))] = 1
    # print() call form is valid under both Python 2 and 3 for a single
    # argument; the stray 'HHHHHHHAAAAAAA' debug banner was removed.
    print(pandas.Series(tst.test_set_y).value_counts())
    print(msg)
    tst.getMetrics()
Beispiel #2
0
def calcMetr(vn_df, msg=' '):
    """Deduplicate a predictions frame by var_id and print its metrics.

    ``vn_df`` must provide 'var_id', 'pred_labels' and 'status' columns.
    Returns None; an empty input frame is a no-op (beyond constructing
    the TrainTest helper, which happens unconditionally).
    """
    tst = train.TrainTest(
        'x',
        '/mnt/xfs1/home/asalomatov/projects/variants/variants/ssc_wes_features_noINDEL_noDP.txt',
        ['status'],
        ['descr'])
    if vn_df.empty:
        return None
    # Keep only the first occurrence of each variant id.
    dedup = vn_df[~vn_df.var_id.duplicated()]
    predictions = numpy.array(dedup['pred_labels'].astype(int))
    tst.pred_y = predictions
    # Truth vector: zeros everywhere, ones where status == 'Y'.
    truth = predictions * 0
    truth[numpy.array(dedup.status.isin(['Y']))] = 1
    tst.test_set_y = truth
    print(msg)
    tst.getMetrics()
Beispiel #3
0
from sklearn.externals import joblib
# NOTE(review): sklearn.externals.joblib was removed in scikit-learn 0.23;
# newer environments need a plain `import joblib` — confirm the pinned version.
from keras.models import model_from_json

# Echo the raw command line so batch logs record how this run was invoked.
print(sys.argv)

# Positional command-line arguments (no argparse; order is fixed):
m = sys.argv[1]  # path to a joblib-pickled model bundle
lvl = int(sys.argv[2])  # label level forwarded to addLabels()
n_extra = int(sys.argv[3])  # number of extra negative examples to add
prob_threshold = float(sys.argv[4])  # probability cutoff (used below this view)
known_vars = sys.argv[5]  # path to the known-variants feature file
extra_vars = sys.argv[6]  # path to the extra-variants feature file
# '0'/'1' flag: int() first because bool('0') would be True.
is_keras = bool(int(sys.argv[7]))
# The pickle is a dict bundling the model with its metadata under string keys.
m_pkl = joblib.load(m)
list_of_features = m_pkl['features']

tst = train.TrainTest(known_vars, list_of_features, m_pkl['y_name'],
                      m_pkl['extra_col_names'])
# +
#                      ['DP_offspring', 'DP_father', 'DP_mother'])
tst.feature_list = list_of_features
if is_keras:
    tst.is_keras = True
tst.readDataSet()
tst.addLabels(level=lvl)

print('data_set shape is %s' % ' '.join(map(str, tst.data_set.shape)))
# Nothing to score: abort with a non-zero exit status and a message.
if tst.data_set.empty:
    sys.exit('data set is empty')

#n_extra = tst.data_set.shape[0] #roughly balanced classes
print('adding %s extra negative examples' % n_extra)
# NOTE(review): the body of this if (and the rest of the script) is beyond
# this excerpt.
if n_extra > 0:
Beispiel #4
0
# Empty typed accumulators, one per output column of the final report;
# presumably filled further down the loop (past this excerpt) — verify.
test_labels = numpy.array([], dtype=int)
pred_labels = numpy.array([], dtype=int)
test_var_id = numpy.array([], dtype=str)
test_alleles = numpy.array([], dtype=str)
pred_prob = numpy.array([], dtype=float)
dp_offspring = numpy.array([], dtype=int)
dp_father = numpy.array([], dtype=int)
dp_mother = numpy.array([], dtype=int)

# myped, child_id, list_of_features, m_pkl, is_keras and lvl are bound
# earlier in the file (outside this excerpt).
for i, row in myped.ped.iterrows():
    # Only score the one offspring of interest; skip other pedigree rows.
    if row['ind_id'] != child_id:
        continue
    print 'processing', i, row['ind_id']
    #myped.ped.test.iat[i]
    # One TrainTest per pedigree row; row['test'] holds the path to that
    # sample's feature file, and the three DP_* columns are carried along.
    tst = train.TrainTest(
        row['test'], list_of_features, m_pkl['y_name'],
        m_pkl['extra_col_names'] + ['DP_offspring', 'DP_father', 'DP_mother'])
    if is_keras:
        tst.is_keras = True
    tst.feature_list = list_of_features
    tst.readDataSet()
    print 'data_set shape is ', tst.data_set.shape
    # Nothing to score for this sample.
    if tst.data_set.empty:
        continue
    tst.addLabels(level=lvl)
    print 'data_set shape is ', tst.data_set.shape
    print tst.data_set.label.value_counts()
    # Rows without a label cannot be evaluated.
    tst.dropNA('label')
    print 'data_set shape with non null labels is ', tst.data_set.shape
    if tst.is_keras:
        # Keras models are stored as a JSON architecture string in the pickle;
        # the rest of this branch continues past this excerpt.
        tst.model = model_from_json(m_pkl['model'])
Beispiel #5
0
    # Tail of a list literal (its opening bracket is above this excerpt):
    # prediction CSV filenames, one per model/sample-size configuration.
    'LogReg_lvl9_stdFalse_cut0.5_splt0.99_500__0502_tstlvl9.csv',
    'LogReg_lvl9_stdFalse_cut0.5_splt0.99_1000__0502_tstlvl9.csv',
    'LogReg_lvl9_stdFalse_cut0.5_splt0.99_2000__0502_tstlvl9.csv',
    'LogReg_lvl9_stdFalse_cut0.5_splt0.99_5000__0502_tstlvl9.csv',
    'SVM_lvl9_stdFalse_cut0.5_splt0.99_300__0502_linear_1_balanced_tstlvl9.csv',
    'SVM_lvl9_stdFalse_cut0.5_splt0.99_500__0502_linear_1_balanced_tstlvl9.csv',
    'SVM_lvl9_stdFalse_cut0.5_splt0.99_1000__0502_linear_1_balanced_tstlvl9.csv',
    'SVM_lvl9_stdFalse_cut0.5_splt0.99_2000__0502_linear_1_balanced_tstlvl9.csv',
    'SVM_lvl9_stdFalse_cut0.5_splt0.99_5000__0502_linear_1_balanced_tstlvl9.csv'
]

# Pool every per-run prediction CSV into one frame and compute metrics per
# file. work_dirs is defined above this excerpt.
all_pred_df = pandas.DataFrame()
for d in work_dirs:
    for p in pred_list:
        tst = train.TrainTest(
            'x',
            '/mnt/xfs1/home/asalomatov/projects/variants/variants/ssc_wes_features_noINDEL_noDP.txt',
            ['status'], ['descr'])
        my_pred = os.path.join(d, p)
        # Missing result files are reported and skipped, not fatal.
        if not os.path.isfile(my_pred):
            print(my_pred)
            continue
        df = pandas.read_csv(my_pred)
        # Keep only the first occurrence of each variant.
        df = df[~df.test_var_id.duplicated()]
        all_pred_df = pandas.concat([all_pred_df, df])
        tst.test_set_y = df['test_labels']
        tst.pred_y = df['pred_labels']
        # tst.pred_y = (df['pred_prob'] > cut_off).astype(int)
        tst.pred_y_prob = df['pred_prob']
        tst.getMetrics()
        # Tag this run's metrics with the file they came from; the loop body
        # continues past this excerpt. (perf_mertics: attribute name [sic]
        # comes from the train module.)
        metr_df = tst.perf_mertics
        metr_df['method'] = my_pred
Beispiel #6
0
if __name__ == '__main__':
    # run PCA
    #    infile_ped = '/mnt/scratch/asalomatov/data/SSC/SSCped/SSC.ped'
    # Hard-coded SSC WES inputs: known/extra variant feature tables plus the
    # feature-name list.
    known_vars = '/mnt/scratch/asalomatov/data/SSC/wes/feature_sets_01/fb/known_SNP/fb_known_snp.tsv'
    extra_vars = '/mnt/scratch/asalomatov/data/SSC/wes/feature_sets_01/fb/all_SNP/fb_all_snp.tsv'
    list_of_features = '/mnt/xfs1/home/asalomatov/projects/variants/variants/ssc_wes_features_noINDEL_noDP.txt'
    #    myped = ped.Ped(infile_ped, ['collection'])
    #    myped.addTestFile(field='ind_id', file_pat='/mnt/scratch/asalomatov/data/SSC/wes/feature_sets_01/fb/all_SNP/%s')
    #    myped.ped.dropna(subset=['test'], inplace=True)
    #    myped.ped.reset_index(inplace=True)
    #    print myped.ped.shape

    # CLI: <n_extra negatives> <label level> <PCA dimensionality (2 or 3)>
    n_extra = int(sys.argv[1])
    lvl = int(sys.argv[2])
    dim = int(sys.argv[3])
    trn = train.TrainTest(known_vars, list_of_features, ['status'], ['descr'])
    trn.readFeatureList()
    trn.readDataSet()
    print 'data_set shape is ', trn.data_set.shape
    # Pad the data set with extra (presumably negative) examples.
    trn.readExtraVars(extra_vars, n_extra=n_extra)
    print 'data_set shape is ', trn.data_set.shape
    trn.addLabels(level=lvl)
    # Unlabelled rows cannot be plotted with a class colour.
    trn.dropNA('label')
    #    sys.exit(1)
    trn.data2Test()
    print 'test_set_X shape is', trn.test_set_X.shape
    # Dispatch on requested dimensionality; the else branch is truncated
    # beyond this excerpt.
    if dim == 2:
        plot2DimPCA(trn.test_set_X, trn.test_set_y)
    elif dim == 3:
        plot3DimPCA(trn.test_set_X, trn.test_set_y)
    else:
Beispiel #7
0
    
#    n_extra = int(sys.argv[1])
#    feature_set_dir = sys.argv[3]
#    infile_ped = sys.argv[5]
#    trshold = float(sys.argv[9])
#    smote_type = sys.argv[10]
#    model_dir = sys.argv[11]
#    known_vars = sys.argv[12]
#    myped = ped.Ped(infile_ped)
#    myped.addTestFile(field='ind_id', file_pat=os.path.join(feature_set_dir, '%s'))
#    myped.ped.dropna(subset=['test'], inplace=True)
#    myped.ped.reset_index(inplace=True)
#    print('ped shape:')
#    print(myped.ped.shape)
    # input_file, list_of_features, stdize, threshold, lvl, n_extra and
    # extra_vars are bound earlier in the enclosing scope (outside this view).
    trn = train.TrainTest(input_file,
                          list_of_features,
                          ['status'],
                          ['descr'])
#                          ['descr', 'callers'])
    trn.stdize = stdize
    trn.threshold = threshold
    trn.readFeatureList()
    trn.readDataSet()
    trn.addLabels(level=lvl)

    print('data_set shape is %s' % ' '.join(map(str, trn.data_set.shape)))

#    sys.exit('Stop for now')

    # Optionally pad the training set with extra examples before training.
    if n_extra > 0:
        trn.readExtraVars(extra_vars, n_extra=n_extra)
    # Drop rows whose label is missing; processing continues past this view.
    trn.dropNA('label')