import numpy
import pandas

import train  # project module providing TrainTest


def calcMetr(vn_df, msg=' '):
    """Compute performance metrics for a DataFrame of labeled predictions."""
    tst = train.TrainTest(
        'x',
        '/mnt/xfs1/home/asalomatov/projects/variants/variants/ssc_wes_features_noINDEL_noDP.txt',
        ['status'],
        ['descr'])
    if vn_df.empty:
        return None
    # keep one record per variant
    vn_d = vn_df[~vn_df.var_id.duplicated()]
    tst.pred_y = numpy.array(vn_d['pred_labels'].astype(int))
    # true labels: 1 where the variant is a known ('Y') call, 0 otherwise
    tst.test_set_y = tst.pred_y * 0
    tst.test_set_y[numpy.array(vn_d.status.isin(['Y']))] = 1
    print(pandas.Series(tst.test_set_y).value_counts())
    print(msg)
    tst.getMetrics()
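# Usage sketch for calcMetr (the toy DataFrame below is hypothetical; in the
# pipeline the input comes from the per-sample prediction tables). Only the
# columns var_id, pred_labels and status are needed, with status == 'Y'
# marking variants confirmed in the known set.
toy_preds = pandas.DataFrame(
    {'var_id': ['1_1000_A_T', '1_2000_G_C', '2_3000_C_T'],
     'pred_labels': [1, 0, 1],
     'status': ['Y', 'N', 'N']})
calcMetr(toy_preds, msg='toy example')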
import sys

import train  # project module providing TrainTest
from sklearn.externals import joblib
from keras.models import model_from_json

print(sys.argv)
m = sys.argv[1]                     # path to the pickled model bundle
lvl = int(sys.argv[2])              # label level passed to addLabels()
n_extra = int(sys.argv[3])          # number of extra negative examples
prob_threshold = float(sys.argv[4])
known_vars = sys.argv[5]
extra_vars = sys.argv[6]
is_keras = bool(int(sys.argv[7]))

# bundle keys used below: 'features', 'y_name', 'extra_col_names', 'model';
# a hypothetical dump sketch follows the prediction loop
m_pkl = joblib.load(m)
list_of_features = m_pkl['features']
tst = train.TrainTest(known_vars,
                      list_of_features,
                      m_pkl['y_name'],
                      m_pkl['extra_col_names'])
#                     + ['DP_offspring', 'DP_father', 'DP_mother'])
tst.feature_list = list_of_features
if is_keras:
    tst.is_keras = True
tst.readDataSet()
tst.addLabels(level=lvl)
print('data_set shape is %s' % ' '.join(map(str, tst.data_set.shape)))
if tst.data_set.empty:
    sys.exit('data set is empty')
# n_extra = tst.data_set.shape[0]  # roughly balanced classes
print('adding %s extra negative examples' % n_extra)
if n_extra > 0:
# per-variant accumulators, filled while looping over the pedigree
test_labels = numpy.array([], dtype=int)
pred_labels = numpy.array([], dtype=int)
test_var_id = numpy.array([], dtype=str)
test_alleles = numpy.array([], dtype=str)
pred_prob = numpy.array([], dtype=float)
dp_offspring = numpy.array([], dtype=int)
dp_father = numpy.array([], dtype=int)
dp_mother = numpy.array([], dtype=int)

for i, row in myped.ped.iterrows():
    if row['ind_id'] != child_id:
        continue
    print('processing', i, row['ind_id'])
    # myped.ped.test.iat[i]
    tst = train.TrainTest(
        row['test'],
        list_of_features,
        m_pkl['y_name'],
        m_pkl['extra_col_names'] + ['DP_offspring', 'DP_father', 'DP_mother'])
    if is_keras:
        tst.is_keras = True
    tst.feature_list = list_of_features
    tst.readDataSet()
    print('data_set shape is', tst.data_set.shape)
    if tst.data_set.empty:
        continue
    tst.addLabels(level=lvl)
    print('data_set shape is', tst.data_set.shape)
    print(tst.data_set.label.value_counts())
    tst.dropNA('label')
    print('data_set shape with non null labels is', tst.data_set.shape)
    if tst.is_keras:
        tst.model = model_from_json(m_pkl['model'])
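# Sketch of the model bundle format assumed above (an illustration, not the
# project's actual training code): joblib.load(m) must return a dict with the
# keys used in this script. For a keras model, 'model' holds the architecture
# string from model.to_json(); weights would have to be stored and restored
# separately. The toy model and feature names below are placeholders; the
# real values would come from the training run (joblib is imported above).
from keras.models import Sequential
from keras.layers import Dense

toy_model = Sequential([Dense(8, input_dim=4, activation='relu'),
                        Dense(1, activation='sigmoid')])
bundle = {'features': ['feat_%d' % k for k in range(4)],  # placeholder names
          'y_name': ['status'],
          'extra_col_names': ['descr'],
          'model': toy_model.to_json()}
joblib.dump(bundle, 'model_bundle.pkl')  # hypothetical output path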
    'LogReg_lvl9_stdFalse_cut0.5_splt0.99_500__0502_tstlvl9.csv',
    'LogReg_lvl9_stdFalse_cut0.5_splt0.99_1000__0502_tstlvl9.csv',
    'LogReg_lvl9_stdFalse_cut0.5_splt0.99_2000__0502_tstlvl9.csv',
    'LogReg_lvl9_stdFalse_cut0.5_splt0.99_5000__0502_tstlvl9.csv',
    'SVM_lvl9_stdFalse_cut0.5_splt0.99_300__0502_linear_1_balanced_tstlvl9.csv',
    'SVM_lvl9_stdFalse_cut0.5_splt0.99_500__0502_linear_1_balanced_tstlvl9.csv',
    'SVM_lvl9_stdFalse_cut0.5_splt0.99_1000__0502_linear_1_balanced_tstlvl9.csv',
    'SVM_lvl9_stdFalse_cut0.5_splt0.99_2000__0502_linear_1_balanced_tstlvl9.csv',
    'SVM_lvl9_stdFalse_cut0.5_splt0.99_5000__0502_linear_1_balanced_tstlvl9.csv'
]

all_pred_df = pandas.DataFrame()
for d in work_dirs:
    for p in pred_list:
        tst = train.TrainTest(
            'x',
            '/mnt/xfs1/home/asalomatov/projects/variants/variants/ssc_wes_features_noINDEL_noDP.txt',
            ['status'],
            ['descr'])
        my_pred = os.path.join(d, p)
        if not os.path.isfile(my_pred):
            print(my_pred)
            continue
        df = pandas.read_csv(my_pred)
        df = df[~df.test_var_id.duplicated()]
        all_pred_df = pandas.concat([all_pred_df, df])
        tst.test_set_y = df['test_labels']
        tst.pred_y = df['pred_labels']
        # tst.pred_y = (df['pred_prob'] > cut_off).astype(int)
        tst.pred_y_prob = df['pred_prob']
        tst.getMetrics()
        metr_df = tst.perf_mertics
        metr_df['method'] = my_pred
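        # Possible continuation (a hedged sketch; the accumulation step lies
        # outside this excerpt): collect each metr_df into one summary frame,
        # e.g.
        #     all_metr_df = pandas.concat([all_metr_df, metr_df])
        # then write all_metr_df to CSV once both loops finish, so the LogReg
        # and SVM runs can be compared side by side.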
if __name__ == '__main__':
    # run PCA
    # infile_ped = '/mnt/scratch/asalomatov/data/SSC/SSCped/SSC.ped'
    known_vars = '/mnt/scratch/asalomatov/data/SSC/wes/feature_sets_01/fb/known_SNP/fb_known_snp.tsv'
    extra_vars = '/mnt/scratch/asalomatov/data/SSC/wes/feature_sets_01/fb/all_SNP/fb_all_snp.tsv'
    list_of_features = '/mnt/xfs1/home/asalomatov/projects/variants/variants/ssc_wes_features_noINDEL_noDP.txt'
    # myped = ped.Ped(infile_ped, ['collection'])
    # myped.addTestFile(field='ind_id',
    #                   file_pat='/mnt/scratch/asalomatov/data/SSC/wes/feature_sets_01/fb/all_SNP/%s')
    # myped.ped.dropna(subset=['test'], inplace=True)
    # myped.ped.reset_index(inplace=True)
    # print myped.ped.shape
    n_extra = int(sys.argv[1])
    lvl = int(sys.argv[2])
    dim = int(sys.argv[3])  # number of PCA components to plot (2 or 3)
    trn = train.TrainTest(known_vars, list_of_features, ['status'], ['descr'])
    trn.readFeatureList()
    trn.readDataSet()
    print('data_set shape is', trn.data_set.shape)
    trn.readExtraVars(extra_vars, n_extra=n_extra)
    print('data_set shape is', trn.data_set.shape)
    trn.addLabels(level=lvl)
    trn.dropNA('label')
    # sys.exit(1)
    trn.data2Test()
    print('test_set_X shape is', trn.test_set_X.shape)
    # (a hypothetical 2-D PCA plotting sketch appears after the last snippet below)
    if dim == 2:
        plot2DimPCA(trn.test_set_X, trn.test_set_y)
    elif dim == 3:
        plot3DimPCA(trn.test_set_X, trn.test_set_y)
    else:
# n_extra = int(sys.argv[1])
# feature_set_dir = sys.argv[3]
# infile_ped = sys.argv[5]
# trshold = float(sys.argv[9])
# smote_type = sys.argv[10]
# model_dir = sys.argv[11]
# known_vars = sys.argv[12]
# myped = ped.Ped(infile_ped)
# myped.addTestFile(field='ind_id',
#                   file_pat=os.path.join(feature_set_dir, '%s'))
# myped.ped.dropna(subset=['test'], inplace=True)
# myped.ped.reset_index(inplace=True)
# print('ped shape:')
# print(myped.ped.shape)
trn = train.TrainTest(input_file, list_of_features, ['status'], ['descr'])
#                     ['descr', 'callers'])
trn.stdize = stdize
trn.threshold = threshold
trn.readFeatureList()
trn.readDataSet()
trn.addLabels(level=lvl)
print('data_set shape is %s' % ' '.join(map(str, trn.data_set.shape)))
# sys.exit('Stop for now')
if n_extra > 0:
    trn.readExtraVars(extra_vars, n_extra=n_extra)
trn.dropNA('label')
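# Hypothetical sketch of a 2-D PCA plotting helper in the spirit of
# plot2DimPCA (called in the PCA driver above); an illustration under assumed
# names, not the project's actual implementation. X is a numeric feature
# matrix, y a 0/1 label vector.
import numpy
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale


def plot_pca_2d(X, y, outfile='pca_2d.png'):
    """Project X onto its first two principal components, colored by label."""
    y = numpy.asarray(y).astype(int)
    pcs = PCA(n_components=2).fit_transform(scale(X))
    plt.figure()
    plt.scatter(pcs[y == 0, 0], pcs[y == 0, 1], s=8, c='grey', label='negative')
    plt.scatter(pcs[y == 1, 0], pcs[y == 1, 1], s=8, c='red', label='positive')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.legend()
    plt.savefig(outfile)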