def pklModel(self, output_dir='./'):
    """Serialize the model and its metadata to ``<output_dir>/<method>.pkl``.

    ``output_dir`` is created first with ``mkdir -p`` through
    ``func.runInShell``.  The dumped object is a plain dict that bundles
    the model with the attributes listed below; ``is_keras`` is always
    stored as 0 by this method.
    """
    func.runInShell('mkdir -p ' + output_dir)
    payload = dict(
        model=self.model,
        train_var_id=self.train_set_var_id,
        stdize=self.stdize,
        features=self.feature_list,
        feature_importance=self.feature_importance,
        y_name=self.y_name,
        extra_col_names=self.extra_column_names,
        method=self.method,
        threshold=self.threshold,
        metrics=self.perf_mertics,
        roc=self.roc,
        is_keras=0,
    )
    # The file is named after the method, so each method gets its own pickle.
    pkl_path = os.path.join(output_dir, self.method + '.pkl')
    joblib.dump(payload, pkl_path)
def pklModel(self, output_dir="./"):
    """Dump a description of the trained model with joblib.

    The output directory is created via ``mkdir -p`` (run through
    ``func.runInShell``) and the description is written to
    ``<output_dir>/<self.method>.pkl``.
    """
    func.runInShell("mkdir -p " + output_dir)

    # (key in the pickled dict, attribute on self), in the stored order.
    exported = (
        ("model", "model"),
        ("train_var_id", "train_set_var_id"),
        ("stdize", "stdize"),
        ("features", "feature_list"),
        ("feature_importance", "feature_importance"),
        ("y_name", "y_name"),
        ("extra_col_names", "extra_column_names"),
        ("method", "method"),
        ("threshold", "threshold"),
        ("metrics", "perf_mertics"),
        ("roc", "roc"),
    )
    model_descr = {}
    for key, attr_name in exported:
        model_descr[key] = getattr(self, attr_name)
    # This code path always records is_keras as 0.
    model_descr["is_keras"] = 0

    target = os.path.join(output_dir, self.method + ".pkl")
    joblib.dump(model_descr, target)
def readVcfToDF(self, sample_list=None, chunk_size=None):
    """Read the VCF at ``self.fname`` into pandas without parsing its fields.

    sample_list -- sample column names to keep in addition to
        ``self.required_fields``; if None, every column of the file is read.
        A name missing from the header raises ValueError.
    chunk_size -- forwarded to pandas as ``chunksize``.

    Side effects: sets ``self.vcf_header`` (the '##' meta lines),
    ``self.vcf_clmns`` (the columns actually read), ``self.vcf_sample_list``
    (those columns that are not required fields) and ``self.variants``
    (whatever ``pd.read_table`` returns).  Always returns 0.
    """
    # after github/hammerlab/varcode/vcf but keeping sample information
    path = self.fname
    cat, compression = self.catOrzcat(path)
    # Only the first 10000 lines are scanned for header information.
    cmd_header = ' '.join([cat, path, '| head -10000 | grep ^##'])
    cmd = ' '.join([cat, path, '| head -10000 | grep ^# | grep -v ^##'])
    self.vcf_header = func.runInShell(cmd_header, True)
    # The '#CHROM ...' line gives the column names; drop whitespace and '#'.
    header_line = func.runInShell(cmd, True)
    vcf_clmns = [x.strip().strip('#') for x in header_line.split('\t')]

    if sample_list is None:
        df_cols = vcf_clmns
        df_cols_ind = list(range(len(df_cols)))
    else:
        # Copy into a fresh list so self.required_fields is never mutated
        # (and so a tuple of required fields works too).
        df_cols = list(self.required_fields)
        # NOTE(review): assumes the required fields are the leading columns
        # of the file, in this order -- confirm against the class definition.
        # A real list is needed here: on Python 3 range() has no append().
        df_cols_ind = list(range(len(df_cols)))
        # Keep the requested samples in file order, not in request order.
        smp_indexes = sorted(vcf_clmns.index(smp) for smp in sample_list)
        for i in smp_indexes:
            df_cols.append(vcf_clmns[i])
            df_cols_ind.append(i)

    self.vcf_clmns = df_cols[:]
    self.vcf_sample_list = [i for i in self.vcf_clmns
                            if i not in self.required_fields]

    # Everything is read as text except POS.
    df_field_types = collections.OrderedDict()
    for i in df_cols:
        df_field_types[i] = str
    df_field_types['POS'] = int

    reader = pd.read_table(
        path,
        compression=compression,
        comment="#",
        chunksize=chunk_size,
        dtype=df_field_types,
        names=df_cols,
        usecols=df_cols_ind)
    self.variants = reader
    return 0
def readVcfHeader(self, vcf_file=None):
    """Return the '##' meta-information lines of a VCF file.

    vcf_file -- file to inspect; when None, ``self.fname`` is used.

    Only the first 10000 lines of the file are searched.  The value
    returned is whatever ``func.runInShell`` hands back for the command.
    """
    path = self.fname if vcf_file is None else vcf_file
    # catOrzcat also reports the compression, which is not needed here.
    cat, _compression = self.catOrzcat(path)
    header_cmd = ' '.join([cat, path, '| head -10000 | grep ^##'])
    return func.runInShell(header_cmd, True)
def vcfGetVEPannoClms(self, info_title='CSQ'):
    """Fetch the ``##INFO=<ID=<info_title>`` header line of ``self.fname``.

    That INFO line is where the names of the VEP annotation fields are
    declared.  The output of the shell command is returned as is, not
    split into field names.  The reader command and the full shell
    command are both printed before the command runs.
    """
    vcf_path = self.fname
    cat, _compression = self.catOrzcat(vcf_path)
    print(cat)
    # Only the first 10000 lines of the file are searched.
    grep_part = '''| head -10000 | grep "^##INFO=<ID=%s"''' % info_title
    cmd = ' '.join([cat, vcf_path, grep_part])
    print(cmd)
    return func.runInShell(cmd, True)
def getFieldFromVCF(row, ped_obj, field=6):
    """Look up one tab-separated field of a variant in an individual's VCF.

    row -- mapping with 'ind_id', 'CHROM' and 'POS' entries.
    ped_obj -- object whose ``getIndivVCF(ind_id)`` returns the VCF path.
    field -- zero-based index into the tab-split record (default 6).

    Returns the requested field, or None when ``func.runInShell`` returns
    an int instead of text (presumably an exit status -- confirm against
    func.runInShell).  Raises IndexError if the output has fewer than
    ``field + 1`` tab-separated fields.
    """
    ind_id = row['ind_id']
    vcf = ped_obj.getIndivVCF(ind_id)
    chrom = str(row['CHROM'])
    pos = str(row['POS'])
    region = ':'.join([chrom, pos])
    # bcftools restricts output to the region; the greps then drop header
    # lines and keep only lines that mention the position.
    cmd = ' '.join(['bcftools view', vcf, region,
                    '| grep -v ^# | grep ', pos])
    res = func.runInShell(cmd, return_output=1)
    # isinstance instead of an exact type comparison, so int subclasses are
    # also treated as "no text output".
    if isinstance(res, int):
        return None
    return res.split('\t')[field]
def readVcfToDF(self, sample_list=None, chunk_size=None):
    """Read the VCF at ``self.fname`` into pandas without parsing its fields.

    sample_list -- sample column names to keep in addition to
        ``self.required_fields``; if None, every column of the file is read.
        A name missing from the header raises ValueError.
    chunk_size -- forwarded to pandas as ``chunksize``.

    The result of ``pd.read_table`` is stored in ``self.variants``.
    Always returns 0.
    """
    # after github/hammerlab/varcode/vcf but keeping sample information
    path = self.fname
    # Choose the pandas codec and the matching shell reader together:
    # zcat cannot decode bzip2, so .bz2 files must go through bzcat.
    if path.endswith(".gz"):
        compression, cat = "gzip", "zcat"
    elif path.endswith(".bz2"):
        compression, cat = "bz2", "bzcat"
    else:
        compression, cat = None, "cat"

    # The '#CHROM ...' line (searched for in the first 10000 lines) gives
    # the column names; drop surrounding whitespace and the leading '#'.
    cmd = ' '.join([cat, path, '| head -10000 | grep ^# | grep -v ^##'])
    header_line = func.runInShell(cmd, True)
    vcf_clmns = [x.strip().strip('#') for x in header_line.split('\t')]

    if sample_list is None:
        df_cols = vcf_clmns
        df_cols_ind = list(range(len(df_cols)))
    else:
        # Work on a copy: appending sample names to self.required_fields
        # itself would make it grow on every call.
        df_cols = list(self.required_fields)
        # NOTE(review): assumes the required fields are the leading columns
        # of the file, in this order -- confirm against the class definition.
        # A real list is needed here: on Python 3 range() has no append().
        df_cols_ind = list(range(len(df_cols)))
        # Keep the requested samples in file order, not in request order.
        smp_indexes = sorted(vcf_clmns.index(smp) for smp in sample_list)
        for i in smp_indexes:
            df_cols.append(vcf_clmns[i])
            df_cols_ind.append(i)

    # Everything is read as text except POS.
    df_field_types = collections.OrderedDict()
    for i in df_cols:
        df_field_types[i] = str
    df_field_types['POS'] = int

    reader = pd.read_table(
        path,
        compression=compression,
        comment="#",
        chunksize=chunk_size,
        dtype=df_field_types,
        names=df_cols,
        usecols=df_cols_ind)
    self.variants = reader
    return 0
# Command line: argv[2] selects the variant caller, argv[3] is the output root.
caller, output_directory = sys.argv[2], sys.argv[3]
min_DP = 7
var_type = 'SNP'

# Per-caller VCF path pattern; '%s' is left in place for later substitution.
vcf_pat = ''
if caller not in ('hc', 'fb'):
    sys.exit('unknown caller, exiting... ')
if caller == 'hc':
    vcf_pat = '/mnt/scratch/asalomatov/data/SSC/wes/vcf/hc/%s.family.vqsr.sorted-norm.vcf'
else:
    vcf_pat = '/mnt/scratch/asalomatov/data/SSC/wes/vcf/fb/%s.family.freebayes.sorted-norm.vcf'

# Output layout: <output_directory>/<caller>/{all,known}_<var_type>
output_dir = os.path.join(output_directory, caller)
all_dir = output_dir + '/all_' + var_type
known_dir = output_dir + '/known_' + var_type
func.runInShell('mkdir -p ' + all_dir)
func.runInShell('mkdir -p ' + known_dir)


def multi_wrap_readBamReadcount(args):
    """Call func.readBamReadcount with its arguments packed in one tuple."""
    return func.readBamReadcount(*args)


### SSC ped
infile_ped = '/mnt/scratch/asalomatov/data/SSC/SSCped/SSC.ped'
myped = ped.Ped(infile_ped, ['collection'])
myped.addVcf(file_pat=vcf_pat)
def removeTmpDir(self):
    """Recursively delete the directory that holds ``self.sample_features``.

    The directory name is written to stdout first.  Removal is best effort,
    like ``rm -rf``: a directory that is already gone is not an error.
    Nothing is removed when ``self.sample_features`` has no directory part.
    """
    # Local import so the block does not depend on a file-level import.
    import shutil

    tmpdir = os.path.dirname(self.sample_features)
    sys.stdout.write('removing ' + tmpdir)
    if not tmpdir:
        return
    # Remove through the filesystem API instead of a shell string: with
    # 'rm -rf ' + tmpdir, a path containing spaces or glob characters
    # would delete something other than the intended directory.
    if os.path.islink(tmpdir):
        # rm -rf removes the link itself, never the link's target.
        os.unlink(tmpdir)
    else:
        shutil.rmtree(tmpdir, ignore_errors=True)
# os.path.join(outp_dir, 'syn' + outp_suffix + '.csv'), index=False) # vn_diff[cols_to_output[:-2] + ['step']].to_csv( # os.path.join(outp_dir, 'lostTP' + outp_suffix + '.csv'), index=False) cfg['predictions_file'] = infile with open(os.path.join(outp_dir, 'cfg' + outp_suffix + '.yml'), 'w') as f: yaml.dump(cfg, f, default_flow_style=False) return vn[c_all_denovo] if __name__ == '__main__': infile = sys.argv[1] pref = sys.argv[2] outp_dir = sys.argv[3] config_file = sys.argv[4] func.runInShell('mkdir -p ' + outp_dir) summarizeMutations(infile, pref, outp_dir, config_file) # summarizeMutations(infile, # prefix, # outp_dir, # config_file, # exac_anno='/mnt/scratch/asalomatov/data/ExAC/fordist_cleaned_exac_r03_march16_z_pli_rec_null_data.txt'):
print trio_id sample_bam = myped.getIndivBAM(smpl_id) father_bam = myped.getFaBam(trio_id) mother_bam = myped.getMoBam(trio_id) chr_pos = ':'.join([row['CHROM'], '-'.join([str(int(row['POS']) - 25), str(int(row['POS']) + 25)])]) sample_snapshot_name = '_'.join([row['SP_id'], row['lab_id'], row['CHROM'], row['POS'], row['GENE']]) + '.png' print sample_bam, father_bam, mother_bam, chr_pos, sample_snapshot_name snapshot_list.append(tmpl3 % locals()) igv_scr = '\n'.join(snapshot_list) func.runInShell('mkdir -p ' + output_dir) input_file_bn = os.path.basename(input_file) script_out = os.path.join(output_dir, input_file_bn + '.igv') with open(script_out, 'w') as f: f.write(igv_scr) cmd = ' '.join([args.igv, '-b', script_out]) print cmd func.runInShell(cmd) sys.exit(1)
# vn_syn[cols_to_output[:-2]][c_new].to_csv( # os.path.join(outp_dir, 'syn' + outp_suffix + '.csv'), index=False) # vn_diff[cols_to_output[:-2] + ['step']].to_csv( # os.path.join(outp_dir, 'lostTP' + outp_suffix + '.csv'), index=False) cfg['predictions_file'] = infile with open(os.path.join(outp_dir,'cfg' + outp_suffix + '.yml'), 'w') as f: yaml.dump(cfg, f, default_flow_style=False) if __name__ == '__main__': infile = sys.argv[1] outp_dir = sys.argv[2] config_file = sys.argv[3] func.runInShell('mkdir -p ' + outp_dir) summarizeMutations(infile, outp_dir, config_file) #cfg = {'population_AF': 0.01, # 'snpeff': {'effect_lof': ['exon_loss_variant', # 'frameshift_variant', # 'stop_gained', # 'stop_lost', # 'start_lost', # 'splice_acceptor_variant'
# create temp vcf file input_file_bn = os.path.splitext( os.path.basename(f.sample_vcf))[0] sample_vcf = os.path.join(tmp_dir, f.sample_id + '.vcf') # outp_tsv = os.path.join(tmp_dir, f.sample_id + '.tsv') # add header lines that are used for de novo analysis additional_header = vrs.readVcfHeader( os.path.join(os.path.dirname(script_name), 'header_extra.txt')) both_head = vrs.vcf_header.split('\n') +\ additional_header.split('\n') both_head = [i for i in both_head if i != ''] vrs.vcf_header = '\n'.join(both_head) + '\n' vrs.saveAsVcf(sample_vcf) # annotate with VEP and snpEff logging.info('preparing to run %s' % script_name) cmd = ' '.join([script_name, sample_vcf, os.path.dirname(script_name), child_id, targ_bed, incl_make]) logging.info('Executing \n %s' % cmd) res = func.runInShell(cmd, True)
res.to_csv(os.path.join(output_dir, m_name + '.csv'), index=False) res['var_id'] = res['test_var_id'] res_u = res[~res.var_id.duplicated()] res_u.reset_index(inplace=True) res_u.ix[:, 'pred_labels'] = (res_u['pred_prob'] > prob_cutoff).astype(int) #res_u = res_u[res_u.pred_labels == 1] #outp_tsv = os.path.join(output_dir, m_name + '.tsv') outp_tsv = os.path.join(output_dir, child_id + '.tsv') func.writePredAsVcf(res_u, outp_tsv, min_DP=min_DP) script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) cmd = ' '.join([os.path.join(script_dir, 'vcf2table.sh'), outp_tsv, script_dir, child_id]) print(cmd) func.runInShell(cmd) summarizeVariants.summarizeMutations(os.path.join(output_dir, child_id + '-ann-onePline.tsv'), os.path.join(output_dir, 'denovo'), config_file) #cmd = ' '.join([os.path.join(script_dir, 'work', 'summarizeMutations.py'), # os.path.join(output_dir, child_id + '-ann-onePline.tsv'), # os.path.join(output_dir, 'denovo'), # config_file]) #func.runInShell(cmd) # work/summarizeMutations.py /mnt/xfs1/home/asalomatov/projects/spark/feature_sets/hc/trio003.p1_642940-ann-onePline.tsv /mnt/xfs1/home/asalomatov/projects/spark/feature_sets/hc/denovo cfg_spark.yml
m_name = '.'.join(m_name.split('.')[:-1]) + '_tstlvl' + str(lvl) res['method'] = m_name res = res[~res.test_var_alleles.str.contains('nan')] res.to_csv(os.path.join(output_dir, m_name + '.csv'), index=False) res['var_id'] = res['test_var_id'] res_u = res[~res.var_id.duplicated()] res_u.reset_index(inplace=True) res_u.ix[:, 'pred_labels'] = (res_u['pred_prob'] > prob_cutoff).astype(int) #res_u = res_u[res_u.pred_labels == 1] #outp_tsv = os.path.join(output_dir, m_name + '.tsv') outp_tsv = os.path.join(output_dir, child_id + '.tsv') func.writePredAsVcf(res_u, outp_tsv, min_DP=min_DP) script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) cmd = ' '.join( [os.path.join(script_dir, 'vcf2table.sh'), outp_tsv, script_dir, child_id]) print(cmd) func.runInShell(cmd) summarizeVariants.summarizeMutations( os.path.join(output_dir, child_id + '-ann-onePline.tsv'), os.path.join(output_dir, 'denovo'), config_file) #cmd = ' '.join([os.path.join(script_dir, 'work', 'summarizeMutations.py'), # os.path.join(output_dir, child_id + '-ann-onePline.tsv'), # os.path.join(output_dir, 'denovo'), # config_file]) #func.runInShell(cmd) # work/summarizeMutations.py /mnt/xfs1/home/asalomatov/projects/spark/feature_sets/hc/trio003.p1_642940-ann-onePline.tsv /mnt/xfs1/home/asalomatov/projects/spark/feature_sets/hc/denovo cfg_spark.yml
# Unpack the run configuration into the names used by the rest of the script.
min_DP, var_type = cfg['min_DP'], cfg['variant_type']
vcf_pat, bam_pat = cfg['vcf_pattern'], cfg['bam_pattern']
# The BAM index pattern is the BAM pattern with '.bai' appended.
bai_pat = bam_pat + '.bai'
ped_file, ped_file_extended = cfg['ped_file'], cfg['ped_file_extended']
bam_readcount, genome_ref = cfg['bam_readcount'], cfg['genome_ref']
known_vars, output_dir = cfg['known_variants'], cfg['output_directory']

# create output dirs; the 'known' subdirectory only when known variants are set
func.runInShell('mkdir -p ' + output_dir)
if known_vars:
    output_dir_known = os.path.join(output_dir, 'known')
    func.runInShell('mkdir -p ' + output_dir_known)


# wrap a function for use with multiprocessing
def multi_wrap_readBamReadcount(args):
    """Call func.readBamReadcount with its arguments packed in one tuple."""
    return func.readBamReadcount(*args)


# populate ped DF; rows without a 'vcf' value are dropped before BAMs are added
myped = ped.Ped(ped_file)
myped.addVcf(file_pat=vcf_pat)
myped.ped.dropna(subset=['vcf'], inplace=True)
myped.addBam(file_pat=bam_pat)