コード例 #1
0
ファイル: train.py プロジェクト: asalomatov/variants
 def pklModel(self, output_dir='./'):
     """Dump ``self.model`` and its descriptive attributes with joblib.

     The target directory is created first (``mkdir -p`` through
     ``func.runInShell``); the description dict is then written to
     ``<output_dir>/<self.method>.pkl``.
     """
     func.runInShell('mkdir -p ' + output_dir)
     model_descr = dict(
         model=self.model,
         train_var_id=self.train_set_var_id,
         stdize=self.stdize,
         features=self.feature_list,
         feature_importance=self.feature_importance,
         y_name=self.y_name,
         extra_col_names=self.extra_column_names,
         method=self.method,
         threshold=self.threshold,
         # the attribute really is spelled ``perf_mertics`` on the instance
         metrics=self.perf_mertics,
         roc=self.roc,
         is_keras=0,
     )
     pkl_path = os.path.join(output_dir, self.method + '.pkl')
     joblib.dump(model_descr, pkl_path)
コード例 #2
0
ファイル: train.py プロジェクト: asalomatov/variants
 def pklModel(self, output_dir="./"):
     """Write the model plus its metadata to ``<output_dir>/<method>.pkl``.

     ``output_dir`` is created via ``mkdir -p`` before ``joblib.dump`` is
     called on the assembled description dict.
     """
     func.runInShell("mkdir -p " + output_dir)
     described = [
         ("model", self.model),
         ("train_var_id", self.train_set_var_id),
         ("stdize", self.stdize),
         ("features", self.feature_list),
         ("feature_importance", self.feature_importance),
         ("y_name", self.y_name),
         ("extra_col_names", self.extra_column_names),
         ("method", self.method),
         ("threshold", self.threshold),
         # instance attribute is named ``perf_mertics`` (sic)
         ("metrics", self.perf_mertics),
         ("roc", self.roc),
         ("is_keras", 0),
     ]
     model_descr = dict(described)
     target = os.path.join(output_dir, self.method + ".pkl")
     joblib.dump(model_descr, target)
コード例 #3
0
ファイル: variants.py プロジェクト: asalomatov/variants
 def readVcfToDF(self, sample_list=None, chunk_size=None):
     """Read the VCF at ``self.fname`` into pandas without parsing fields.

     The ``##`` meta lines found in the first 10000 lines are stored in
     ``self.vcf_header``; the ``#CHROM ...`` line supplies the column names.
     Every column is read as ``str`` except ``POS`` (``int``).

     Args:
         sample_list: sample column names to keep in addition to
             ``self.required_fields``. ``None`` reads every column.
             A name missing from the VCF header raises ``ValueError``.
         chunk_size: forwarded to pandas as ``chunksize``.

     Side effects:
         Sets ``self.vcf_header``, ``self.vcf_clmns``,
         ``self.vcf_sample_list`` and ``self.variants`` (the object returned
         by ``pd.read_table``).

     Returns:
         Always 0.
     """
     # after github/hammerlab/varcode/vcf but keeping sample information
     path = self.fname
     cat, compression = self.catOrzcat(path)
     cmd_header = ' '.join([cat, path, '| head -10000 | grep ^##'])
     cmd = ' '.join([cat, path, '| head -10000 | grep ^# | grep -v ^##'])
     self.vcf_header = func.runInShell(cmd_header, True)
     vcf_clmns = [x.strip().strip('#')
                  for x in func.runInShell(cmd, True).split('\t')]
     if sample_list is None:
         df_cols = vcf_clmns
         df_cols_ind = list(range(len(df_cols)))
     else:
         df_cols = self.required_fields[:]
         # list(...) is required: a Python 3 ``range`` cannot be extended
         df_cols_ind = list(range(len(df_cols)))
         # keep the requested samples in file order
         smp_indexes = sorted(vcf_clmns.index(smp) for smp in sample_list)
         df_cols.extend(vcf_clmns[i] for i in smp_indexes)
         df_cols_ind.extend(smp_indexes)
     self.vcf_clmns = df_cols[:]
     self.vcf_sample_list = [i for i in self.vcf_clmns
                             if i not in self.required_fields]
     df_field_types = collections.OrderedDict((c, str) for c in df_cols)
     df_field_types['POS'] = int
     reader = pd.read_table(
         path,
         compression=compression,
         comment="#",
         chunksize=chunk_size,
         dtype=df_field_types,
         names=df_cols,
         usecols=df_cols_ind)
     self.variants = reader
     return 0
コード例 #4
0
ファイル: variants.py プロジェクト: asalomatov/variants
 def readVcfHeader(self, vcf_file=None):
     """Grep the ``##`` lines out of the first 10000 lines of a VCF.

     ``vcf_file`` defaults to ``self.fname``. The value handed back is
     whatever ``func.runInShell(cmd, True)`` returns (presumably the
     command output -- confirm against ``func``).
     """
     path = self.fname if vcf_file is None else vcf_file
     cat, _ = self.catOrzcat(path)
     return func.runInShell(
         ' '.join([cat, path, '| head -10000 | grep ^##']), True)
コード例 #5
0
ファイル: variants.py プロジェクト: asalomatov/variants
 def vcfGetVEPannoClms(self, info_title='CSQ'):
     """Grep the ``##INFO=<ID=<info_title>`` header line of ``self.fname``.

     Only the first 10000 lines of the file are searched. The reader
     command and the full shell command are printed before execution, and
     the result of ``func.runInShell(cmd, True)`` is returned unmodified.
     """
     path = self.fname
     cat, _ = self.catOrzcat(path)
     print(cat)
     grep_part = '| head -10000 | grep "^##INFO=<ID=%s"' % info_title
     cmd = ' '.join([cat, path, grep_part])
     print(cmd)
     return func.runInShell(cmd, True)
コード例 #6
0
ファイル: variants.py プロジェクト: asalomatov/variants
 def getFieldFromVCF(row, ped_obj, field=6):
     """Return one tab-separated field of the VCF record at a row's position.

     The individual's VCF (``ped_obj.getIndivVCF(row['ind_id'])``) is
     queried with ``bcftools view <vcf> <CHROM>:<POS>``; header lines are
     dropped and the output is filtered with ``grep <POS>``.

     Args:
         row: mapping with ``ind_id``, ``CHROM`` and ``POS`` entries.
         ped_obj: object providing ``getIndivVCF(ind_id)``.
         field: index into the tab-split command output (default 6).

     Returns:
         The selected field as a string, or ``None`` when
         ``func.runInShell`` returns an integer instead of text
         (presumably a failure/exit code -- confirm against ``func``).
     """
     vcf = ped_obj.getIndivVCF(row['ind_id'])
     chrom = str(row['CHROM'])
     pos = str(row['POS'])
     region = ':'.join([chrom, pos])
     cmd = ' '.join(['bcftools view', vcf, region,
                     '| grep -v ^# | grep ', pos])
     res = func.runInShell(cmd, return_output=1)
     if isinstance(res, int):
         return None
     return res.split('\t')[field]
コード例 #7
0
ファイル: variants.py プロジェクト: asalomatov/variants
 def getFieldFromVCF(row, ped_obj, field=6):
     """Look up a single column of the VCF line matching ``row``'s position.

     Runs ``bcftools view`` on the VCF returned by
     ``ped_obj.getIndivVCF(row['ind_id'])`` restricted to
     ``<CHROM>:<POS>``, strips header lines and greps for ``<POS>``.

     Args:
         row: mapping that provides ``ind_id``, ``CHROM`` and ``POS``.
         ped_obj: object exposing ``getIndivVCF(ind_id)``.
         field: position of the wanted value in the tab-split output.

     Returns:
         The requested field, or ``None`` if ``func.runInShell`` handed back
         an integer rather than output text (presumably an error status --
         confirm against ``func``).
     """
     ind_id = row['ind_id']
     vcf = ped_obj.getIndivVCF(ind_id)
     chrom, pos = str(row['CHROM']), str(row['POS'])
     cmd = ' '.join(['bcftools view',
                     vcf,
                     chrom + ':' + pos,
                     '| grep -v ^# | grep ',
                     pos])
     res = func.runInShell(cmd, return_output=1)
     # isinstance instead of an exact type comparison
     if isinstance(res, int):
         return None
     columns = res.split('\t')
     return columns[field]
コード例 #8
0
ファイル: variants.py プロジェクト: asalomatov/variants
 def readVcfToDF(self, sample_list=None, chunk_size=None):
     """Read the VCF at ``self.fname`` into pandas without parsing fields.

     Compression is inferred from the file extension (``.gz`` -> gzip,
     ``.bz2`` -> bz2). Column names come from the ``#CHROM ...`` line found
     in the first 10000 lines. Every column is read as ``str`` except
     ``POS`` (``int``).

     Args:
         sample_list: sample column names to keep in addition to
             ``self.required_fields``. ``None`` reads every column.
             A name missing from the VCF header raises ``ValueError``.
         chunk_size: forwarded to pandas as ``chunksize``.

     Side effects:
         Sets ``self.variants`` to the object returned by
         ``pd.read_table``.

     Returns:
         Always 0.
     """
     # after github/hammerlab/varcode/vcf but keeping sample information
     path = self.fname
     compression = None
     cat = 'cat'
     if path.endswith(".gz"):
         compression = "gzip"
         cat = 'zcat'
     elif path.endswith(".bz2"):
         compression = "bz2"
         # zcat cannot decode bzip2 streams; bzcat is the matching reader
         cat = 'bzcat'
     cmd = ' '.join([cat, path, '| head -10000 | grep ^# | grep -v ^##'])
     vcf_clmns = [x.strip().strip('#')
                  for x in func.runInShell(cmd, True).split('\t')]
     if sample_list is None:
         df_cols = vcf_clmns
         df_cols_ind = list(range(len(df_cols)))
     else:
         # copy: appending to the attribute itself would grow
         # self.required_fields on every call
         df_cols = list(self.required_fields)
         # list(...) is required: a Python 3 ``range`` cannot be extended
         df_cols_ind = list(range(len(df_cols)))
         # keep the requested samples in file order
         smp_indexes = sorted(vcf_clmns.index(smp) for smp in sample_list)
         df_cols.extend(vcf_clmns[i] for i in smp_indexes)
         df_cols_ind.extend(smp_indexes)
     df_field_types = collections.OrderedDict((c, str) for c in df_cols)
     df_field_types['POS'] = int
     reader = pd.read_table(
         path,
         compression=compression,
         comment="#",
         chunksize=chunk_size,
         dtype=df_field_types,
         names=df_cols,
         usecols=df_cols_ind)
     self.variants = reader
     return 0
コード例 #9
0
ファイル: prepareRegions.py プロジェクト: asalomatov/variants
# command-line inputs: variant caller tag and base output directory
caller = sys.argv[2]
output_directory = sys.argv[3]
min_DP = 7
var_type = 'SNP'

# per-caller VCF path template; anything other than 'hc'/'fb' aborts
vcf_pat = {
    'hc': '/mnt/scratch/asalomatov/data/SSC/wes/vcf/hc/%s.family.vqsr.sorted-norm.vcf',
    'fb': '/mnt/scratch/asalomatov/data/SSC/wes/vcf/fb/%s.family.freebayes.sorted-norm.vcf',
}.get(caller, '')
if not vcf_pat:
    sys.exit('unknown caller, exiting... ')

# <output_directory>/<caller>/{all,known}_<var_type>
output_dir = os.path.join(output_directory, caller)
all_dir = '/'.join([output_dir, 'all_' + var_type])
known_dir = '/'.join([output_dir, 'known_' + var_type])
func.runInShell('mkdir -p ' + all_dir)
func.runInShell('mkdir -p ' + known_dir)

def multi_wrap_readBamReadcount(args):
    """Call ``func.readBamReadcount`` with the items of ``args`` unpacked.

    Accepts the whole argument sequence as a single parameter and returns
    whatever ``func.readBamReadcount`` returns.
    """
    result = func.readBamReadcount(*args)
    return result



### SSC ped
#ped = reload(ped)
#func = reload(func)
#features = reload(features)

# hard-coded location of the SSC pedigree file
infile_ped = '/mnt/scratch/asalomatov/data/SSC/SSCped/SSC.ped'
# build the pedigree object; the second argument is passed through to
# ped.Ped as-is (its meaning is defined in the ped module)
myped = ped.Ped(infile_ped, ['collection'])
# register VCF paths using the caller-specific pattern selected above
myped.addVcf(file_pat=vcf_pat)
コード例 #10
0
ファイル: features.py プロジェクト: asalomatov/variants
 def removeTmpDir(self):
     """Delete the directory that contains ``self.sample_features``.

     The directory tree is removed with ``shutil.rmtree`` instead of a
     shell ``rm -rf`` string, so paths containing spaces or shell
     metacharacters are handled literally. Nothing is removed when the
     path has no directory component or resolves to the filesystem root.
     Removal errors (e.g. directory already gone) are ignored.
     """
     import shutil  # function-scope import; keeps the block self-contained

     tmpdir = os.path.dirname(self.sample_features)
     sys.stdout.write('removing ' + tmpdir)
     if not tmpdir:
         # bare file name: there is no directory to remove
         return
     resolved = os.path.abspath(tmpdir)
     if os.path.dirname(resolved) == resolved:
         # never recurse from the filesystem root
         return
     shutil.rmtree(tmpdir, ignore_errors=True)
コード例 #11
0
#        os.path.join(outp_dir, 'syn' + outp_suffix + '.csv'), index=False)
#    vn_diff[cols_to_output[:-2] + ['step']].to_csv(
#        os.path.join(outp_dir, 'lostTP' + outp_suffix + '.csv'), index=False)

    cfg['predictions_file'] = infile

    with open(os.path.join(outp_dir, 'cfg' + outp_suffix + '.yml'), 'w') as f:
        yaml.dump(cfg, f, default_flow_style=False)
    return vn[c_all_denovo]

if __name__ == '__main__':
    # Command-line entry point: <infile> <prefix> <outp_dir> <config_file>.
    # Exit with a usage message instead of a bare IndexError when arguments
    # are missing; extra arguments are ignored as before.
    if len(sys.argv) < 5:
        sys.exit('usage: %s <infile> <prefix> <outp_dir> <config_file>'
                 % sys.argv[0])
    infile = sys.argv[1]
    pref = sys.argv[2]
    outp_dir = sys.argv[3]
    config_file = sys.argv[4]
    # make sure the output directory exists before anything is written
    func.runInShell('mkdir -p ' + outp_dir)
    summarizeMutations(infile,
                       pref,
                       outp_dir,
                       config_file)



# summarizeMutations(infile,
#                       prefix,
#                       outp_dir,
#                       config_file,
#                      exac_anno='/mnt/scratch/asalomatov/data/ExAC/fordist_cleaned_exac_r03_march16_z_pli_rec_null_data.txt'):


コード例 #12
0
ファイル: igvSnapshot.py プロジェクト: asalomatov/variants
    print trio_id
    sample_bam = myped.getIndivBAM(smpl_id)
    father_bam = myped.getFaBam(trio_id)
    mother_bam = myped.getMoBam(trio_id)
    chr_pos = ':'.join([row['CHROM'],
                        '-'.join([str(int(row['POS']) - 25),
                                  str(int(row['POS']) + 25)])])
    sample_snapshot_name = '_'.join([row['SP_id'],
                                     row['lab_id'],
                                     row['CHROM'],
                                     row['POS'],
                                     row['GENE']]) + '.png'
    print sample_bam, father_bam, mother_bam, chr_pos, sample_snapshot_name
    snapshot_list.append(tmpl3 % locals())

# Join the per-variant snapshot commands into one batch script
# (one entry per line).
igv_scr = '\n'.join(snapshot_list)

# Write the script to <output_dir>/<input file basename>.igv
func.runInShell('mkdir -p ' + output_dir)
input_file_bn = os.path.basename(input_file)
script_out = os.path.join(output_dir, input_file_bn + '.igv')
with open(script_out, 'w') as f:
    f.write(igv_scr)

# Run the IGV executable given in args.igv with '-b <script>';
# the command is echoed first (Python 2 print statement).
cmd = ' '.join([args.igv, '-b', script_out])
print cmd
func.runInShell(cmd)

# NOTE(review): exits with status 1 even after the command above was issued;
# a non-zero status normally signals failure -- confirm this is intended.
sys.exit(1)


コード例 #13
0
# inputs from the command line: caller tag and base output directory
caller = sys.argv[2]
output_directory = sys.argv[3]
min_DP = 7
var_type = 'SNP'

# choose the VCF path template for the caller; unknown callers abort
vcf_pat = ''
if caller not in ('hc', 'fb'):
    sys.exit('unknown caller, exiting... ')
vcf_pat = (
    '/mnt/scratch/asalomatov/data/SSC/wes/vcf/hc/%s.family.vqsr.sorted-norm.vcf'
    if caller == 'hc' else
    '/mnt/scratch/asalomatov/data/SSC/wes/vcf/fb/%s.family.freebayes.sorted-norm.vcf'
)

# output tree: <output_directory>/<caller>/{all,known}_<var_type>
output_dir = os.path.join(output_directory, caller)
all_dir = '%s/all_%s' % (output_dir, var_type)
known_dir = '%s/known_%s' % (output_dir, var_type)
func.runInShell('mkdir -p ' + all_dir)
func.runInShell('mkdir -p ' + known_dir)


def multi_wrap_readBamReadcount(args):
    """Forward a packed argument sequence to ``func.readBamReadcount``.

    ``args`` is unpacked positionally; the callee's return value is passed
    back unchanged.
    """
    outcome = func.readBamReadcount(*args)
    return outcome


### SSC ped
#ped = reload(ped)
#func = reload(func)
#features = reload(features)

# hard-coded path to the SSC pedigree file
infile_ped = '/mnt/scratch/asalomatov/data/SSC/SSCped/SSC.ped'
# construct the pedigree object; ['collection'] is handed to ped.Ped
# unchanged (see the ped module for what it selects)
myped = ped.Ped(infile_ped, ['collection'])
# attach VCF paths built from the caller-specific pattern
myped.addVcf(file_pat=vcf_pat)
コード例 #14
0
 def removeTmpDir(self):
     """Remove the directory holding ``self.sample_features``.

     Uses ``shutil.rmtree`` rather than building an ``rm -rf`` shell
     string, so the path is never interpreted by a shell. A path without a
     directory part, or one that resolves to the filesystem root, is left
     alone. Errors during removal are ignored.
     """
     import shutil  # function-scope import; keeps the block self-contained

     tmpdir = os.path.dirname(self.sample_features)
     sys.stdout.write('removing ' + tmpdir)
     if tmpdir:
         absolute = os.path.abspath(tmpdir)
         # the root is its own parent; refuse to delete it
         if os.path.dirname(absolute) != absolute:
             shutil.rmtree(tmpdir, ignore_errors=True)
コード例 #15
0
#    vn_syn[cols_to_output[:-2]][c_new].to_csv(
#        os.path.join(outp_dir, 'syn' + outp_suffix + '.csv'), index=False)
#    vn_diff[cols_to_output[:-2] + ['step']].to_csv(
#        os.path.join(outp_dir, 'lostTP' + outp_suffix + '.csv'), index=False)

    cfg['predictions_file'] = infile

    with open(os.path.join(outp_dir,'cfg' + outp_suffix + '.yml'), 'w') as f:
        yaml.dump(cfg, f, default_flow_style=False)
    

if __name__ == '__main__':
    # Command-line entry point: <infile> <outp_dir> <config_file>.
    # Exit with a usage message instead of a bare IndexError when arguments
    # are missing; extra arguments are ignored as before.
    if len(sys.argv) < 4:
        sys.exit('usage: %s <infile> <outp_dir> <config_file>' % sys.argv[0])
    infile = sys.argv[1]
    outp_dir = sys.argv[2]
    config_file = sys.argv[3]
    # make sure the output directory exists before anything is written
    func.runInShell('mkdir -p ' + outp_dir)
    summarizeMutations(infile,
                       outp_dir,
                       config_file)





    #cfg = {'population_AF': 0.01,
    #       'snpeff': {'effect_lof': ['exon_loss_variant',
    #                                 'frameshift_variant',
    #                                 'stop_gained',
    #                                 'stop_lost',
    #                                 'start_lost',
    #                                 'splice_acceptor_variant'
コード例 #16
0
# create temp vcf file
input_file_bn = os.path.splitext(os.path.basename(f.sample_vcf))[0]
sample_vcf = os.path.join(tmp_dir, f.sample_id + '.vcf')
# outp_tsv = os.path.join(tmp_dir, f.sample_id + '.tsv')

# add header lines that are used for de novo analysis:
# header_extra.txt lives next to the annotation script
additional_header = vrs.readVcfHeader(
    os.path.join(os.path.dirname(script_name), 'header_extra.txt'))
# existing header lines first, extra ones after, empty lines dropped
both_head = [i
             for i in (vrs.vcf_header.split('\n') +
                       additional_header.split('\n'))
             if i]
vrs.vcf_header = '\n'.join(both_head) + '\n'

vrs.saveAsVcf(sample_vcf)

# annotate with VEP and snpEff
logging.info('preparing to run %s', script_name)

cmd = ' '.join((script_name,
                sample_vcf,
                os.path.dirname(script_name),
                child_id,
                targ_bed,
                incl_make))
logging.info('Executing \n %s', cmd)
res = func.runInShell(cmd, True)


コード例 #17
0
ファイル: test.py プロジェクト: asalomatov/variants
# Persist the raw per-model predictions.
res.to_csv(os.path.join(output_dir, m_name + '.csv'), index=False)
res['var_id'] = res['test_var_id']
# Keep the first row per var_id. reset_index() returns a new frame, so the
# assignment below does not write into a slice of ``res``.
res_u = res[~res.var_id.duplicated()].reset_index()
# DataFrame.ix was removed in pandas 1.0; plain column assignment is the
# equivalent, version-independent form.
res_u['pred_labels'] = (res_u['pred_prob'] > prob_cutoff).astype(int)
#res_u = res_u[res_u.pred_labels == 1]
#outp_tsv = os.path.join(output_dir, m_name + '.tsv')
outp_tsv = os.path.join(output_dir, child_id + '.tsv')
func.writePredAsVcf(res_u, outp_tsv, min_DP=min_DP)

# vcf2table.sh is expected next to the running script
script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
cmd = ' '.join([os.path.join(script_dir, 'vcf2table.sh'),
                outp_tsv,
                script_dir,
                child_id])
print(cmd)
func.runInShell(cmd)

# summarize <child_id>-ann-onePline.tsv from output_dir into
# <output_dir>/denovo
summarizeVariants.summarizeMutations(
    os.path.join(output_dir, child_id + '-ann-onePline.tsv'),
    os.path.join(output_dir, 'denovo'),
    config_file)


#cmd = ' '.join([os.path.join(script_dir, 'work', 'summarizeMutations.py'),
#               os.path.join(output_dir, child_id + '-ann-onePline.tsv'),
#               os.path.join(output_dir, 'denovo'),
#                config_file])
#func.runInShell(cmd)

# work/summarizeMutations.py /mnt/xfs1/home/asalomatov/projects/spark/feature_sets/hc/trio003.p1_642940-ann-onePline.tsv /mnt/xfs1/home/asalomatov/projects/spark/feature_sets/hc/denovo cfg_spark.yml
コード例 #18
0
ファイル: test.py プロジェクト: asalomatov/variants
# Drop the file extension from the model name and tag it with the test level.
m_name = '.'.join(m_name.split('.')[:-1]) + '_tstlvl' + str(lvl)
res['method'] = m_name
# discard rows whose allele string contains 'nan'
res = res[~res.test_var_alleles.str.contains('nan')]
res.to_csv(os.path.join(output_dir, m_name + '.csv'), index=False)
res['var_id'] = res['test_var_id']
# Keep the first row per var_id. reset_index() returns a new frame, so the
# assignment below does not write into a slice of ``res``.
res_u = res[~res.var_id.duplicated()].reset_index()
# DataFrame.ix was removed in pandas 1.0; plain column assignment is the
# equivalent, version-independent form.
res_u['pred_labels'] = (res_u['pred_prob'] > prob_cutoff).astype(int)
#res_u = res_u[res_u.pred_labels == 1]
#outp_tsv = os.path.join(output_dir, m_name + '.tsv')
outp_tsv = os.path.join(output_dir, child_id + '.tsv')
func.writePredAsVcf(res_u, outp_tsv, min_DP=min_DP)

# vcf2table.sh is expected next to the running script
script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
cmd = ' '.join(
    [os.path.join(script_dir, 'vcf2table.sh'), outp_tsv, script_dir, child_id])
print(cmd)
func.runInShell(cmd)

# summarize <child_id>-ann-onePline.tsv from output_dir into
# <output_dir>/denovo
summarizeVariants.summarizeMutations(
    os.path.join(output_dir, child_id + '-ann-onePline.tsv'),
    os.path.join(output_dir, 'denovo'), config_file)

#cmd = ' '.join([os.path.join(script_dir, 'work', 'summarizeMutations.py'),
#               os.path.join(output_dir, child_id + '-ann-onePline.tsv'),
#               os.path.join(output_dir, 'denovo'),
#                config_file])
#func.runInShell(cmd)

# work/summarizeMutations.py /mnt/xfs1/home/asalomatov/projects/spark/feature_sets/hc/trio003.p1_642940-ann-onePline.tsv /mnt/xfs1/home/asalomatov/projects/spark/feature_sets/hc/denovo cfg_spark.yml
コード例 #19
0
# Unpack the run configuration (``cfg`` mapping) into module-level names.
min_DP = cfg['min_DP']
var_type = cfg['variant_type']
vcf_pat = cfg['vcf_pattern']
bam_pat = cfg['bam_pattern']
# index path pattern: the BAM pattern with '.bai' appended
bai_pat = bam_pat + '.bai'
ped_file = cfg['ped_file']
ped_file_extended = cfg['ped_file_extended']
bam_readcount = cfg['bam_readcount']
genome_ref = cfg['genome_ref']
known_vars = cfg['known_variants']
output_dir = cfg['output_directory']
#output_dir_known = ''

# create output dirs
func.runInShell('mkdir -p ' + output_dir)
# NOTE(review): output_dir_known is only bound when known_vars is truthy;
# any later unconditional use would raise NameError -- confirm callers.
if known_vars:
    output_dir_known = os.path.join(output_dir, 'known')
    func.runInShell('mkdir -p ' + output_dir_known)


# wrap a function for use with multiprocessing
def multi_wrap_readBamReadcount(args):
    """Unpack ``args`` and pass them positionally to ``func.readBamReadcount``.

    Taking a single packed argument lets the wrapper be called with one
    item per task; the callee's return value is returned unchanged.
    """
    readcount_result = func.readBamReadcount(*args)
    return readcount_result


# Populate the pedigree DataFrame (myped.ped) with VCF and BAM paths.
myped = ped.Ped(ped_file)
# add VCF paths from the configured pattern
myped.addVcf(file_pat=vcf_pat)
# drop pedigree rows that ended up without a 'vcf' value
myped.ped.dropna(subset=['vcf'], inplace=True)
# add BAM paths from the configured pattern
myped.addBam(file_pat=bam_pat)