def run_from_args(args):
    """Annotate the ``gs_info`` table for the requested sample subsets and save it.

    Loads three pickled DataFrames (``args.cns``, ``args.info``,
    ``args.fcns``), aligns the cns/fcns rows to the info index, runs
    ``add_sample_set_annotations`` and, unless ``args.maf_only`` is set, the
    optional length / filter / somatic / gene annotations.  The annotated info
    frame and the MAF frame are written with ``CM.save_dataframe``.

    Attributes read from ``args``:
        cns, info, fcns: paths of the pickled input frames.
        samples: comma-separated paths; each file lists one sample per line.
        suffix_subsets: comma-separated suffixes, passed on with the sample
            lists.
        gender_map: whitespace-delimited file; column 1 becomes the key and
            column 2 the value of ``sex_dict``.
        convert_int: when truthy, cast cns columns to int (see note below).
        maf_only, intersect, somatic, genes: annotation switches.
        lq_adjust, lq_union: forwarded to ``add_sample_set_annotations``.
        output_dir: directory handed to ``CM.save_dataframe``.
        suffix: optional string appended to the saved variable names.
    """
    gs_cns = pd.read_pickle(args.cns)
    gs_info = pd.read_pickle(args.info)
    gs_fcns = pd.read_pickle(args.fcns)

    # ensure ordering is the same from the start
    gs_cns = gs_cns.reindex(gs_info.index)
    gs_fcns = gs_fcns.reindex(gs_info.index)

    # One list of sample names per file; files are closed as soon as read.
    sample_lists = []
    for fn in args.samples.split(','):
        with open(fn) as handle:
            samples = [line.rstrip() for line in handle]
        sample_lists.append(samples)

    suffixes = args.suffix_subsets.split(',')

    sex_dict = {}
    with open(args.gender_map) as handle:
        for line in handle:
            fields = line.rstrip().split()
            sex_dict[fields[0]] = fields[1]

    print('starting annotation')
    print(CM.datestring(hour=True, minute=True))

    if args.convert_int:
        # NOTE(review): `samples` is whatever the last file in args.samples
        # contained, so only that list's columns are cast -- confirm that
        # this is intended rather than the union of all sample lists.
        for s in samples:
            gs_cns[s] = gs_cns[s].astype(int)

    output_location = args.output_dir

    if args.maf_only:
        print("annotating subset(s)")
    else:
        print('annotating data subsets')
    gs_info, maf_df, _ = add_sample_set_annotations(
        gs_info, gs_cns, gs_fcns, sample_lists, suffixes, sex_dict,
        lq_adjust=args.lq_adjust, lq_union=args.lq_union)

    # NOTE(review): the nesting below was reconstructed from a source whose
    # indentation was lost; it assumes maf_only skips every annotation after
    # the sample-set step -- confirm against the original script.
    if not args.maf_only:
        gs_info = basic_length_annotations(gs_info)

        if args.intersect:
            print("annotating centromere/telomere distances, MHC, VDJ Regions")
            gs_info = annotate_filters(gs_info)

        if args.somatic:
            print("annotating somatic variants")
            # expected form: "<uuid>,<region>,<svtype>"
            spl = args.somatic.split(',')
            gs_info = annotate_somatic_var(gs_info, spl[0], spl[1], spl[2])

        # Explicit comparison kept so that truthy non-boolean values still do
        # not trigger the gene annotation.
        if args.genes == True:  # noqa: E712
            print("annotating gencode genes")
            gs_info = annotate_gencode_genes(gs_info)

    suffix = args.suffix if args.suffix else ''
    var_name_info = 'gs_info' + suffix
    var_name_maf = 'gs_maf_bi' + suffix

    print('data annotated')

    # The cns frame is deliberately not saved again; only the info and MAF
    # frames are written.
    CM.save_dataframe(var_name_info, gs_info, output_location,
                      print_vars_recorded_loc=False)
    CM.save_dataframe(var_name_maf, maf_df, output_location,
                      print_vars_recorded_loc=False)
def run_from_args(args):
    """Build the SNV/indel MAF histograms for a VCF and save all six frames.

    Reads the unrelated-sample and all-sample lists (one name per line),
    calls ``calculate_MAF_hist`` and stores each returned DataFrame with
    ``CM.save_dataframe`` in ``args.output_dir``.

    Attributes read from ``args``:
        unrelated, samples: paths of the sample-name files.
        chroms: comma-separated chromosome names.
        vcf, exclude_bed, maf_col, add_nref, output_vcf_name, output_dir:
            forwarded to ``calculate_MAF_hist``.
        suffix: optional string appended to the MAF frame names.

    Previously the names for frames 3-6 were only assigned when
    ``args.suffix`` was truthy, so running without a suffix raised
    ``NameError`` at save time; every name is now always defined.
    """
    with open(args.unrelated) as handle:
        unr_samples = [line.rstrip() for line in handle]
    with open(args.samples) as handle:
        samples = [line.rstrip() for line in handle]

    # prepare chromosome list
    chroms = args.chroms.split(',')

    df1, df2, df3, df4, df5, df6 = calculate_MAF_hist(
        args.vcf, samples, unr_samples, chroms,
        bed_exclude=args.exclude_bed,
        maf_col=args.maf_col,
        add_unrel_to_info=args.add_nref,
        vcf_name=args.output_vcf_name,
        output_dir=args.output_dir)

    suffix = args.suffix if args.suffix else ''

    # NOTE(review): the two "lengths" frames never carried the suffix in the
    # original naming; kept as-is so existing output names do not change --
    # confirm whether they should be suffixed like the others.
    named_frames = [
        ("snv_indel_maf_full" + suffix, df1),
        ("snv_indel_maf_simple" + suffix, df2),
        ("snv_indel_lengths_full", df3),
        ("snv_indel_lengths_simple", df4),
        ("snv_indel_sing_maf_full" + suffix, df5),
        ("snv_indel_sing_maf_simple" + suffix, df6),
    ]
    for name, frame in named_frames:
        CM.save_dataframe(name, frame, args.output_dir)
def run_from_args(args):
    """Annotate a caller's info table with per-subset MAF data and save it.

    Loads the pickled info and genotype frames, adds ``Chr`` / ``Start`` /
    ``End`` helper columns, restricts the genotypes to the info index, runs
    ``annotate_maf_sample_subsets`` plus the optional filter and gene
    annotations, and saves the result as ``<caller>_info[<suffix>]`` with
    ``CM.save_dataframe``.

    Attributes read from ``args``:
        info, lumpy_gt: paths of the pickled info and genotype frames.
        caller: name used as prefix of the saved variable.
        output_dir: directory handed to ``CM.save_dataframe``.
        samples: comma-separated paths; each file lists one sample per line.
        suffix_subsets: comma-separated suffixes for the sample sets.
        gender_map: whitespace-delimited file; column 1 becomes the key and
            column 2 the value of ``sex_dict``.
        intersect, genes: annotation switches.
        suffix: optional string appended to the saved variable name.
    """
    info = pd.read_pickle(args.info)
    gts = pd.read_pickle(args.lumpy_gt)
    caller = args.caller
    output_dir = args.output_dir

    # sample sets: one list of names per file, files closed once read
    sample_lists = []
    for fn in str(args.samples).split(','):
        with open(fn) as handle:
            sample_lists.append([line.rstrip() for line in handle])

    # suffixes for sample sets in info
    suffixes = args.suffix_subsets.split(',')

    sex_dict = {}
    with open(args.gender_map) as handle:
        for line in handle:
            fields = line.rstrip().split()
            sex_dict[fields[0]] = fields[1]

    print('starting annotation')
    print(CM.datestring(hour=True, minute=True))

    # Helper columns with explicit dtypes; the raw VCF-style columns may have
    # mixed types (e.g. placeholder strings in END).
    info['Chr'] = info['#CHROM'].astype(str)
    info['Start'] = info['POS'].astype(int)
    try:
        # Rows whose END is the 'Column_Not_Present' placeholder fall back to
        # the start coordinate.
        inds = info[info.END == 'Column_Not_Present'].index.tolist()
        info.loc[inds, 'END'] = info.loc[inds, 'Start']
    except Exception:
        # Best effort, as before: a failure here is ignored and any real
        # problem with END surfaces in the cast just below.  Unlike a bare
        # `except`, this no longer swallows KeyboardInterrupt/SystemExit.
        pass
    info['End'] = info['END'].astype(int)

    gts = gts.loc[info.index.tolist()]

    info = annotate_maf_sample_subsets(sample_lists, suffixes, info, gts,
                                       sex_dict)

    if args.intersect:
        print("annotating centromere/telomere distances, MHC, VDJ Regions")
        info = annotate_filters(info)

    # Explicit comparison kept so that truthy non-boolean values still do not
    # trigger the gene annotation.
    if args.genes == True:  # noqa: E712
        print("annotating gencode genes")
        info = annotate_gencode_genes(info)

    var_name_info = '{}_info'.format(caller)
    if args.suffix:
        var_name_info += args.suffix

    print('data annotated')
    CM.save_dataframe(var_name_info, info, output_dir,
                      print_vars_recorded_loc=False)
def calculate_MAF_hist(fn, samples, unrel_samples, Chroms, bed_exclude=False, prefix=False, output_dir=False, maf_col='MAF_UNREL', add_unrel_to_info=False, vcf_name=False):
    """Tally per-allele MAF and length histograms from a VCF.

    Streams the VCF line by line.  For every record past the header whose
    chromosome is in ``Chroms``, each alternate allele is classified with
    ``indel_classifier_2`` and counted in histograms keyed by the value
    returned from ``parse_info_col`` and by allele length, both per detailed
    class (SNV/INS/DEL) and per simple class (SNV/INDEL).  Alleles carried by
    exactly one sample of ``unrel_samples`` are additionally counted in the
    "singleton" histograms.  Optionally a copy of the VCF is written with
    ``NREF_UNREL`` and ``NREF`` appended to the INFO column.

    NOTE(review): the indentation of this function was reconstructed from a
    whitespace-collapsed source; the points where the nesting is not certain
    are marked inline.

    Args:
        fn: path of the VCF; passed to ``find_header_end`` and then either to
            ``bedtools intersect -a`` or to ``gzip.open``.
        samples: sample names; each must be a column of the header line.
        unrel_samples: container of sample names used for the unrelated
            counts (membership test only).
        Chroms: container of chromosome names to include.
        bed_exclude: if truthy, passed as ``-b`` to
            ``bedtools intersect -v -header`` and the command's stdout is
            read instead of the file itself.
        prefix: unused.
        output_dir, vcf_name: joined with '/' to form the output VCF path
            when ``add_unrel_to_info`` is truthy.
        maf_col: forwarded to ``parse_info_col``.
        add_unrel_to_info: when truthy, write the annotated VCF.

    Returns:
        Six DataFrames, each passed through ``clean_hist_dfs``, in this
        order: MAF histogram (detailed classes), MAF histogram (simple
        classes), length histogram (detailed), length histogram (simple),
        singleton MAF histogram (detailed), singleton MAF histogram (simple).
    """
    header_end, header_line = find_header_end(fn)
    # column name -> position in a split record line
    cols_dict = {i: header_line.index(i) for i in header_line}
    print "Starting MAF Histogram Extraction..."
    print "{} variants_processed {}".format(
        0, CM.datestring(hour=True, minute=True))
    # next record count at which a progress message is printed
    variants_processed = 1000000
    if bed_exclude:
        command = [
            'bedtools', 'intersect', '-a', fn, '-b', bed_exclude, '-v',
            '-header'
        ]
        # NOTE(review): the Popen object is rebound to its stdout, so the
        # child process is never waited on and its exit status is not checked.
        F = subprocess.Popen(command, stdout=subprocess.PIPE)
        F = F.stdout
    else:
        # NOTE(review): this handle is never explicitly closed.
        F = gzip.open(fn)
    # Parallel lists: hist_dicts[i] belongs to var_classes[i], and likewise
    # for the *_simple variants.
    var_classes = ['SNV', 'INS', 'DEL']
    hist_dicts = [{}, {}, {}]
    var_classes_simple = ['SNV', 'INDEL']
    hist_dicts_simple = [{}, {}]
    hist_dicts_var_lengths = [{}, {}, {}]
    hist_dicts_var_lengths_simple = [{}, {}]
    count = 0
    # histograms restricted to alleles with exactly one unrelated carrier
    nr_var_dict = [{}, {}, {}]
    nr_var_dict_simple = [{}, {}]
    if add_unrel_to_info:
        VCF_OUT = open(output_dir + '/' + vcf_name, 'w')
    for line in F:
        # conc / disc / non_ref_* are assigned but not used below
        conc = 0
        disc = 0
        non_ref_conc = 0
        non_ref_disc = 0
        count += 1
        if count == header_end:
            # add the two new INFO definitions to the output header
            if add_unrel_to_info:
                # NOTE(review): `line` is rebound here, so with the `else`
                # branch attached as reconstructed below, the second INFO line
                # is written again and the input line at this position is not
                # copied to the output -- confirm against find_header_end.
                line = '##INFO=<ID=NREF_UNREL,Number=A,Type=Integer,Description="number of non-reference samples per allele in unrelated individuals">'
                VCF_OUT.write(line + '\n')
                line = '##INFO=<ID=NREF,Number=A,Type=Integer,Description="number of non-reference samples per allele in all samples">'
                VCF_OUT.write(line + '\n')
        if count > header_end:
            var_num = count - header_end
            # per-allele carrier counts for this record
            nr_at_site_unrel = []
            nr_at_site_all = []
            line = line.rstrip()
            lin_spl = line.split()
            # FORMAT, ID and POS are extracted but not used below
            FORMAT = lin_spl[cols_dict['FORMAT']]
            INFO = lin_spl[cols_dict['INFO']]
            REF = lin_spl[cols_dict['REF']]
            ALT = lin_spl[cols_dict['ALT']]
            CHROM = lin_spl[cols_dict['#CHROM']]
            ID = lin_spl[cols_dict['ID']]
            POS = lin_spl[cols_dict['POS']]
            if var_num == variants_processed:
                msg = "{} variants_processed {}".format(
                    variants_processed, CM.datestring(hour=True, minute=True))
                print msg
                variants_processed += 1000000
            if CHROM in Chroms:
                # Classify indels and snvs based on the alternate and
                # reference allele column
                type_convert = {'SNV': 'SNV', 'INS': 'INDEL', 'DEL': 'INDEL'}
                # ref_len = column_len_counts(REF)[0]
                # alt_len_max, alt_len_min, var_lengths = column_len_counts(ALT)
                # fix this to split out the variants that are multi-allelic
                # and count them all separately
                var_types = []
                var_types, var_lengths = indel_classifier_2(REF, ALT)
                mafs = parse_info_col(INFO, maf_col)
                # 1-based allele numbers as they appear in the GT field
                allele_nums = range(1, len(mafs) + 1)
                for vt, length, maf, an in zip(var_types, var_lengths, mafs, allele_nums):
                    # add to hist dicts
                    ind = var_classes.index(vt)
                    hist_dicts[ind][maf] = hist_dicts[ind].get(maf, 0) + 1
                    hist_dicts_var_lengths[ind][length] = hist_dicts_var_lengths[ind].get(length, 0) + 1
                    # convert to simple var class and add to other set of dicts
                    var_simple = type_convert[vt]
                    ind = var_classes_simple.index(var_simple)
                    hist_dicts_simple[ind][maf] = hist_dicts_simple[ind].get(maf, 0) + 1
                    hist_dicts_var_lengths_simple[ind][length] = hist_dicts_var_lengths_simple[ind].get(length, 0) + 1
                    # count samples carrying allele number `an`
                    nr_ur_samps = 0
                    nr_all_samps = 0
                    for samp in samples:
                        gt = lin_spl[cols_dict[samp]].split(':')[0]
                        # assumes unphased '/'-separated integer alleles; any
                        # other genotype string would fail in int() -- TODO
                        # confirm the input never contains them
                        if gt not in ['0/0', './.']:
                            alleles = [int(l) for l in gt.split('/')]
                            if alleles.count(an) > 0:
                                nr_all_samps += 1
                                if samp in unrel_samples:
                                    nr_ur_samps += 1
                    nr_at_site_unrel.append(nr_ur_samps)
                    nr_at_site_all.append(nr_all_samps)
                    # NOTE(review): as reconstructed, a zero count is appended
                    # a second time here, giving two entries for this allele
                    # in the NREF/NREF_UNREL lists -- confirm the intended
                    # nesting of these appends.
                    if nr_ur_samps == 0:
                        nr_at_site_unrel.append(0)
                    if nr_all_samps == 0:
                        nr_at_site_all.append(0)
                    # mark alleles that are singleton in unrelateds (carried
                    # by exactly 1 of the unrelated samples)
                    if nr_ur_samps == 1:
                        ind = var_classes.index(vt)
                        nr_var_dict[ind][maf] = nr_var_dict[ind].get(maf, 0) + 1
                        ind = var_classes_simple.index(var_simple)
                        nr_var_dict_simple[ind][maf] = nr_var_dict_simple[ind].get(maf, 0) + 1
                # add the nref_unrel column if desired to vcf
                # print the new line with that annotated onto it
                if add_unrel_to_info:
                    nref_formatted_unr = ";NREF_UNREL=" + ','.join([str(v) for v in nr_at_site_unrel])
                    nref_formatted_all = ";NREF=" + ','.join([str(v) for v in nr_at_site_all])
                    lin_spl[cols_dict['INFO']] += nref_formatted_unr
                    lin_spl[cols_dict['INFO']] += nref_formatted_all
                    line = "\t".join(lin_spl)
                    VCF_OUT.write(line + '\n')
        # NOTE(review): this `else` is taken to pair with
        # `if count > header_end` (it still calls rstrip() on the raw line),
        # i.e. header lines are copied through and records on other
        # chromosomes are dropped from the output -- confirm.
        else:
            if add_unrel_to_info:
                VCF_OUT.write(line.rstrip() + '\n')
    if add_unrel_to_info:
        VCF_OUT.close()
    # return as df for saving later
    df = pd.DataFrame(hist_dicts)
    df['variant_types'] = var_classes
    df = clean_hist_dfs(df)
    df2 = pd.DataFrame(hist_dicts_simple)
    df2['variant_types'] = var_classes_simple
    df2 = clean_hist_dfs(df2)
    df3 = pd.DataFrame(hist_dicts_var_lengths)
    df3['variant_types'] = var_classes
    df3 = clean_hist_dfs(df3)
    df4 = pd.DataFrame(hist_dicts_var_lengths_simple)
    df4['variant_types'] = var_classes_simple
    df4 = clean_hist_dfs(df4)
    df5 = pd.DataFrame(nr_var_dict)
    df5['variant_types'] = var_classes
    df5 = clean_hist_dfs(df5)
    df6 = pd.DataFrame(nr_var_dict_simple)
    df6['variant_types'] = var_classes_simple
    df6 = clean_hist_dfs(df6)
    return df, df2, df3, df4, df5, df6
def run_from_args(args):
    """Compute replication-rate statistics for sample pairs and save them.

    Reads the info table and the pair list, makes sure both ``SVTYPE`` and
    ``cnv_class`` columns exist, runs ``replication_lm`` and
    ``generate_per_pair_summary`` and saves the four resulting frames with
    ``CM.save_dataframe``.

    Attributes read from ``args``:
        gt_tsv: genotype table path, forwarded to ``replication_lm``.
        info_pkl: path read with ``pd.read_table`` (first column as index).
        pairs: file with one whitespace-separated pair per line.
        id: forwarded to ``replication_lm`` as ``identifier_col``.
        suff_df: optional suffix appended to the per-site statistic columns.
        output_dir: directory handed to ``CM.save_dataframe``.
        suffix: optional string appended to the saved variable names.
    """
    gt_fn = args.gt_tsv
    info = pd.read_table(args.info_pkl, index_col=0)

    with open(args.pairs) as handle:
        pairs = [line.rstrip().split() for line in handle]

    print('calculating replication rate statistics {}'.format(
        CM.datestring(hour=True, minute=True)))

    # SVTYPE is generally the column name used in VCF files, so make sure both
    # spellings are present whichever one the table came with.
    try:
        info['SVTYPE'] = info['cnv_class']
    except KeyError:
        info['cnv_class'] = info.SVTYPE

    rr_df, collapsed_stats_df, per_pair_df = replication_lm(
        gt_fn, info, pairs, identifier_col=args.id)
    per_pair_summary = generate_per_pair_summary(per_pair_df)

    if args.suff_df:
        # Suffix the per-site columns so results from several data sets can
        # live side by side.  The column list used to exist only as a
        # commented-out line, which made this branch fail with NameError;
        # DataFrame.rename skips any label that is not present.
        suff_df = str(args.suff_df)
        cols = [
            'number_twins_with_var',
            'number_concordant_with_var',
            'number_discordant_with_var',
            'discordance_score',
            'identifier',
        ]
        rename = {c: c + suff_df for c in cols}
        rr_df.rename(columns=rename, inplace=True)

    output_location = args.output_dir
    suffix = args.suffix if args.suffix else ''
    var_name_info = 'rr_per_site' + suffix
    var_name_pair_rr_summary = 'rr_summary' + suffix
    var_name_per_pair = 'rr_per_pair' + suffix
    var_name_per_pair_summary = 'rr_per_pair_summary' + suffix

    print('calculation complete {}'.format(
        CM.datestring(hour=True, minute=True)))

    CM.save_dataframe(var_name_info, rr_df, output_location,
                      print_vars_recorded_loc=False)
    CM.save_dataframe(var_name_pair_rr_summary, collapsed_stats_df,
                      output_location, print_vars_recorded_loc=False)
    CM.save_dataframe(var_name_per_pair, per_pair_df, output_location,
                      print_vars_recorded_loc=False)
    CM.save_dataframe(var_name_per_pair_summary, per_pair_summary,
                      output_location, print_vars_recorded_loc=False)