Ejemplo n.º 1
0
def run_from_args(args):
    """Annotate the ``gs_info`` table and save the info and MAF frames.

    Loads the pickled ``cns``, ``info`` and ``fcns`` frames, reindexes the
    ``cns``/``fcns`` frames to the info index, runs
    ``add_sample_set_annotations`` and, unless ``args.maf_only`` is set, the
    length annotation plus the optional filter / somatic / gene annotations.
    The annotated info frame and the MAF frame are then written with
    ``CM.save_dataframe`` into ``args.output_dir``.

    Args:
        args: parsed command-line namespace.  Attributes read here:
            ``cns``, ``info``, ``fcns`` (pickle paths); ``samples``
            (comma-separated paths, one sample id per line in each file);
            ``suffix_subsets`` (comma-separated); ``gender_map`` (path to a
            whitespace-separated file whose first two fields per line
            become key and value of the sex dictionary); ``convert_int``;
            ``output_dir``; ``maf_only``; ``lq_adjust``; ``lq_union``;
            ``intersect``; ``somatic`` (``"uuid,region,svtype"``);
            ``genes``; ``suffix``.
    """
    gs_cns = pd.read_pickle(args.cns)
    gs_info = pd.read_pickle(args.info)
    gs_fcns = pd.read_pickle(args.fcns)

    # ensure ordering is the same from the start
    gs_cns = gs_cns.reindex(gs_info.index)
    gs_fcns = gs_fcns.reindex(gs_info.index)

    # one list of sample ids per input file; files are closed right away
    sample_lists = []
    for fn in args.samples.split(','):
        with open(fn) as handle:
            sample_lists.append([line.rstrip() for line in handle])

    suffixes = args.suffix_subsets.split(',')

    sex_dict = {}
    with open(args.gender_map) as handle:
        for line in handle:
            fields = line.rstrip().split()
            sex_dict[fields[0]] = fields[1]

    print('starting annotation')
    print(CM.datestring(hour=True, minute=True))

    if args.convert_int:
        # NOTE(review): only the ids from the last sample file are cast to
        # int; confirm whether every sample list should be converted.
        for s in sample_lists[-1]:
            gs_cns[s] = gs_cns[s].astype(int)

    output_location = args.output_dir

    if args.maf_only:
        print("annotating subset(s)")
    else:
        print('annotating data subsets')

    # The sample-set annotation is identical in both modes; the third return
    # value is not used here.
    gs_info, maf_df, _ = add_sample_set_annotations(
        gs_info,
        gs_cns,
        gs_fcns,
        sample_lists,
        suffixes,
        sex_dict,
        lq_adjust=args.lq_adjust,
        lq_union=args.lq_union)

    if not args.maf_only:
        gs_info = basic_length_annotations(gs_info)

        if args.intersect:
            print("annotating centromere/telomere distances, MHC, VDJ Regions")
            gs_info = annotate_filters(gs_info)

        if args.somatic:
            print("annotating somatic variants")
            spl = args.somatic.split(',')
            somatic_uuid = spl[0]
            region = spl[1]
            svtype = spl[2]
            gs_info = annotate_somatic_var(gs_info, somatic_uuid, region,
                                           svtype)

        # Explicit comparison kept so that behaviour for non-boolean values
        # of ``args.genes`` does not change.
        if args.genes == True:
            print("annotating gencode genes")
            gs_info = annotate_gencode_genes(gs_info)

    suffix = args.suffix if args.suffix else ''
    var_name_info = 'gs_info' + suffix
    var_name_maf = 'gs_maf_bi' + suffix

    print('data annotated')

    # The cns frame is not saved again; only the info and MAF frames are.
    CM.save_dataframe(var_name_info,
                      gs_info,
                      output_location,
                      print_vars_recorded_loc=False)
    CM.save_dataframe(var_name_maf,
                      maf_df,
                      output_location,
                      print_vars_recorded_loc=False)
def run_from_args(args):
    """Run ``calculate_MAF_hist`` on ``args.vcf`` and save its six frames.

    Args:
        args: parsed command-line namespace.  Attributes read here:
            ``unrelated`` and ``samples`` (paths, one sample id per line);
            ``chroms`` (comma-separated chromosome names); ``vcf``;
            ``exclude_bed``; ``maf_col``; ``add_nref``;
            ``output_vcf_name``; ``output_dir``; ``suffix`` (optional
            string appended to the MAF frame names).
    """
    with open(args.unrelated) as handle:
        unr_samples = [line.rstrip() for line in handle]

    with open(args.samples) as handle:
        samples = [line.rstrip() for line in handle]

    # prepare chromosome list
    chroms = args.chroms.split(',')

    df1, df2, df3, df4, df5, df6 = calculate_MAF_hist(
        args.vcf,
        samples,
        unr_samples,
        chroms,
        bed_exclude=args.exclude_bed,
        maf_col=args.maf_col,
        add_unrel_to_info=args.add_nref,
        vcf_name=args.output_vcf_name,
        output_dir=args.output_dir)

    # Every output name is defined whether or not a suffix was given;
    # previously the length and singleton names only existed in the suffix
    # branch, so running without a suffix failed with a NameError.
    suffix = args.suffix if args.suffix else ''

    # NOTE(review): the two length frames have never carried the suffix;
    # that naming is kept here -- confirm whether they should get it too.
    named_frames = [
        ("snv_indel_maf_full" + suffix, df1),
        ("snv_indel_maf_simple" + suffix, df2),
        ("snv_indel_lengths_full", df3),
        ("snv_indel_lengths_simple", df4),
        ("snv_indel_sing_maf_full" + suffix, df5),
        ("snv_indel_sing_maf_simple" + suffix, df6),
    ]

    for name, frame in named_frames:
        CM.save_dataframe(name, frame, args.output_dir)
Ejemplo n.º 3
0
def run_from_args(args):
    """Annotate a caller's info table and save it as ``<caller>_info``.

    Loads the pickled info and genotype frames, adds ``Chr``/``Start``/``End``
    helper columns, restricts the genotypes to the info index, runs
    ``annotate_maf_sample_subsets`` and the optional filter / gene
    annotations, then saves the info frame with ``CM.save_dataframe``.

    Args:
        args: parsed command-line namespace.  Attributes read here:
            ``info`` and ``lumpy_gt`` (pickle paths); ``caller`` (used as
            the prefix of the saved variable name); ``output_dir``;
            ``samples`` (comma-separated paths, one sample id per line in
            each file); ``suffix_subsets`` (comma-separated);
            ``gender_map`` (path to a whitespace-separated file whose first
            two fields per line become key and value of the sex
            dictionary); ``intersect``; ``genes``; ``suffix``.
    """
    info = pd.read_pickle(args.info)
    gts = pd.read_pickle(args.lumpy_gt)
    caller = args.caller
    output_dir = args.output_dir

    # sample sets: one list of sample ids per input file
    sample_lists = []
    for fn in str(args.samples).split(','):
        with open(fn) as handle:
            sample_lists.append([line.rstrip() for line in handle])

    # suffixes for sample sets in info
    suffixes = args.suffix_subsets.split(',')

    sex_dict = {}
    with open(args.gender_map) as handle:
        for line in handle:
            fields = line.rstrip().split()
            sex_dict[fields[0]] = fields[1]

    print('starting annotation')
    print(CM.datestring(hour=True, minute=True))

    # Helper columns with fixed dtypes, so that irregular values (e.g. from
    # BND calls) do not leave mixed dtypes in the coordinate columns.
    info['Chr'] = info['#CHROM'].astype(str)
    info['Start'] = info['POS'].astype(int)

    try:
        # Rows flagged as having no END fall back to their start position.
        missing_end = info[info.END == 'Column_Not_Present'].index.tolist()
        info.loc[missing_end, 'END'] = info.loc[missing_end, 'Start']
    except Exception:
        # Best effort: if the placeholder lookup fails the END column is
        # left untouched.  Unlike a bare ``except`` this no longer swallows
        # KeyboardInterrupt / SystemExit.
        pass

    info['End'] = info['END'].astype(int)

    gts = gts.loc[info.index.tolist()]
    info = annotate_maf_sample_subsets(sample_lists, suffixes, info, gts,
                                       sex_dict)

    if args.intersect:
        print("annotating centromere/telomere distances, MHC, VDJ Regions")
        info = annotate_filters(info)

    # NOTE: somatic-variant annotation is not performed here.

    # Explicit comparison kept so that behaviour for non-boolean values of
    # ``args.genes`` does not change.
    if args.genes == True:
        print("annotating gencode genes")
        info = annotate_gencode_genes(info)

    var_name_info = '{}_info'.format(caller)
    if args.suffix:
        var_name_info = var_name_info + args.suffix

    print('data annotated')
    CM.save_dataframe(var_name_info,
                      info,
                      output_dir,
                      print_vars_recorded_loc=False)
def calculate_MAF_hist(fn,
                       samples,
                       unrel_samples,
                       Chroms,
                       bed_exclude=False,
                       prefix=False,
                       output_dir=False,
                       maf_col='MAF_UNREL',
                       add_unrel_to_info=False,
                       vcf_name=False):
    """Tally per-allele MAF and length histograms for SNVs/indels in a VCF.

    The file is streamed line by line.  Lines up to and including line
    number ``header_end`` (1-based, as returned by ``find_header_end``) are
    treated as header; every later line is a variant record.  For each
    alternate allele of a record on one of ``Chroms`` the allele's MAF and
    length are counted per variant class, and the number of samples carrying
    the allele is counted among all ``samples`` and among ``unrel_samples``.

    Args:
        fn: path of the VCF.  Passed to ``find_header_end`` and opened with
            ``gzip.open`` unless ``bed_exclude`` is given.
        samples: names of the sample columns whose genotypes are inspected.
        unrel_samples: sample names counted as unrelated.
        Chroms: container of chromosome names to process; records on other
            chromosomes are neither counted nor written to the output VCF.
        bed_exclude: optional BED path; when set the VCF is read through
            ``bedtools intersect -a fn -b bed_exclude -v -header``.
        prefix: unused.
        output_dir: directory of the annotated VCF (only used when
            ``add_unrel_to_info`` is true).
        maf_col: INFO key handed to ``parse_info_col``.
        add_unrel_to_info: when true, write a copy of the VCF to
            ``output_dir + '/' + vcf_name`` with ``NREF_UNREL`` and ``NREF``
            appended to INFO (one value per alternate allele).
        vcf_name: file name of the annotated VCF.

    Returns:
        Tuple of six DataFrames, each passed through ``clean_hist_dfs``:
        MAF histogram per SNV/INS/DEL, MAF histogram per SNV/INDEL, length
        histogram per SNV/INS/DEL, length histogram per SNV/INDEL, and the
        MAF histograms (both groupings) restricted to alleles carried by
        exactly one unrelated sample.
    """
    header_end, header_line = find_header_end(fn)

    cols_dict = {i: header_line.index(i) for i in header_line}

    print("Starting MAF Histogram Extraction...")
    print("{} variants_processed {}".format(
        0, CM.datestring(hour=True, minute=True)))
    variants_processed = 1000000

    proc = None
    if bed_exclude:
        command = [
            'bedtools', 'intersect', '-a', fn, '-b', bed_exclude, '-v',
            '-header'
        ]
        proc = subprocess.Popen(command, stdout=subprocess.PIPE)
        F = proc.stdout
    else:
        F = gzip.open(fn)

    var_classes = ['SNV', 'INS', 'DEL']
    hist_dicts = [{}, {}, {}]

    var_classes_simple = ['SNV', 'INDEL']
    hist_dicts_simple = [{}, {}]

    hist_dicts_var_lengths = [{}, {}, {}]
    hist_dicts_var_lengths_simple = [{}, {}]

    nr_var_dict = [{}, {}, {}]
    nr_var_dict_simple = [{}, {}]

    # detailed class -> simple class
    type_convert = {'SNV': 'SNV', 'INS': 'INDEL', 'DEL': 'INDEL'}

    # INFO declarations inserted just before the line at ``header_end``
    nref_header_lines = [
        '##INFO=<ID=NREF_UNREL,Number=A,Type=Integer,Description="number of non-reference samples per allele in unrelated individuals">',
        '##INFO=<ID=NREF,Number=A,Type=Integer,Description="number of non-reference samples per allele in all samples">',
    ]

    # set membership instead of scanning the list for every sample/allele
    unrel_set = set(unrel_samples)

    count = 0
    vcf_out = None

    try:
        if add_unrel_to_info:
            vcf_out = open(output_dir + '/' + vcf_name, 'w')

        for line in F:
            count += 1

            if count <= header_end:
                # Header region: copy through, adding the new INFO
                # declarations ahead of the line at ``header_end``.  The
                # declarations are written from their own variable so the
                # original line is not overwritten and lost.
                if add_unrel_to_info:
                    if count == header_end:
                        for header in nref_header_lines:
                            vcf_out.write(header + '\n')
                    vcf_out.write(line.rstrip() + '\n')
                continue

            var_num = count - header_end

            nr_at_site_unrel = []
            nr_at_site_all = []

            line = line.rstrip()
            lin_spl = line.split()
            INFO = lin_spl[cols_dict['INFO']]
            REF = lin_spl[cols_dict['REF']]
            ALT = lin_spl[cols_dict['ALT']]
            CHROM = lin_spl[cols_dict['#CHROM']]

            if var_num == variants_processed:
                print("{} variants_processed {}".format(
                    variants_processed,
                    CM.datestring(hour=True, minute=True)))
                variants_processed += 1000000

            if CHROM not in Chroms:
                continue

            # Classify indels and snvs from the REF/ALT columns; one entry
            # per alternate allele so multi-allelic sites are counted
            # allele by allele.
            var_types, var_lengths = indel_classifier_2(REF, ALT)

            mafs = parse_info_col(INFO, maf_col)

            # allele numbers as they appear in the GT field (1 = first ALT)
            allele_nums = range(1, len(mafs) + 1)

            for vt, length, maf, an in zip(var_types, var_lengths, mafs,
                                           allele_nums):

                ind_full = var_classes.index(vt)
                var_simple = type_convert[vt]
                ind_simple = var_classes_simple.index(var_simple)

                # add to hist dicts (detailed classes)
                hist_dicts[ind_full][maf] = hist_dicts[ind_full].get(
                    maf, 0) + 1
                hist_dicts_var_lengths[ind_full][
                    length] = hist_dicts_var_lengths[ind_full].get(
                        length, 0) + 1

                # same tallies for the simple classes
                hist_dicts_simple[ind_simple][
                    maf] = hist_dicts_simple[ind_simple].get(maf, 0) + 1
                hist_dicts_var_lengths_simple[ind_simple][
                    length] = hist_dicts_var_lengths_simple[ind_simple].get(
                        length, 0) + 1

                # Count samples carrying this allele.  Genotypes other than
                # '0/0' and './.' are split on '/' and parsed as integers.
                nr_ur_samps = 0
                nr_all_samps = 0
                for samp in samples:
                    gt = lin_spl[cols_dict[samp]].split(':')[0]
                    if gt in ('0/0', './.'):
                        continue
                    alleles = [int(a) for a in gt.split('/')]
                    if an in alleles:
                        nr_all_samps += 1
                        if samp in unrel_set:
                            nr_ur_samps += 1

                # Exactly one entry per alternate allele (matches the
                # Number=A declaration); a zero count is a regular entry
                # and must not be appended a second time.
                nr_at_site_unrel.append(nr_ur_samps)
                nr_at_site_all.append(nr_all_samps)

                # alleles that are singletons among the unrelated samples
                if nr_ur_samps == 1:
                    nr_var_dict[ind_full][maf] = nr_var_dict[ind_full].get(
                        maf, 0) + 1
                    nr_var_dict_simple[ind_simple][
                        maf] = nr_var_dict_simple[ind_simple].get(maf, 0) + 1

            # write the record with the per-allele counts added to INFO
            if add_unrel_to_info:
                nref_formatted_unr = ";NREF_UNREL=" + ','.join(
                    [str(v) for v in nr_at_site_unrel])
                nref_formatted_all = ";NREF=" + ','.join(
                    [str(v) for v in nr_at_site_all])

                lin_spl[cols_dict['INFO']] += nref_formatted_unr
                lin_spl[cols_dict['INFO']] += nref_formatted_all

                vcf_out.write("\t".join(lin_spl) + '\n')

    finally:
        # release the output file, the input stream and the helper process
        # even when a record fails to parse
        if vcf_out is not None:
            vcf_out.close()
        F.close()
        if proc is not None:
            proc.wait()

    def _as_frame(dicts, classes):
        # one row per variant class, labelled, then cleaned for saving
        frame = pd.DataFrame(dicts)
        frame['variant_types'] = classes
        return clean_hist_dfs(frame)

    # return as df for saving later
    df = _as_frame(hist_dicts, var_classes)
    df2 = _as_frame(hist_dicts_simple, var_classes_simple)
    df3 = _as_frame(hist_dicts_var_lengths, var_classes)
    df4 = _as_frame(hist_dicts_var_lengths_simple, var_classes_simple)
    df5 = _as_frame(nr_var_dict, var_classes)
    df6 = _as_frame(nr_var_dict_simple, var_classes_simple)

    return df, df2, df3, df4, df5, df6
def run_from_args(args):
    """Compute replication-rate statistics for sample pairs and save them.

    Reads the info table and the pair list, runs ``replication_lm`` and
    ``generate_per_pair_summary`` and saves the four resulting frames with
    ``CM.save_dataframe`` into ``args.output_dir``.

    Args:
        args: parsed command-line namespace.  Attributes read here:
            ``gt_tsv`` (passed through to ``replication_lm``);
            ``info_pkl`` (read with ``pd.read_table``, first column as
            index); ``pairs`` (path; each line is split on whitespace into
            one pair entry); ``id`` (passed as ``identifier_col``);
            ``suff_df`` (optional suffix for the per-site statistic
            columns); ``output_dir``; ``suffix`` (optional suffix for the
            saved variable names).
    """
    gt_fn = args.gt_tsv
    info = pd.read_table(args.info_pkl, index_col=0)

    id_col = args.id

    with open(args.pairs) as handle:
        pairs = [line.rstrip().split() for line in handle]

    print('calculating replication rate statistics {}'.format(
        CM.datestring(hour=True, minute=True)))

    # Make sure both ``SVTYPE`` and ``cnv_class`` exist: ``SVTYPE`` is copied
    # from ``cnv_class`` when that column is present, otherwise
    # ``cnv_class`` is copied from ``SVTYPE``.
    try:
        info['SVTYPE'] = info['cnv_class']
    except KeyError:
        info['cnv_class'] = info.SVTYPE

    rr_df, collapsed_stats_df, per_pair_df = replication_lm(
        gt_fn, info, pairs, identifier_col=id_col)
    per_pair_summary = generate_per_pair_summary(per_pair_df)

    if args.suff_df:
        suff_df = str(args.suff_df)

        # NOTE(review): this column list is restored from a commented-out
        # line; without it this branch failed on the undefined name.
        # Confirm it matches the columns ``replication_lm`` produces.
        cols = [
            'number_twins_with_var', 'number_concordant_with_var',
            'number_discordant_with_var', 'discordance_score', 'identifier'
        ]
        rename = {col: col + suff_df for col in cols}

        # might need to have some suffixes if using this script on multiple
        # sets of data
        rr_df.rename(columns=rename, inplace=True)

    output_location = args.output_dir

    suffix = args.suffix if args.suffix else ''
    var_name_info = 'rr_per_site' + suffix
    var_name_pair_rr_summary = 'rr_summary' + suffix
    var_name_per_pair = 'rr_per_pair' + suffix
    var_name_per_pair_summary = 'rr_per_pair_summary' + suffix

    print('calculation complete {}'.format(
        CM.datestring(hour=True, minute=True)))

    CM.save_dataframe(var_name_info,
                      rr_df,
                      output_location,
                      print_vars_recorded_loc=False)
    CM.save_dataframe(var_name_pair_rr_summary,
                      collapsed_stats_df,
                      output_location,
                      print_vars_recorded_loc=False)
    CM.save_dataframe(var_name_per_pair,
                      per_pair_df,
                      output_location,
                      print_vars_recorded_loc=False)
    CM.save_dataframe(var_name_per_pair_summary,
                      per_pair_summary,
                      output_location,
                      print_vars_recorded_loc=False)