Beispiel #1
0
def get_params_cluster_step(sample, cfg, out, pp_file, param_file, XX, XY):
    '''
    Collect the settings needed by the cluster step.

    Options are read from the config file at `cfg`; purity/ploidy and the
    read parameters are obtained through svp_load. When truthy, XX / XY
    override the 'male' option from the config (XY wins if both are set).

    Returns a tuple (sample_params, cluster_params) of dicts.
    Raises ValueError if no config file could be read.
    '''
    parser = ConfigParser.ConfigParser()
    if not parser.read(cfg):
        raise ValueError('No configuration file found')

    def cluster_opt(option, convert):
        # Fetch one option from [ClusterParameters] and convert it.
        return convert(parser.get('ClusterParameters', option))

    coverage = float(parser.get('BamParameters', 'mean_cov'))
    max_clusters = cluster_opt('clus_limit', int)
    iterations = cluster_opt('n_iter', int)
    n_threads = cluster_opt('threads', int)
    n_repeat = cluster_opt('repeat', int)
    is_adjusted = cluster_opt('adjusted', string_to_bool)
    is_male = cluster_opt('male', string_to_bool)
    n_sv_to_sim = cluster_opt('sv_to_sim', int)

    # Command-line sex flags take priority over the config value.
    if XY:
        is_male = True
    elif XX:
        is_male = False

    purity, ploidy = svp_load.get_purity_ploidy(pp_file, sample, out)
    read_len, insert_size, _ = svp_load.get_read_params(
        param_file, sample, out)

    sample_params = dict([
        ('sample', sample),
        ('ploidy', ploidy),
        ('pi', purity),
        ('rlen', read_len),
        ('insert', insert_size),
        ('mean_cov', coverage),
    ])
    cluster_params = dict([
        ('n_iter', iterations),
        ('male', is_male),
        ('adjusted', is_adjusted),
        ('sv_to_sim', n_sv_to_sim),
        ('threads', n_threads),
        ('repeat', n_repeat),
        ('clus_limit', max_clusters),
    ])
    return sample_params, cluster_params
Beispiel #2
0
def get_params_cluster_step(sample, cfg, out, pp_file, param_file, XX, XY):
    '''
    Load in parameters used in cluster step from config file

    Options come from the BamParameters, BetaParameters, ClusterParameters
    and OutputParameters sections of the config file at `cfg`. Purity/ploidy
    and read parameters are obtained through svp_load.

    XX / XY, when truthy, override the 'male' option from the config
    (XY wins if both are set).

    'merge_iter' and 'merge_burn' are optional: each one independently
    falls back to a quarter of n_iter / burn when it is absent.

    Returns a tuple (sample_params, cluster_params, output_params) of dicts.
    Raises ValueError if no config file could be read; a missing mandatory
    option propagates the ConfigParser error.
    '''

    Config = ConfigParser.ConfigParser()
    cfg_file = Config.read(cfg)

    # read() returns the list of files it managed to parse.
    if len(cfg_file) == 0:
        raise ValueError('No configuration file found')

    mean_cov = float(Config.get('BamParameters', 'mean_cov'))
    shape = float(Config.get('BetaParameters', 'alpha'))
    scale = float(Config.get('BetaParameters', 'beta'))
    # Kept as the raw config string (not converted here).
    fixed_alpha = Config.get('BetaParameters', 'fixed_alpha')
    phi_limit = float(Config.get('ClusterParameters', 'phi_limit'))
    clus_limit = int(Config.get('ClusterParameters', 'clus_limit'))
    subclone_diff = float(Config.get('ClusterParameters', 'subclone_diff'))
    hpd_alpha = float(Config.get('ClusterParameters', 'hpd_alpha'))

    n_runs = int(Config.get('ClusterParameters', 'n_runs'))
    n_iter = int(Config.get('ClusterParameters', 'n_iter'))
    burn = int(Config.get('ClusterParameters', 'burn'))
    thin = int(Config.get('ClusterParameters', 'thin'))
    threads = int(Config.get('ClusterParameters', 'threads'))
    # Kept as the raw config string (not converted here).
    nclus_init = Config.get('ClusterParameters', 'nclus_init')
    restrict_cnss = string_to_bool(
        Config.get('ClusterParameters', 'restrict_cnv_search_space'))

    use_map = string_to_bool(Config.get('ClusterParameters', 'map'))
    merge_clusts = string_to_bool(Config.get('ClusterParameters', 'merge'))
    cocluster = string_to_bool(Config.get('ClusterParameters', 'cocluster'))
    adjusted = string_to_bool(Config.get('ClusterParameters', 'adjusted'))
    cnv_pval = float(Config.get('ClusterParameters', 'clonal_cnv_pval'))
    adjust_phis = string_to_bool(Config.get('ClusterParameters',
                                            'adjust_phis'))
    male = string_to_bool(Config.get('ClusterParameters', 'male'))
    sv_to_sim = int(Config.get('ClusterParameters', 'sv_to_sim'))

    plot = string_to_bool(Config.get('OutputParameters', 'plot'))
    ccf_reject = float(Config.get('OutputParameters', 'ccf_reject_threshold'))
    smc_het = string_to_bool(Config.get('OutputParameters', 'smc_het'))
    fit_metric = Config.get('OutputParameters', 'fit_metric')
    cluster_penalty = int(Config.get('OutputParameters', 'cluster_penalty'))

    # Each optional merge setting gets its own fallback so that a value the
    # user did supply is not discarded just because the other one is absent.
    # NOTE(review): n_iter / 4 is integer division on Python 2 and true
    # division (then rounded) on Python 3 -- confirm which is intended.
    try:
        merge_iter = int(Config.get('ClusterParameters', 'merge_iter'))
    except ConfigParser.NoOptionError:
        merge_iter = int(round(n_iter / 4))

    try:
        merge_burn = int(Config.get('ClusterParameters', 'merge_burn'))
    except ConfigParser.NoOptionError:
        merge_burn = int(round(burn / 4))

    if burn == 0 and use_map:
        print('No burn-in period specified, setting MAP to false.')
        use_map = False

    # Command-line sex flags take priority over the config value.
    if XX:
        male = False
    if XY:
        male = True

    pi, pl = svp_load.get_purity_ploidy(pp_file, sample, out)
    # The third returned value is not needed here.
    rlen, insert, _ = svp_load.get_read_params(param_file, sample, out)

    sample_params = {
        'sample': sample,
        'ploidy': pl,
        'pi': pi,
        'rlen': rlen,
        'insert': insert,
        'mean_cov': mean_cov
    }
    cluster_params = {
        'n_runs': n_runs,
        'n_iter': n_iter,
        'burn': burn,
        'thin': thin,
        'alpha': shape,
        'beta': scale,
        'use_map': use_map,
        'hpd_alpha': hpd_alpha,
        'fixed_alpha': fixed_alpha,
        'male': male,
        'merge_clusts': merge_clusts,
        'adjusted': adjusted,
        'phi_limit': phi_limit,
        'clus_limit': clus_limit,
        'subclone_diff': subclone_diff,
        'cocluster': cocluster,
        'clonal_cnv_pval': cnv_pval,
        'adjust_phis': adjust_phis,
        'sv_to_sim': sv_to_sim,
        'threads': threads,
        'ccf_reject': ccf_reject,
        'nclus_init': nclus_init,
        'restrict_cnss': restrict_cnss,
        'merge_iter': merge_iter,
        'merge_burn': merge_burn
    }
    output_params = {
        'plot': plot,
        'smc_het': smc_het,
        'cluster_penalty': cluster_penalty,
        'fit_metric': fit_metric
    }

    return sample_params, cluster_params, output_params
Beispiel #3
0
def run_post_assign(args):
    '''
    Entry point for the post-assign step.

    Steps, in order:
      1. Read filter / post-assign thresholds and SV class definitions
         from the config file named by args.cfg.
      2. Resolve the run directory under the output directory. When
         args.run is 'best', the single sub-directory starting with
         'best_run' (and not ending in 'post_assign') is used.
      3. Load the previously filtered variants from the output directory,
         then the raw SV / SNV inputs, and attach copy-number genotypes
         (matched from the CNV file, or a ploidy-based default).
      4. Pass the variants to get_var_to_assign / post_assign_vars, and
         call amend_coclus_results when both SVs and SNVs are present.

    Raises:
        ValueError: no config file, missing output directory, ambiguous
            or missing best-run directory, no variant input given, or no
            filtered variants available.
        OSError: run directory, SNV file or SV file does not exist.
    '''
    sample          = args.sample
    cfg             = args.cfg
    # Fall back to the sample name when no output directory was given.
    out             = sample if args.out == "" else args.out
    snv_file        = args.snv_file
    snv_format      = args.snv_format
    sv_file         = args.sv_file
    cnv_file        = args.cnv_file
    sv_filt_file    = args.sv_filt_file
    snv_filt_file   = args.snv_filt_file
    run             = args.run
    gml             = args.germline
    XX              = args.XX
    XY              = args.XY

    Config = ConfigParser.ConfigParser()
    cfg_file = Config.read(cfg)
    # read() returns the list of files it managed to parse.
    if len(cfg_file) == 0:
        raise ValueError('No configuration file found')

    strict_cf = string_to_bool(Config.get('FilterParameters', 'strict_cnv_filt'))
    sv_offset = int(Config.get('FilterParameters', 'sv_offset'))
    gl_th     = int(Config.get('FilterParameters', 'germline_threshold'))
    cp_th     = float(Config.get('PostAssignParameters', 'clus_percent_threshold'))
    ca_th     = int(Config.get('PostAssignParameters', 'clus_absolute_threshold'))
    rc_all    = string_to_bool(Config.get('PostAssignParameters', 'reclassify_all'))
    clus_th   = {'percent': cp_th, 'absolute': ca_th}

    dna_gain_class = Config.get('SVclasses', 'dna_gain_class').split(',')
    dna_loss_class = Config.get('SVclasses', 'dna_loss_class').split(',')
    cdefs   = {'dna_gain_class': dna_gain_class, 'dna_loss_class': dna_loss_class}

    if out != '' and not os.path.exists(out):
        raise ValueError('Specified output directory does not exist!')

    if run == 'best':
        # Only the immediate sub-directories of the output directory.
        run_dirs = [d for d in next(os.walk(out))[1]
                    if d.startswith('best_run') and not d.endswith('post_assign')]
        if len(run_dirs) > 1:
            raise ValueError('More than 1 best run directory exists! Please specify which run to use.')
        elif len(run_dirs) < 1:
            raise ValueError('No best run directories exist! Please specify which run to use.')
        run = run_dirs[0]

    # A run whose name ends in "snvs" is processed without SV input.
    if run.endswith("snvs"):
        sv_file = ''

    rundir = '%s/%s' % (out, run)
    if not os.path.exists(rundir):
        raise OSError('Specified run directory does not exist!')

    if snv_file == '' and sv_file == '':
        raise ValueError('No variants specified!')

    if snv_file != '' and not os.path.exists(snv_file):
        raise OSError('Specified SNV file does not exist!')

    if sv_file != '' and not os.path.exists(sv_file):
        raise OSError('Specified SV file does not exist!')

    # Default locations of the filtered variant tables.
    sv_filt_file = '%s/%s_filtered_svs.tsv' % (out, sample) if sv_filt_file == '' else sv_filt_file
    snv_filt_file = '%s/%s_filtered_snvs.tsv' % (out, sample) if snv_filt_file == '' else snv_filt_file

    sv_filt_df  = pd.DataFrame()
    snv_filt_df = pd.DataFrame()

    if os.path.exists(sv_filt_file):
        sv_filt_df = pd.read_csv(sv_filt_file, delimiter='\t', dtype=None, header=0, low_memory=False)
        sv_filt_df = pd.DataFrame(sv_filt_df).fillna('')

    if os.path.exists(snv_filt_file):
        snv_filt_df = pd.read_csv(snv_filt_file, delimiter='\t', dtype=None, header=0, low_memory=False)
        snv_filt_df = pd.DataFrame(snv_filt_df).fillna('')
        # Build a real list: map() is a lazy iterator on Python 3.
        snv_filt_df['support'] = [float(v) for v in snv_filt_df['var'].values]

    if len(sv_filt_df) == 0 and len(snv_filt_df) == 0:
        raise ValueError('Output directory filtered variant files do not exist or are empty!')

    param_file = '%s/read_params.txt' % out
    pp_file = '%s/purity_ploidy.txt' % out

    # The output parameters and the insert std-dev are not needed here.
    sample_params, cluster_params, _output_params = \
                   load_data.get_params_cluster_step(sample, cfg, out, pp_file, param_file, XX, XY)
    rlen, insert, _insert_std = svp_load.get_read_params(param_file, sample, out)
    purity = sample_params['pi']
    ploidy = sample_params['ploidy']

    sv_df  = pd.DataFrame()
    snv_df = pd.DataFrame()

    if snv_file != '':
        # An unrecognised snv_format leaves snv_df empty.
        if snv_format == 'sanger':
            snv_df = load_data.load_snvs_sanger(snv_file)
        elif snv_format == 'mutect':
            snv_df = load_data.load_snvs_mutect(snv_file, sample)
        elif snv_format == 'mutect_callstats':
            snv_df = load_data.load_snvs_mutect_callstats(snv_file)
        elif snv_format == 'consensus':
            snv_df = load_data.load_snvs_consensus(snv_file)
        elif snv_format == 'multisnv':
            snv_df = load_data.load_snvs_multisnv(snv_file, sample)

    if sv_file != "":
        sv_df = load_data.load_svs(sv_file)
        if gml != "":
            sv_df = filt.filter_germline(gml, sv_df, rlen, insert, gl_th)

    if cnv_file != "":
        cnv_df = load_data.load_cnvs(cnv_file)

        if len(sv_df) > 0:
            print('Matching copy-numbers for SVs...')
            # First call uses the defaults; second call matches on the
            # second break-end columns and writes 'gtype2'.
            sv_df = filt.match_copy_numbers(sv_df, cnv_df, strict_cf, sv_offset)
            sv_df = filt.match_copy_numbers(sv_df, cnv_df, strict_cf, sv_offset,
                    ['chr2', 'pos2', 'dir2', 'classification', 'pos1'], 'gtype2')

        if len(snv_df) > 0:
            print('Matching copy-numbers for SNVs...')
            snv_df = filt.match_snv_copy_numbers(snv_df, cnv_df)
            n = len(snv_df)
            # Materialise the list before wrapping it in an array: on
            # Python 3, np.array(map(...)) yields a 0-d object array.
            snv_df['gtype'] = np.array(
                [filt.remove_zero_copynumbers(g) for g in snv_df.gtype.values])
            # Rows whose gtype came back as '' are dropped.
            snv_df = snv_df[snv_df.gtype != '']
            print('Filtered out %d SNVs with no copy-numbers' % (n - len(snv_df)))
    else:
        print('No CNV input defined, assuming all loci major/minor allele copy-numbers are ploidy/2')

        maj_allele = round(ploidy / 2) if round(ploidy) > 1 else 1
        min_allele = round(ploidy / 2) if round(ploidy) > 1 else 0
        default_gtype = '%d,%d,1.0' % (maj_allele, min_allele)

        if len(sv_df) > 0:
            sv_df['gtype1'] = default_gtype
            sv_df['gtype2'] = default_gtype

        if len(snv_df) > 0:
            snv_df['gtype'] = default_gtype

    if len(sv_df) > 0:
        sv_to_assign = get_var_to_assign(sv_df, sv_filt_df)
        sv_to_assign = filt.adjust_sv_read_counts(sv_to_assign, purity, ploidy, 0, rlen, Config)

        post_assign_vars(sv_to_assign, sv_filt_df, rundir, sample, sample_params,
                         cluster_params, clus_th, rc_all, cdefs)

    if len(snv_df) > 0:
        snv_to_assign = get_var_to_assign(snv_df, snv_filt_df, snvs=True)

        post_assign_vars(snv_to_assign, snv_filt_df, rundir, sample, sample_params,
                         cluster_params, clus_th, rc_all, cdefs, snvs=True)

    if len(sv_df) > 0 and len(snv_df) > 0:
        amend_coclus_results(rundir, sample, sample_params)
Beispiel #4
0
def run(args):
    '''
    Filter the SV and/or SNV input of a sample and write the results.

    Thresholds and switches come from the config file named by args.cfg.
    Variants are loaded, passed through the simple filters (and the
    germline filter for SVs when a germline file is given), annotated with
    copy-number genotypes (matched from the CNV file, or a ploidy-based
    default when no CNV file is given) and passed through run_cnv_filter.
    Surviving variants are written to '<out>/<sample>_filtered_svs.tsv'
    and '<out>/<sample>_filtered_snvs.tsv'.

    Raises ValueError when the config file cannot be read, when purity is
    outside [0, 1], or when no variants remain to be written.
    '''
    sample = args.sample
    sv_path = args.procd_svs
    germline_path = args.germline
    cnv_path = args.cnvs
    out = args.out
    param_file = args.param_file
    snv_path = args.snvs
    snv_format = args.snv_format
    pp_file = args.pp_file
    cfg = args.cfg
    blist_file = args.blist

    parser = ConfigParser.ConfigParser()
    if not parser.read(cfg):
        raise ValueError('No configuration file found')

    def filter_int(option):
        # Integer option from the [FilterParameters] section.
        return int(parser.get('FilterParameters', option))

    def filter_flag(option):
        # Boolean option from the [FilterParameters] section.
        return string_to_bool(parser.get('FilterParameters', option))

    max_cn = int(parser.get('BamParameters', 'max_cn'))
    valid_chrs = parser.get('ValidationParameters', 'chroms').split(',')
    gl_th = filter_int('germline_threshold')
    sv_offset = filter_int('sv_offset')
    min_split = filter_int('min_split')
    min_span = filter_int('min_span')
    size_filter = filter_int('size_filter')
    min_dep = filter_int('min_dep')
    filter_chrs = filter_flag('filter_chroms')
    neutral = filter_flag('neutral')
    filter_otl = filter_flag('filter_outliers')
    strict_cnv_filt = filter_flag('strict_cnv_filt')
    filter_subclonal_cnvs = filter_flag('filter_subclonal_cnvs')

    # Use the sample name as output directory when none was given.
    if out == "":
        out = sample
    if out != '' and not os.path.exists(out):
        os.makedirs(out)

    purity, ploidy = svp_load.get_purity_ploidy(pp_file, sample, out)
    rlen, insert, _ = svp_load.get_read_params(param_file, sample, out)

    blist = pd.DataFrame()
    if blist_file.lower() not in ('', 'none'):
        blist = pd.DataFrame(svp_load.load_blacklist(blist_file))

    if not 0 <= purity <= 1:
        raise ValueError("Tumour purity value not between 0 and 1!")

    # Arguments shared by every run_cnv_filter call below.
    cnv_filter_args = (cnv_path, ploidy, neutral, filter_otl,
                       strict_cnv_filt, filter_subclonal_cnvs, max_cn)

    def default_genotype():
        # Genotype string used for every locus when no CNV input exists.
        if round(ploidy) > 1:
            major = minor = round(ploidy / 2)
        else:
            major, minor = 1, 0
        return '%d,%d,1.0' % (major, minor)

    sv_df = pd.DataFrame()
    cnv_df = pd.DataFrame()
    snv_df = pd.DataFrame()

    if snv_path != "":
        # One loader per supported format; an unknown format leaves the
        # (empty) frame untouched before filtering.
        snv_loaders = {
            'sanger': lambda: load_data.load_snvs_sanger(snv_path),
            'mutect': lambda: load_data.load_snvs_mutect(snv_path, sample),
            'mutect_callstats':
                lambda: load_data.load_snvs_mutect_callstats(snv_path),
            'consensus': lambda: load_data.load_snvs_consensus(snv_path),
            'multisnv': lambda: load_data.load_snvs_multisnv(snv_path, sample),
        }
        load_snvs = snv_loaders.get(snv_format)
        if load_snvs is not None:
            snv_df = load_snvs()
        snv_df = run_simple_snv_filter(snv_df, min_dep, blist, filter_chrs,
                                       valid_chrs)

    if sv_path != "":
        sv_df = load_data.load_svs(sv_path)
        sv_df = run_simple_filter(sv_df, rlen, insert, min_split, min_span,
                                  size_filter, min_dep, filter_chrs,
                                  valid_chrs, blist)
        if germline_path != "":
            sv_df = filter_germline(germline_path, sv_df, rlen, insert, gl_th)

    if cnv_path == "":
        print(
            'No CNV input defined, assuming all loci major/minor allele copy-numbers are ploidy/2'
        )
        if len(sv_df) > 0:
            gtype = default_genotype()
            sv_df['gtype1'] = gtype
            sv_df['gtype2'] = gtype
            # Without CNV input, SVs only pass through the CNV filter when
            # outlier filtering is switched on.
            if filter_otl:
                sv_df = run_cnv_filter(sv_df, *cnv_filter_args)
        if len(snv_df) > 0:
            snv_df['gtype'] = default_genotype()
            snv_df = run_cnv_filter(snv_df, *cnv_filter_args, are_snvs=True)
    else:
        cnv_df = load_data.load_cnvs(cnv_path)

        if len(sv_df) > 0:
            print('Matching copy-numbers for SVs...')
            sv_df = match_copy_numbers(sv_df, cnv_df, strict_cnv_filt,
                                       sv_offset)
            second_end_cols = ['chr2', 'pos2', 'dir2', 'classification', 'pos1']
            sv_df = match_copy_numbers(sv_df, cnv_df, strict_cnv_filt,
                                       sv_offset, second_end_cols, 'gtype2')
            sv_df = run_cnv_filter(sv_df, *cnv_filter_args)

        if len(snv_df) > 0:
            print('Matching copy-numbers for SNVs...')
            snv_df = match_snv_copy_numbers(snv_df, cnv_df)
            snv_df = run_cnv_filter(snv_df, *cnv_filter_args, are_snvs=True)

    if not (len(sv_df) > 0 or len(snv_df) > 0):
        raise ValueError('No variants found to output!')

    if len(sv_df) > 0:
        sv_df.index = range(len(sv_df))  # reindex
        sv_df = adjust_sv_read_counts(sv_df, purity, ploidy, min_dep, rlen,
                                      parser)
        sv_out = '%s/%s_filtered_svs.tsv' % (out, sample)
        sv_df.to_csv(sv_out, sep='\t', index=False, na_rep='')
        print('Final filtered SV count: %d' % len(sv_df))

    if len(snv_df) > 0:
        snv_df = sort_by_loc(snv_df)
        snv_df.index = range(len(snv_df))  # reindex
        snv_out = '%s/%s_filtered_snvs.tsv' % (out, sample)
        snv_df.to_csv(snv_out, sep='\t', index=False, na_rep='')
        print('Final filtered SNV count: %d' % len(snv_df))