Example #1
def main(bcbio_dir, bed, depth, threads=None, isdebug=True):
    snp_file = verify_file(bed)
    depth_cutoff = depth

    log.init(isdebug)

    try:
        import az
    except ImportError:
        parallel_cfg = ParallelCfg(threads=threads)
    else:
        sys_cfg = az.init_sys_cfg()
        parallel_cfg = ParallelCfg(
            scheduler=sys_cfg.get('scheduler'),
            queue=sys_cfg.get('queue'),
            resources=sys_cfg.get('resources'),
            threads=threads or sys_cfg.get('threads'),
            tag='clearup')

    log.info('Loading bcbio project from ' + bcbio_dir)
    log.info('-' * 70)
    proj = BcbioProject()
    proj.load_from_bcbio_dir(bcbio_dir, proc_name='clearup')
    log.info('Loaded ' + proj.final_dir)
    log_dir = safe_mkdir(join(proj.log_dir, 'clearup'))
    work_dir = safe_mkdir(join(proj.work_dir, 'clearup'))
    out_dir = safe_mkdir(join(proj.date_dir, 'clearup'))
    with parallel_view(len(proj.samples), parallel_cfg, log_dir) as parall_view:
        genotype(proj.samples, snp_file, parall_view, work_dir, out_dir, proj.genome_build)
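
Example #1 falls back to a plain thread-only ParallelCfg when the site-specific `az` module is not installed. A minimal sketch of that optional-import pattern, factored into a helper (the helper name is illustrative; ParallelCfg is imported as in Example #5):

from ngs_utils.parallel import ParallelCfg

def build_parallel_cfg(threads=None):
    try:
        import az  # site-specific config module; may be absent outside the original environment
    except ImportError:
        return ParallelCfg(threads=threads)
    sys_cfg = az.init_sys_cfg()
    return ParallelCfg(
        scheduler=sys_cfg.get('scheduler'),
        queue=sys_cfg.get('queue'),
        resources=sys_cfg.get('resources'),
        threads=threads or sys_cfg.get('threads'),
        tag='clearup')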
Example #2
def main(paths, output_dir, genome, depth):
    log.init(True)

    bed_files = [verify_file(f, is_critical=True) for f in paths if isfile(f)]

    bcbio_projs = []
    dirs = [verify_dir(f, is_critical=True) for f in paths if isdir(f)]
    if dirs:
        for d in dirs:
            proj = BcbioProject()
            proj.load_from_bcbio_dir(d, proc_name='clearup')
            bcbio_projs.append(proj)

    build_snps_panel(bcbio_projs, bed_files, safe_mkdir(output_dir), genome)
Example #3
def main(host, port):
    clearup.HOST_IP = host
    clearup.PORT = port
    log.init(True, join(DATA_DIR, 'log_server.txt'), save_previous=True)

    os.environ['FLASK_DEBUG'] = '1'
    # log_path = join(DATA_DIR, 'flask.log')
    # handler = RotatingFileHandler(log_path, maxBytes=10000, backupCount=10)
    # handler.setLevel(logging.INFO)
    # app.logger.addHandler(handler)

    http_server = WSGIServer((host, port), app, handler_class=WebSocketHandler)
    log.info('Starting a webserver at ' + host + ':' + str(port))
    http_server.serve_forever()
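
Example #3 serves the Flask app through gevent so the WebSocketHandler can upgrade connections. A minimal, self-contained sketch of that wiring (the host, port and trivial route are placeholders):

from flask import Flask
from gevent.pywsgi import WSGIServer
from geventwebsocket.handler import WebSocketHandler

app = Flask(__name__)

@app.route('/')
def index():
    return 'ok'

if __name__ == '__main__':
    # gevent's WSGIServer with the websocket handler replaces app.run()
    http_server = WSGIServer(('0.0.0.0', 5000), app, handler_class=WebSocketHandler)
    http_server.serve_forever()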
Example #4
def main(input_bed, output_file, output_features=False, genome=None,
         only_canonical=False, short=False, extended=False, high_confidence=False,
         ambiguities_method=False, coding_only=False, collapse_exons=False, work_dir=False, is_debug=False):
    """ Annotating BED file based on reference features annotations.
    """
    logger.init(is_debug_=is_debug)

    if not genome:
        raise click.BadParameter('Error: please specify the genome build name with -g (e.g. `-g hg19`)', param_hint='genome')

    if short:
        if extended:        raise click.BadParameter('--short and --extended can\'t both be set', param_hint='extended')
        if output_features: raise click.BadParameter('--short and --output-features can\'t both be set', param_hint='output_features')
    elif output_features or extended:
        extended = True
        short    = False

    if not verify_file(input_bed):
        raise click.BadParameter(f'Usage: {__file__} Input_BED_file -g hg19 -o Annotated_BED_file [options]', param_hint='input_bed')
    input_bed = verify_file(input_bed, is_critical=True, description=f'Input BED file for {__file__}')

    keep_work_dir = bool(work_dir)
    if work_dir:
        work_dir = join(adjust_path(work_dir), os.path.splitext(basename(input_bed))[0])
        safe_mkdir(work_dir)
        info(f'Created work directory {work_dir}')
    else:
        work_dir = mkdtemp('bed_annotate')
        debug(f'Created temporary work directory {work_dir}')

    input_bed = clean_bed(input_bed, work_dir)
    input_bed = verify_bed(input_bed, is_critical=True, description=f'Input BED file for {__file__} after cleaning')

    output_file = adjust_path(output_file)

    output_file = annotate(
        input_bed, output_file, work_dir, genome=genome,
        only_canonical=only_canonical, short=short, extended=extended,
        high_confidence=high_confidence, collapse_exons=collapse_exons,
        output_features=output_features,
        ambiguities_method=ambiguities_method, coding_only=coding_only,
        is_debug=is_debug)

    if not keep_work_dir:
        debug(f'Removing temporary work directory {work_dir}')
        shutil.rmtree(work_dir)

    info(f'Done, saved to {output_file}')
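
Example #4 reports option problems by raising click.BadParameter, which click renders as a usage error with a non-zero exit code. A minimal sketch of that pattern in a standalone command (option names are illustrative):

import click

@click.command()
@click.argument('input_bed')
@click.option('-g', 'genome', default=None, help='Genome build name, e.g. hg19')
def annotate_cmd(input_bed, genome):
    if not genome:
        # click turns BadParameter into a usage error and exits non-zero
        raise click.BadParameter('please specify a genome build with -g (e.g. `-g hg19`)',
                                 param_hint='genome')
    click.echo(f'Annotating {input_bed} against {genome}')

if __name__ == '__main__':
    annotate_cmd()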
Example #5
from flask_script import Manager

from ngs_utils.file_utils import safe_mkdir, file_transaction, intermediate_fname, can_reuse, verify_dir
from ngs_utils.utils import is_local, is_us, is_uk
from ngs_utils import logger as log, call_process
from ngs_utils.parallel import ParallelCfg, parallel_view
from ngs_utils.bcbio import BcbioProject

from clearup.panel import get_dbsnp
from clearup.callable import batch_callable_bed
from clearup.model import Project, Sample, db, SNP, get_or_create_run, Run
from clearup import app, DATA_DIR, parallel_cfg, DEPTH_CUTOFF
from clearup.utils import bam_samplename, get_ref_fasta

manager = Manager(app)
log.init(True)


def _add_project(bam_by_sample,
                 project_name,
                 bed_file=None,
                 use_callable=False,
                 data_dir='',
                 genome='hg19',
                 min_depth=DEPTH_CUTOFF,
                 depth_by_sample=None,
                 reuse_files=False):
    fp_proj = Project.query.filter(Project.name == project_name).first()
    if fp_proj:
        fp_proj.delete(reuse_files=reuse_files)
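
Example #5 builds a Flask-Script Manager around the app at import time. A minimal sketch of how commands are usually registered and dispatched with that (now legacy) extension; the command name and body here are illustrative:

from flask import Flask
from flask_script import Manager

app = Flask(__name__)
manager = Manager(app)

@manager.command
def add_project(project_name):
    """Run as: python manage.py add_project NAME"""
    print(f'Adding project {project_name}')

if __name__ == '__main__':
    manager.run()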
Example #6
def main(output_dir=None,
         tumor_bam=None,
         normal_bam=None,
         normal_name=None,
         tumor_name=None,
         genome=None,
         input_genomes_url=None,
         ref_fa=None,
         viruses_fa=None,
         repeat_masker_bed=None,
         breakend_pon=None,
         bp_pon=None,
         bp_hotspots=None,
         min_tumor_af=None,
         requested_cores=None,
         unlock=False,
         dryrun=False,
         maxcoverage=None,
         chunksize_mil=None,
         jvm_heap=None,
         externalaligner=None):

    conf = {}

    output_dir = output_dir or 'gridss_results'
    output_dir = safe_mkdir(abspath(output_dir))
    log_dir = safe_mkdir(join(output_dir, 'log'))
    logger.init(log_fpath_=join(log_dir, 'gridss.log'), save_previous=True)
    if isfile(join(output_dir, 'work', 'all.done')):
        run_simple('rm ' + join(output_dir, 'work', 'all.done'))
    conf['output_dir'] = adjust_path(output_dir)

    tumor_name = tumor_name or splitext_plus(basename(tumor_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    if normal_bam:
        normal_name = normal_name or splitext_plus(basename(normal_bam))[0]\
            .replace('-ready', '').replace('-sorted', '')
        conf['normal_bam'] = verify_file(normal_bam, 'Normal BAM, -N option')
        conf['normal_name'] = normal_name
    conf['tumor_bam'] = verify_file(tumor_bam, 'Tumor BAM, -T option')
    conf['tumor_name'] = tumor_name

    try:
        machine_cores = len(os.sched_getaffinity(0))
    except:
        machine_cores = 1
    cores = min(machine_cores, 8)
    if requested_cores:
        cores = min(cores, requested_cores)
    conf['cores'] = cores

    if maxcoverage:
        conf['maxcoverage'] = maxcoverage
    if chunksize_mil:
        conf['chunksize_mil'] = chunksize_mil
    if jvm_heap:
        conf['jvm_heap'] = jvm_heap
    if externalaligner:
        conf['externalaligner'] = externalaligner

    conf['genome'] = genome
    try:
        from reference_data import api as refdata
    except ImportError:
        pass
    else:
        # check reference_data can find the genomes dir, and error out if not
        genomes_dir = refdata.find_genomes_dir(input_genomes_url)
        if genomes_dir:
            conf['genomes_dir'] = genomes_dir

    if ref_fa:
        if externalaligner != 'minimap2' and not verify_file(ref_fa + '.bwt'):
            log.critical(f'Please index {ref_fa} using: '
                         f'bwa index {ref_fa}')
        if not verify_file(ref_fa + '.fai'):
            log.critical(f'Please index {ref_fa} using: '
                         f'samtools faidx {ref_fa}')
        conf['ref_fa'] = ref_fa
    if viruses_fa:
        if externalaligner != 'minimap2' and not verify_file(viruses_fa + '.bwt'):
            log.critical(f'Please, index {viruses_fa} using: '
                         f'    bwa index {viruses_fa}')
        if not verify_file(viruses_fa + '.fai'):
            log.critical(f'Please, index {viruses_fa} using '
                         f'    samtools faidx {viruses_fa}')
        dict_file = viruses_fa.replace('.fa', '.dict')
        if not verify_file(dict_file):
            log.critical(f'Please, index {viruses_fa} using: '
                         f'   samtools dict {viruses_fa} -o {dict_file}')
        img_file = viruses_fa + '.img'
        if not verify_file(img_file):
            log.critical(
                f'Please, create an img file for {viruses_fa} using:\n'
                f'   gatk BwaMemIndexImageCreator -I  {viruses_fa} -O {img_file}'
            )

        conf['viruses_fa'] = verify_file(viruses_fa)
    if repeat_masker_bed:
        conf['repeat_masker_bed'] = repeat_masker_bed
    if breakend_pon:
        conf['breakend_pon'] = breakend_pon
    if bp_pon:
        conf['bp_pon'] = bp_pon
    if bp_hotspots:
        conf['bp_hotspots'] = bp_hotspots
    if min_tumor_af:
        conf['min_tumor_af'] = min_tumor_af

    py_path = sys.executable  # e.g. /miniconda/envs/umccrise_hmf/bin/python
    env_path = dirname(dirname(py_path))  # e.g. /miniconda/envs/umccrise_hmf
    found = glob.glob(join(env_path, 'share/gridss-*/gridss.jar'))
    if not found:
        hmf_env_path = secondary_conda_env('hmf', is_critical=False)
        if hmf_env_path:
            found = glob.glob(join(hmf_env_path, 'share/gridss-*/gridss.jar'))
            conf['gridss_env'] = hmf_env_path
        if not found:
            critical(
                'Cannot find gridss JAR. Make sure you ran `conda install -c bioconda gridss`'
            )
    conf['gridss_jar'] = found[0]

    run_snakemake(join(package_path(), 'gridss', 'Snakefile'),
                  conf,
                  cores=cores,
                  output_dir=output_dir,
                  unlock=unlock,
                  dryrun=dryrun)
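
Example #6 derives its core count from the CPUs the process is actually allowed to use, falling back when sched_getaffinity is unavailable. A small standalone sketch of that pattern (the cap of 8 mirrors the example):

import os

def pick_cores(requested_cores=None, cap=8):
    try:
        # respects cgroup/affinity limits; Linux-only
        machine_cores = len(os.sched_getaffinity(0))
    except AttributeError:
        machine_cores = os.cpu_count() or 1
    cores = min(machine_cores, cap)
    if requested_cores:
        cores = min(cores, requested_cores)
    return cores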
Example #7
def main(ctx, subdirs, output_dir, threads=1, fp_size=Params.L, isdebug=False):
    """ Generates a PNG image with a relatedness heatmap.
    """
    if not subdirs:
        ctx.fail('Provide at least one input directory.')

    datasets = _load_datasets(subdirs)

    title = ', '.join(d.name for d in datasets) + '\nL=' + str(fp_size)
    if not Params.NORMALIZE_VAR: title += ', not norm by var'
    if not Params.NORMALIZE_DIST: title += ', not norm by dist'
    if Params.SKIP_DAMAGE: title += ', skipped damage'
    if Params.SKIP_REJECT: title += ', skipped REJECT'
    if Params.SKIP_NOCALL: title += ', skipped num called = 0'
    if Params.MIN_AF: title += ', min AF=' + str(Params.MIN_AF)
    if Params.MIN_DIST: title += ', min dist=' + str(Params.MIN_DIST)
    if Params.INTERREGION_PAIRS: title += ', used SNP pairs between regions'
    else: title += ', skipped SNP pairs between regions'

    run_id = '__'.join(d.name for d in datasets)

    run_dir = safe_mkdir(join((output_dir or join(code_dir, 'runs')), run_id))
    log.init(isdebug, join(run_dir, 'log.txt'), save_previous=True)
    work_dir = safe_mkdir(join(run_dir, 'work'))

    all_vcf_by_label = dict()
    bed_files_by_genome = defaultdict(set)
    for d in datasets:
        all_vcf_by_label.update(d.vcf_by_label)
        if d.bed_file:
            bed_files_by_genome[d.genome].add(
                d.bed_file)  # d.bed_file=None for WGS

    genome_by_label = dict()
    for d in datasets:
        for label in d.vcf_by_label:
            genome_by_label[label] = d.genome

    parallel_cfg = ParallelCfg(threads=threads)
    log.info(f'Starting using {parallel_cfg.threads} threads')

    with parallel_view(len(all_vcf_by_label), parallel_cfg,
                       work_dir) as parall_view:
        overlap_bed_file_by_genome = dict()
        if bed_files_by_genome:
            overlap_bed_file_by_genome = _prep_bed(work_dir,
                                                   bed_files_by_genome,
                                                   overlap_bed_file_by_genome)
            log.info('Slicing VCFs to regions in BED files')
            out = parall_view.run(_slice_vcf_fn, [[
                work_dir, label, vcf,
                overlap_bed_file_by_genome.get(genome_by_label[label])
            ] for label, vcf in all_vcf_by_label.items()])
            all_vcf_by_label = dict(out)
            log.info()

        log.info('Calculating fingerprints for individual samples')
        out = parall_view.run(make_fingerprint, [[
            vcf, work_dir, label, fp_size,
            overlap_bed_file_by_genome.get(genome_by_label[label])
        ] for label, vcf in all_vcf_by_label.items()])
        print_label_pairs = dict(out)
        log.info()

    log.info('Comparing fingerprints pairwise')
    pairwise_dict = defaultdict(dict)
    for ((label1, print1), (label2,
                            print2)) in it.combinations_with_replacement(
                                print_label_pairs.items(), 2):
        dist, pvalue = compare(print1, print2)
        if dist:
            log.info(
                f'   {label1} VS {label2}: {dist:.2f}, Pvalue={pvalue:.2f}')
        else:
            log.info(f'   {label1} VS {label2}: failed to calculate')
            dist = float('NaN')
        pairwise_dict[label1][label2] = dist
        pairwise_dict[label2][label1] = dist

    log.info('Plotting comparison heatmap')
    plot_heatmap(pairwise_dict, run_dir, title)
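
The pairwise loop above uses itertools.combinations_with_replacement so every unordered pair (self-pairs included) is computed once, then mirrored into a symmetric dict for the heatmap. A self-contained sketch with a toy distance function:

import itertools as it
from collections import defaultdict

def toy_distance(a, b):
    return abs(len(a) - len(b))

prints = {'sampleA': 'xxxx', 'sampleB': 'xx', 'sampleC': 'xxxxxx'}
pairwise = defaultdict(dict)
for (label1, p1), (label2, p2) in it.combinations_with_replacement(prints.items(), 2):
    dist = toy_distance(p1, p2)
    pairwise[label1][label2] = dist
    pairwise[label2][label1] = dist  # mirror so lookups work in either order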
Example #8
def main(input_bed,
         output_file,
         output_features=False,
         genome=None,
         only_canonical=False,
         short=False,
         extended=False,
         high_confidence=False,
         ambiguities_method=False,
         coding_only=False,
         collapse_exons=False,
         work_dir=False,
         is_debug=False):
    """ Annotating BED file based on reference features annotations.
    """
    logger.init(is_debug_=is_debug)

    if not genome:
        raise click.BadParameter(
            'Error: please specify the genome build name with -g (e.g. `-g hg19`)',
            param_hint='genome')

    if short:
        if extended:
            raise click.BadParameter(
                '--short and --extended can\'t both be set', param_hint='extended')
        if output_features:
            raise click.BadParameter(
                '--short and --output-features can\'t both be set',
                param_hint='output_features')
    elif output_features or extended:
        extended = True
        short = False

    if not verify_file(input_bed):
        raise click.BadParameter(
            f'Usage: {__file__} Input_BED_file -g hg19 -o Annotated_BED_file [options]',
            param_hint='input_bed')
    input_bed = verify_file(input_bed,
                            is_critical=True,
                            description=f'Input BED file for {__file__}')

    keep_work_dir = bool(work_dir)
    if work_dir:
        work_dir = join(adjust_path(work_dir),
                        os.path.splitext(basename(input_bed))[0])
        safe_mkdir(work_dir)
        info(f'Created work directory {work_dir}')
    else:
        work_dir = mkdtemp('bed_annotate')
        debug(f'Created temporary work directory {work_dir}')

    input_bed = clean_bed(input_bed, work_dir)
    input_bed = verify_bed(
        input_bed,
        is_critical=True,
        description=f'Input BED file for {__file__} after cleaning')

    output_file = adjust_path(output_file)

    output_file = annotate(input_bed,
                           output_file,
                           work_dir,
                           genome=genome,
                           only_canonical=only_canonical,
                           short=short,
                           extended=extended,
                           high_confidence=high_confidence,
                           collapse_exons=collapse_exons,
                           output_features=output_features,
                           ambiguities_method=ambiguities_method,
                           coding_only=coding_only,
                           is_debug=is_debug)

    if not keep_work_dir:
        debug(f'Removing temporary work directory {work_dir}')
        shutil.rmtree(work_dir)

    info(f'Done, saved to {output_file}')
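
Both annotate examples (4 and 8) fall back to a temporary work directory when none is supplied; remembering whether the directory was user-provided keeps the final cleanup unambiguous. A minimal sketch of that pattern:

import shutil
from tempfile import mkdtemp

def run_with_work_dir(work_dir=None):
    keep_work_dir = work_dir is not None
    if work_dir is None:
        work_dir = mkdtemp('bed_annotate')
    try:
        pass  # do the actual work inside work_dir here
    finally:
        if not keep_work_dir:
            shutil.rmtree(work_dir)  # only temporary directories are removed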
Example #9
def main(output_dir=None,
         normal_bam=None,
         tumor_bam=None,
         snv_vcf=None,
         normal_name=None,
         tumor_name=None,
         sample=None,
         genome=None,
         genomes_dir=None,
         gridss_ref_dir=None,
         ref_fa=None,
         threads=None,
         jvmheap=None):

    gridss_linx_dir = abspath(join(package_path(), '..', 'gridss-purple-linx'))
    gridss_scripts_dir = abspath(join(package_path(), '..', 'gridss/scripts'))

    normal_name = normal_name or splitext_plus(basename(normal_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    tumor_name = tumor_name or splitext_plus(basename(tumor_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    sample = sample or tumor_name

    output_dir = safe_mkdir(abspath(output_dir or 'gridss'))
    logger.init(log_fpath_=join(output_dir, 'gridss.log'), save_previous=True)
    output_vcf = join(output_dir, f'{sample}-gridss-purple-linx.vcf')

    assert genome == 'GRCh37', 'Only GRCh37 is supported for GRIDSS at the moment'

    if genomes_dir:
        refdata.find_genomes_dir(genomes_dir)
    if not gridss_ref_dir:
        gridss_ref_dir = refdata.get_ref_file(genome, 'gridss_purple_linx_dir')
    if not ref_fa:
        ref_fa = refdata.get_ref_file(genome, 'fa')

    hmf_env_path = conda_utils.secondary_conda_env('hmf')

    gridss_jar = glob.glob(join(hmf_env_path, 'share/gridss-*/gridss.jar'))[0]
    amber_jar = glob.glob(
        join(hmf_env_path, 'share/hmftools-amber-*/amber.jar'))[0]
    cobalt_jar = glob.glob(
        join(hmf_env_path, 'share/hmftools-cobalt-*/cobalt.jar'))[0]
    purple_jar = glob.glob(
        join(hmf_env_path, 'share/hmftools-purple-*/purple.jar'))[0]
    linx_jar = glob.glob(
        join(hmf_env_path, 'share/hmftools-linx-*/sv-linx.jar'))[0]

    cmd = f"""
PATH={hmf_env_path}/bin:$PATH \
THREADS={threads} \
GRIDSS_JAR={gridss_jar} \
AMBER_JAR={amber_jar} \
COBALT_JAR={cobalt_jar} \
PURPLE_JAR={purple_jar} \
LINX_JAR={linx_jar} \
bash -x {join(gridss_linx_dir, 'gridss-purple-linx.sh')} \
-n {normal_bam} \
-t {tumor_bam} \
-v {output_vcf} \
-s {sample} \
--normal_sample {normal_name} \
--tumour_sample {tumor_name} \
--snvvcf {snv_vcf} \
--ref_dir {gridss_ref_dir} \
--install_dir {gridss_scripts_dir} \
--reference {ref_fa} \
--output_dir {output_dir} \
{f"--jvmheap {jvmheap}" if jvmheap else ""}
""".strip()

    try:
        run_simple(cmd)
    except subprocess.SubprocessError:
        err('--------\n')
        err(f'Error running GRIDSS-PURPLE-LINX.\n')
        raise
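
Example #9 assembles the whole GRIDSS-PURPLE-LINX invocation as one shell string and hands it to run_simple, assumed here to be a thin shell-out wrapper. A rough standalone equivalent using the standard library keeps the same error propagation, since CalledProcessError is a subclass of SubprocessError:

import subprocess

def run_simple_equivalent(cmd):
    # shell=True because the command string relies on env-var prefixes and $PATH
    subprocess.run(cmd, shell=True, check=True)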
Example #10
def main(prefix,
         output_bedpe,
         output_fasta=None,
         output_json=None,
         min_read_support=None,
         ensembl_release=None,
         peptide_flanking_len=None,
         debug=False,
         no_filtering=False,
         check_transcript=True,
         pizzly_ref_fa=None,
         reads=None,
         min_tpm=None):

    # input_flat_fpath = prefix + '-flat.tsv'
    input_json_fpath = prefix + '.json'
    input_fasta = prefix + '.fusions.fasta'
    output_bedpe = abspath(output_bedpe)

    logger.init(debug)

    global ENSEMBL_RELEASE
    ENSEMBL_RELEASE = ensembl_release
    ebl = EnsemblRelease(ENSEMBL_RELEASE)

    # Reading filtered tsv
    # filt_fusions = set()
    # with open(input_flat_fpath) as f:
    #     for row in csv.DictReader(f, delimiter='\t'):
    #         filt_fusions.add((row['geneA.name'], row['geneB.name']))

    # Read json
    json_data = {'genes': []}
    with open(input_json_fpath) as f:
        data = json.load(f)
        for g_event in data['genes']:
            gene_a, gene_b = g_event['geneA']['name'], g_event['geneB']['name']
            # if (gene_a, gene_b) in filt_fusions:
            json_data['genes'].append(g_event)

    # Read fasta
    fasta_dict = SeqIO.index(input_fasta, 'fasta')

    # First round: genomic coordinates and fasta
    logger.info(
        f'Round 1: reading {len(json_data["genes"])} gene-pairs events from pizzly JSON'
    )
    fusions = []
    for g_event in json_data[
            'genes']:  # {'geneA', 'geneB', 'paircount', 'splitcount', 'transcripts', 'readpairs'}
        gene_a, gene_b = g_event['geneA']['name'], g_event['geneB']['name']
        # logger.info(f'Processing event {gene_a}>>{gene_b}')

        met_fasta_keys = set()  # collecting to get rid of duplicate transcript events
        for t_event in g_event['transcripts']:
            fusion = Fusion.create_from_pizzly_event(ebl, t_event)

            if check_transcript:
                if not _transcript_is_good(
                        fusion.side_5p.trx) or not _transcript_is_good(
                            fusion.side_3p.trx):
                    # logger.info(f'Transcripts {fusion.side_5p.trx} and {fusion.side_3p.trx} didn\'t pass check')
                    continue

            if no_filtering is not True and fusion.support < min_read_support:
                continue

            calc_positions_ok = fusion.calc_genomic_positions()
            if not calc_positions_ok:
                continue

            # comparing our fasta to pizzly fasta
            fusion.fasta_rec = fasta_dict[t_event['fasta_record']]
            _check_fusion_fasta(fusion.fasta_rec, fusion)

            # skipping duplicate fastas
            k = fusion.side_5p.trx.id, fusion.side_3p.trx.id, fusion.fasta
            assert k not in met_fasta_keys
            met_fasta_keys.add(k)

            fusions.append(fusion)
        # if not met_fasta_keys:
        #     logger.info('   Filtered all fusions for this gene pair.')
        if met_fasta_keys:
            logger.info(
                f'Keeping {len(met_fasta_keys)} fusion(s) for the event {gene_a}-{gene_b}'
            )

    if not fusions:
        logger.warn('Finished: no fusions passed filtering')

    # Calculate expression of fused transcripts
    expr_by_fusion = None
    if reads and fusions:
        # filtered fasta for re-calling expression
        work_dir = safe_mkdir(splitext(output_bedpe)[0] + '_quant')
        fasta_path = join(work_dir, 'fusions.fasta')
        fasta_recs = [f.fasta_rec for f in fusions]
        SeqIO.write(fasta_recs, fasta_path, 'fasta')

        if pizzly_ref_fa:
            expr_by_fusion = requanitify_pizzly(pizzly_ref_fa, fasta_path,
                                                work_dir, reads)
            # expr_by_fusion = {fusion-fasta-id -> {length  eff_length  est_counts   tpm}}

    # Second round: peptides and expression
    logger.info()
    logger.info(
        f'Round 2: making peptides for {len(fusions)} events in '
        f'{len(set([(f.side_3p.trx.gene.name, f.side_5p.trx.gene.name) for f in fusions]))} genes pairs'
    )
    met_peptide_keys = set()  # collecting to get rid of duplicate peptides
    bedpe_entries = []
    peptide_fusions = []
    if peptide_flanking_len is not None and peptide_flanking_len < 0:
        peptide_flanking_len = None
    for fusion in fusions:
        if fusion.side_3p.trx.contains_start_codon:
            logger.info(
                f'Translating {fusion.side_5p.trx.gene.name}>>{fusion.side_3p.trx.gene.name} fusion: {fusion}'
            )
            fusion.make_peptide(peptide_flanking_len)
            if fusion.peptide:
                _verify_peptides(fusion.fasta_rec, fusion,
                                 peptide_flanking_len)

            # skipping duplicate peptides
            k = fusion.side_5p.trx.gene.name, fusion.side_3p.trx.gene.name, fusion.peptide
            if k in met_peptide_keys:
                logger.debug(f'Skipping peptide {k}: already added')
                continue
            met_peptide_keys.add(k)

        # writing bedpe
        entry = fusion.to_bedpe()

        # add expression
        if expr_by_fusion:
            entry.update(expr_by_fusion[fusion.fasta_rec.id])
            tpm = float(entry['tpm'])
            if no_filtering is not True and tpm < min_tpm:
                logger.debug(
                    f'Skipping peptide {entry}: TPM={tpm} is below {min_tpm}')
                continue

        if fusion.peptide:
            peptide_fusions.append(fusion)
        bedpe_entries.append(entry)

    # Writing bedpe
    with open(output_bedpe, 'w') as bedpe_fh:
        bedpe_header = [
            'chr 5p',
            'start 5p',
            'end 5p',
            'chr 3p',
            'start 3p',
            'end 3p',
            'name',
            'tier',
            'strand 5p',
            'strand 3p',
            'support',
            'is canon bndry',
            'inframe',
            'peptide',
            'fusion pos',
            'nt in the break',
            'transcripts',
            'is canon intron dinuc',
        ]
        if expr_by_fusion:
            bedpe_header.extend(list(expr_by_fusion.values())[0].keys())
        bedpe_writer = csv.DictWriter(bedpe_fh,
                                      fieldnames=bedpe_header,
                                      delimiter='\t')
        bedpe_writer.writeheader()
        for bedpe_entry in bedpe_entries:
            bedpe_writer.writerow(bedpe_entry)

    # _test_pvac(output_bedpe)

    # Write fasta
    if output_fasta:
        SeqIO.write([f.fasta_rec for f in peptide_fusions], output_fasta,
                    'fasta')

    logger.info()
    logger.info(
        f'Written {len(peptide_fusions)} fusions in '
        f'{len(set([(f.side_3p.trx.gene.name, f.side_5p.trx.gene.name) for f in peptide_fusions]))} '
        f'gene pairs good peptides bedpe: {output_bedpe}')
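
Example #10 leans on Biopython's SeqIO twice: index() for lazy, dict-like access to the pizzly FASTA by record id, and write() to emit the filtered records. A minimal sketch of those two calls (file and record names are placeholders):

from Bio import SeqIO

fasta_dict = SeqIO.index('fusions.fasta', 'fasta')   # lazy, keyed by record id
record = fasta_dict['some_record_id']                 # placeholder id
SeqIO.write([record], 'filtered.fasta', 'fasta')      # accepts any iterable of SeqRecords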
Example #11
def main(subdirs, output_dir, threads=1, fp_size=Params.L, isdebug=False):
    """ Generates a PNG image with a relatedness heatmap.
    """
    datasets = _load_datasets(subdirs)

    title = ', '.join(d.name for d in datasets) + '\nL=' + str(fp_size)
    if not Params.NORMALIZE_VAR: title += ', not norm by var'
    if not Params.NORMALIZE_DIST: title += ', not norm by dist'
    if Params.SKIP_DAMAGE: title += ', skipped damage'
    if Params.SKIP_REJECT: title += ', skipped REJECT'
    if Params.SKIP_NOCALL: title += ', skipped num called = 0'
    if Params.MIN_AF: title += ', min AF=' + str(Params.MIN_AF)
    if Params.MIN_DIST: title += ', min dist=' + str(Params.MIN_DIST)
    if Params.INTERREGION_PAIRS: title += ', used SNP pairs between regions'
    else: title += ', skipped SNP pairs between regions'

    run_id = '__'.join(d.name for d in datasets)

    run_dir = safe_mkdir(join((output_dir or join(basedir, 'runs')), run_id))
    log.init(isdebug, join(run_dir, 'log.txt'), save_previous=True)
    work_dir = safe_mkdir(join(run_dir, 'work'))

    all_vcf_by_label = dict()
    bed_files_by_genome = defaultdict(set)
    for d in datasets:
        all_vcf_by_label.update(d.vcf_by_label)
        bed_files_by_genome[d.genome].add(
            d.bed_file)  # d.bed_file=None for WGS

    genome_by_label = dict()
    for d in datasets:
        for label in d.vcf_by_label:
            genome_by_label[label] = d.genome

    parallel_cfg = ParallelCfg(threads=threads)
    log.info(f'Starting using {parallel_cfg.threads} threads')

    overlap_bed_file_by_genome = dict()
    with parallel_view(len(all_vcf_by_label), parallel_cfg,
                       work_dir) as parall_view:
        if bed_files_by_genome:
            log.info(f'Found BED files: {bed_files_by_genome}')
            for genome, bed_files in bed_files_by_genome.items():
                bed_files = [b for b in bed_files if b]
                log.info(f'Overlapping BED files for genome {genome}')
                overlap_bed_file_by_genome[genome] = _overlap_bed_files(bed_files, work_dir, genome) \
                    if bed_files else None

            primary_genome = sorted(bed_files_by_genome.items(),
                                    key=lambda kv: len(kv[1]))[-1][0]
            lifted_bed_files = []
            for genome, overlap_bed_file in overlap_bed_file_by_genome.items():
                if overlap_bed_file and genome != primary_genome:
                    lifted_bed_file = lift_over(overlap_bed_file, genome,
                                                primary_genome)
                    lifted_bed_files.append(lifted_bed_file)
            if lifted_bed_files:
                primary_bed_files = [
                    b for b in lifted_bed_files +
                    [overlap_bed_file_by_genome[primary_genome]] if b
                ]
                overlap_bed_file_by_genome[
                    primary_genome] = _overlap_bed_files(
                        primary_bed_files, work_dir, primary_genome)

            log.info('Lifting BED files back')
            for genome in overlap_bed_file_by_genome:
                if genome != primary_genome:
                    overlap_bed_file_by_genome[genome] = lift_over(
                        overlap_bed_file_by_genome[primary_genome],
                        primary_genome, genome)
            log.info()

            log.info('Sorting, bgzipping and tabixing BED files')
            for g, bed in overlap_bed_file_by_genome.items():
                overlap_bed_file_by_genome[g] = bgzip_and_tabix(
                    sort_bed(bed, genome=g))
            log.info()

            log.info('Slicing VCFs to regions in BED files')
            out = parall_view.run(_slice_vcf_fn, [[
                work_dir, label, vcf,
                overlap_bed_file_by_genome.get(genome_by_label[label])
            ] for label, vcf in all_vcf_by_label.items()])
            all_vcf_by_label = dict(out)
            log.info()

        log.info('Calculating fingerprints for individual samples')
        out = parall_view.run(make_fingerprint, [[
            vcf, work_dir, label, fp_size,
            overlap_bed_file_by_genome[genome_by_label[label]]
        ] for label, vcf in all_vcf_by_label.items()])
        print_label_pairs = dict(out)
        log.info()

    log.info('Comparing fingerprints pairwise')
    pairwise_dict = defaultdict(dict)
    for ((label1, print1), (label2,
                            print2)) in it.combinations_with_replacement(
                                print_label_pairs.items(), 2):
        dist, pvalue = compare(print1, print2)
        if dist:
            log.info(
                f'   {label1} VS {label2}: {dist:.2f}, Pvalue={pvalue:.2f}')
        else:
            log.info(f'   {label1} VS {label2}: failed to calculate')
            dist = float('NaN')
        pairwise_dict[label1][label2] = dist
        pairwise_dict[label2][label1] = dist

    log.info('Plotting comparison heatmap')
    plot_heatmap(pairwise_dict, run_dir, title)
Example #12
def main(prefix,
         output_bedpe,
         output_fasta=None,
         output_json=None,
         support=None,
         ensembl_release=None,
         peptide_flanking_len=None,
         debug=False):

    pizzly_flat_filt_fpath = prefix + '-flat-filtered.tsv'
    pizzly_json_fpath = prefix + '.json'
    input_fasta = prefix + '.fusions.fasta'
    output_bedpe = abspath(output_bedpe)

    logger.init(debug)

    ebl = EnsemblRelease(ensembl_release)

    # Reading filtered tsv
    filt_fusions = set()
    with open(pizzly_flat_filt_fpath) as f:
        for row in csv.DictReader(f, delimiter='\t'):
            filt_fusions.add((row['geneA.name'], row['geneB.name']))

    # Read json
    json_data = {'genes': []}
    with open(pizzly_json_fpath) as f:
        data = json.load(f)
        for g_event in data['genes']:
            gene_a, gene_b = g_event['geneA']['name'], g_event['geneB']['name']
            if (gene_a, gene_b) in filt_fusions:
                json_data['genes'].append(g_event)

    # Read fasta
    fasta_dict = SeqIO.index(input_fasta, 'fasta')

    filt_json_data = {'genes': []}
    filt_fasta_records = []
    filt_event_count = 0
    filt_transcript_event_count = 0

    # Write bedpe
    with open(output_bedpe, 'w') as bedpe_fh:
        bedpe_header = [
            'chr 5p',
            'start 5p',
            'end 5p',
            'chr 3p',
            'start 3p',
            'end 3p',
            'name',
            'tier',
            'strand 5p',
            'strand 3p',
            'support',
            'is canon bndry',
            'inframe',
            'peptide',
            'fusion pos',
            'nt in the break',
            'transcripts',
            'is canon intron dinuc',
        ]
        bedpe_writer = csv.DictWriter(bedpe_fh,
                                      fieldnames=bedpe_header,
                                      delimiter='\t')
        bedpe_writer.writeheader()

        for g_event in json_data[
                'genes']:  # {'geneA', 'geneB', 'paircount', 'splitcount', 'transcripts', 'readpairs'}
            gene_a, gene_b = g_event['geneA']['name'], g_event['geneB']['name']
            logger.info(gene_a + '>>' + gene_b)

            # # first pass to select the longest transcripts
            # def _longest_tx(key):
            #     return max((ebl.transcript_by_id(te[f'transcript{key}']['id']) for te in g_event['transcripts']), key=lambda t: len(t))
            # a_tx = _longest_tx('A')
            # b_tx = _longest_tx('B')
            # print(f'Longest transcriptA: {a_tx.id}, Longest transcriptB: {b_tx.id}')
            # try:
            #     t_event = [te for te in g_event['transcripts'] if te['transcriptA']['id'] == a_tx.id and te['transcriptB']['id'] == b_tx.id][0]
            # except:
            #     print(f"No event with 2 longest transcripts. Available events: {', '.join(te['transcriptA']['id'] +
            #           '>>' + te['transcriptB']['id'] for te in g_event['transcripts'])}")
            #     raise

            filt_g_event = {
                k: v
                for k, v in g_event.items() if k != 'readpairs'
            }
            filt_g_event['transcripts'] = []

            met_event_keys = set()  # collecting to get rid of duplicate transcript events
            met_peptide_keys = set()  # collecting to get rid of duplicate peptides
            bedpe_entries = []
            for t_event in g_event['transcripts']:
                if t_event['support'] < support:
                    continue

                fusion = Fusion.create_from_pizzly_event(ebl, t_event)
                if not fusion:  # not a good transcript
                    continue

                # skipping duplicate events
                k = fusion.side_5p.trx.id, fusion.side_3p.trx.id, fusion.side_5p.bp_offset, fusion.side_3p.bp_offset
                if k in met_event_keys: continue
                met_event_keys.add(k)

                # for writing filtered json
                filt_g_event['transcripts'].append(t_event)
                filt_transcript_event_count += 1

                # writing bedpe
                entry = fusion.to_bedpe(peptide_flanking_len)
                if not entry:
                    continue

                # skipping duplicate peptides
                k = entry['name'], entry['peptide']
                if k in met_peptide_keys: continue
                met_peptide_keys.add(k)

                bedpe_entries.append(entry)

                # for writing filtered fasta
                pizzly_fasta_rec = fasta_dict[t_event['fasta_record']]
                _check_fusion_fasta(pizzly_fasta_rec, fusion)
                filt_fasta_records.append(pizzly_fasta_rec)

                if fusion.peptide:
                    _verify_peptides(pizzly_fasta_rec, fusion,
                                     peptide_flanking_len)

            if not bedpe_entries:
                logger.warn(
                    f'All transcript events filtered out for fusion {gene_a}>>{gene_b}, skipping'
                )
            else:
                filt_json_data['genes'].append(filt_g_event)
                filt_event_count += 1
                for bedpe_entry in bedpe_entries:
                    bedpe_writer.writerow(bedpe_entry)

    # _test_pvac(output_bedpe)

    # Write filtered json
    if output_json:
        with open(output_json, 'w') as f:
            json.dump(filt_json_data, f, indent=4)

    # Write fasta
    if output_fasta:
        SeqIO.write(filt_fasta_records, output_fasta, 'fasta')

    logger.info()
    logger.info(f'Written {filt_transcript_event_count} transcript events '
                f'for {filt_event_count} fusions into bedpe: {output_bedpe}')
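
Both fusion examples (10 and 12) write their BEDPE output through csv.DictWriter with a tab delimiter, so each row is a plain dict keyed by the header names. A tiny self-contained sketch of that writer setup (file name and values are placeholders):

import csv

header = ['chr 5p', 'start 5p', 'end 5p', 'chr 3p', 'start 3p', 'end 3p', 'name']
with open('fusions.bedpe', 'w') as fh:
    writer = csv.DictWriter(fh, fieldnames=header, delimiter='\t')
    writer.writeheader()
    writer.writerow({'chr 5p': 'chr1', 'start 5p': 100, 'end 5p': 101,
                     'chr 3p': 'chr2', 'start 3p': 200, 'end 3p': 201,
                     'name': 'GENEA>>GENEB'})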