def main(bcbio_dir, bed, depth, threads=None, isdebug=True):
    snp_file = verify_file(bed)
    depth_cutoff = depth
    log.init(isdebug)

    try:
        import az
    except ImportError:
        parallel_cfg = ParallelCfg(threads=threads)
    else:
        sys_cfg = az.init_sys_cfg()
        parallel_cfg = ParallelCfg(
            scheduler=sys_cfg.get('scheduler'),
            queue=sys_cfg.get('queue'),
            resources=sys_cfg.get('resources'),
            threads=threads or sys_cfg.get('threads'),
            tag='clearup')

    log.info('Loading bcbio project from ' + bcbio_dir)
    log.info('-' * 70)
    proj = BcbioProject()
    proj.load_from_bcbio_dir(bcbio_dir, proc_name='clearup')
    log.info('Loaded ' + proj.final_dir)

    log_dir = safe_mkdir(join(proj.log_dir, 'clearup'))
    work_dir = safe_mkdir(join(proj.work_dir, 'clearup'))
    out_dir = safe_mkdir(join(proj.date_dir, 'clearup'))

    with parallel_view(len(proj.samples), parallel_cfg, log_dir) as parall_view:
        genotype(proj.samples, snp_file, parall_view, work_dir, out_dir,
                 proj.genome_build)
def main(paths, output_dir, genome, depth):
    log.init(True)

    bed_files = [verify_file(f, is_critical=True) for f in paths if isfile(f)]

    bcbio_projs = []
    dirs = [verify_dir(f, is_critical=True) for f in paths if isdir(f)]
    if dirs:
        for d in dirs:
            proj = BcbioProject()
            proj.load_from_bcbio_dir(d, proc_name='clearup')
            bcbio_projs.append(proj)

    build_snps_panel(bcbio_projs, bed_files, safe_mkdir(output_dir), genome)
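# Usage sketch (hedged): `paths` may mix BED files and bcbio project directories,
# since the isfile()/isdir() checks above split them. The paths below are
# hypothetical, and this assumes main() can also be called directly rather than
# only through its CLI wrapper:
#
#   main(['/data/panel.bed', '/data/bcbio_project'], 'snps_out', genome='hg19', depth=5)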
def main(host, port):
    clearup.HOST_IP = host
    clearup.PORT = port
    log.init(True, join(DATA_DIR, 'log_server.txt'), save_previous=True)

    os.environ['FLASK_DEBUG'] = '1'
    # log_path = join(DATA_DIR, 'flask.log')
    # handler = RotatingFileHandler(log_path, maxBytes=10000, backupCount=10)
    # handler.setLevel(logging.INFO)
    # app.logger.addHandler(handler)

    http_server = WSGIServer((host, port), app, handler_class=WebSocketHandler)
    log.info('Starting a webserver at ' + host + ':' + str(port))
    http_server.serve_forever()
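# Note (assumption; the imports are not shown in this snippet): WSGIServer and
# WebSocketHandler used above are typically provided by gevent and gevent-websocket:
#
#   from gevent.pywsgi import WSGIServer
#   from geventwebsocket.handler import WebSocketHandler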
def main(input_bed, output_file, output_features=False, genome=None,
         only_canonical=False, short=False, extended=False, high_confidence=False,
         ambiguities_method=False, coding_only=False, collapse_exons=False,
         work_dir=False, is_debug=False):
    """Annotating BED file based on reference features annotations.
    """
    logger.init(is_debug_=is_debug)

    if not genome:
        raise click.BadParameter('Error: please, specify genome build name with -g (e.g. `-g hg19`)',
                                 param='genome')

    if short:
        if extended:
            raise click.BadParameter('--short and --extended can\'t be set both', param='extended')
        if output_features:
            raise click.BadParameter('--short and --output-features can\'t be set both',
                                     param='output_features')
    elif output_features or extended:
        extended = True
        short = False

    if not verify_file(input_bed):
        raise click.BadParameter(f'Usage: {__file__} Input_BED_file -g hg19 -o Annotated_BED_file [options]',
                                 param='input_bed')
    input_bed = verify_file(input_bed, is_critical=True,
                            description=f'Input BED file for {__file__}')

    if work_dir:
        work_dir = join(adjust_path(work_dir), os.path.splitext(basename(input_bed))[0])
        safe_mkdir(work_dir)
        info(f'Created work directory {work_dir}')
    else:
        work_dir = mkdtemp('bed_annotate')
        debug(f'Created temporary work directory {work_dir}')

    input_bed = clean_bed(input_bed, work_dir)
    input_bed = verify_bed(input_bed, is_critical=True,
                           description=f'Input BED file for {__file__} after cleaning')

    output_file = adjust_path(output_file)

    output_file = annotate(
        input_bed, output_file, work_dir, genome=genome,
        only_canonical=only_canonical, short=short, extended=extended,
        high_confidence=high_confidence, collapse_exons=collapse_exons,
        output_features=output_features, ambiguities_method=ambiguities_method,
        coding_only=coding_only, is_debug=is_debug)

    if not work_dir:
        debug(f'Removing work directory {work_dir}')
        shutil.rmtree(work_dir)

    info(f'Done, saved to {output_file}')
from flask_script import Manager

from ngs_utils.file_utils import safe_mkdir, file_transaction, intermediate_fname, can_reuse, verify_dir
from ngs_utils.utils import is_local, is_us, is_uk
from ngs_utils import logger as log, call_process
from ngs_utils.parallel import ParallelCfg, parallel_view
from ngs_utils.bcbio import BcbioProject

from clearup.panel import get_dbsnp
from clearup.callable import batch_callable_bed
from clearup.model import Project, Sample, db, SNP, get_or_create_run, Run
from clearup import app, DATA_DIR, parallel_cfg, DEPTH_CUTOFF
from clearup.utils import bam_samplename, get_ref_fasta

manager = Manager(app)

log.init(True)


def _add_project(bam_by_sample, project_name, bed_file=None, use_callable=False,
                 data_dir='', genome='hg19', min_depth=DEPTH_CUTOFF,
                 depth_by_sample=None, reuse_files=False):
    fp_proj = Project.query.filter(Project.name == project_name).first()
    if fp_proj:
        fp_proj.delete(reuse_files=reuse_files)
def main(output_dir=None, tumor_bam=None, normal_bam=None, normal_name=None,
         tumor_name=None, genome=None, input_genomes_url=None, ref_fa=None,
         viruses_fa=None, repeat_masker_bed=None, breakend_pon=None, bp_pon=None,
         bp_hotspots=None, min_tumor_af=None, requested_cores=None,
         unlock=False, dryrun=False, maxcoverage=None, chunksize_mil=None,
         jvm_heap=None, externalaligner=None):
    conf = {}

    output_dir = output_dir or 'gridss_results'
    output_dir = safe_mkdir(abspath(output_dir))
    log_dir = safe_mkdir(join(output_dir, 'log'))
    logger.init(log_fpath_=join(log_dir, 'gridss.log'), save_previous=True)
    if isfile(join(output_dir, 'work', 'all.done')):
        run_simple('rm ' + join(output_dir, 'work', 'all.done'))
    conf['output_dir'] = adjust_path(output_dir)

    tumor_name = tumor_name or splitext_plus(basename(tumor_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    if normal_bam:
        normal_name = normal_name or splitext_plus(basename(normal_bam))[0]\
            .replace('-ready', '').replace('-sorted', '')
        conf['normal_bam'] = verify_file(normal_bam, 'Normal BAM, -N option')
        conf['normal_name'] = normal_name
    conf['tumor_bam'] = verify_file(tumor_bam, 'Tumor BAM, -T option')
    conf['tumor_name'] = tumor_name

    try:
        machine_cores = len(os.sched_getaffinity(0))
    except Exception:
        machine_cores = 1
    cores = min(machine_cores, 8)
    if requested_cores:
        cores = min(cores, requested_cores)
    conf['cores'] = cores

    if maxcoverage:
        conf['maxcoverage'] = maxcoverage
    if chunksize_mil:
        conf['chunksize_mil'] = chunksize_mil
    if jvm_heap:
        conf['jvm_heap'] = jvm_heap
    if externalaligner:
        conf['externalaligner'] = externalaligner

    conf['genome'] = genome
    try:
        from reference_data import api as refdata
    except ImportError:
        pass
    else:
        # check reference_data can find the genomes dir, and error out if not
        genomes_dir = refdata.find_genomes_dir(input_genomes_url)
        if genomes_dir:
            conf['genomes_dir'] = genomes_dir

    if ref_fa:
        if externalaligner != 'minimap2' and not verify_file(ref_fa + '.bwt'):
            log.critical(f'Please, index {ref_fa} using'
                         f' bwa index {ref_fa}')
        if not verify_file(ref_fa + '.fai'):
            log.critical(f'Please, index {ref_fa} using'
                         f' samtools faidx {ref_fa}')
        conf['ref_fa'] = ref_fa

    if viruses_fa:
        if externalaligner != 'minimap2' and not verify_file(viruses_fa + '.bwt'):
            log.critical(f'Please, index {viruses_fa} using: '
                         f' bwa index {viruses_fa}')
        if not verify_file(viruses_fa + '.fai'):
            log.critical(f'Please, index {viruses_fa} using '
                         f' samtools faidx {viruses_fa}')
        dict_file = viruses_fa.replace('.fa', '.dict')
        if not verify_file(dict_file):
            log.critical(f'Please, index {viruses_fa} using: '
                         f' samtools dict {viruses_fa} -o {dict_file}')
        img_file = viruses_fa + '.img'
        if not verify_file(img_file):
            log.critical(
                f'Please, create an img file for {viruses_fa} using:\n'
                f' gatk BwaMemIndexImageCreator -I {viruses_fa} -O {img_file}')
        conf['viruses_fa'] = verify_file(viruses_fa)

    if repeat_masker_bed:
        conf['repeat_masker_bed'] = repeat_masker_bed
    if breakend_pon:
        conf['breakend_pon'] = breakend_pon
    if bp_pon:
        conf['bp_pon'] = bp_pon
    if bp_hotspots:
        conf['bp_hotspots'] = bp_hotspots
    if min_tumor_af:
        conf['min_tumor_af'] = min_tumor_af

    py_path = sys.executable  # e.g. /miniconda/envs/umccrise_hmf/bin/python
    env_path = dirname(dirname(py_path))  # e.g. /miniconda/envs/umccrise_hmf
    found = glob.glob(join(env_path, 'share/gridss-*/gridss.jar'))
    if not found:
        hmf_env_path = secondary_conda_env('hmf', is_critical=False)
        if hmf_env_path:
            found = glob.glob(join(hmf_env_path, 'share/gridss-*/gridss.jar'))
        if not found:
            critical('Cannot find gridss JAR. Make sure you ran '
                     '`conda install -c bioconda gridss`')
        conf['gridss_env'] = hmf_env_path
    conf['gridss_jar'] = found[0]

    run_snakemake(join(package_path(), 'gridss', 'Snakefile'), conf, cores=cores,
                  output_dir=output_dir, unlock=unlock, dryrun=dryrun)
def main(ctx, subdirs, output_dir, threads=1, fp_size=Params.L, isdebug=False):
    """Generates a PNG image with a relatedness heatmap.
    """
    if not subdirs:
        ctx.fail('Provide at least one input directory.')

    datasets = _load_datasets(subdirs)

    title = ', '.join(d.name for d in datasets) + '\nL=' + str(fp_size)
    if not Params.NORMALIZE_VAR:
        title += ', not norm by var'
    if not Params.NORMALIZE_DIST:
        title += ', not norm by dist'
    if Params.SKIP_DAMAGE:
        title += ', skipped damage'
    if Params.SKIP_REJECT:
        title += ', skipped REJECT'
    if Params.SKIP_NOCALL:
        title += ', skipped num called = 0'
    if Params.MIN_AF:
        title += ', min AF=' + str(Params.MIN_AF)
    if Params.MIN_DIST:
        title += ', min dist=' + str(Params.MIN_DIST)
    if Params.INTERREGION_PAIRS:
        title += ', used SNP pairs between regions'
    else:
        title += ', skipped SNP pairs between regions'

    run_id = '__'.join(d.name for d in datasets)
    run_dir = safe_mkdir(join((output_dir or join(code_dir, 'runs')), run_id))
    log.init(isdebug, join(run_dir, 'log.txt'), save_previous=True)
    work_dir = safe_mkdir(join(run_dir, 'work'))

    all_vcf_by_label = dict()
    bed_files_by_genome = defaultdict(set)
    for d in datasets:
        all_vcf_by_label.update(d.vcf_by_label)
        if d.bed_file:
            bed_files_by_genome[d.genome].add(d.bed_file)  # d.bed_file=None for WGS

    genome_by_label = dict()
    for d in datasets:
        for label in d.vcf_by_label:
            genome_by_label[label] = d.genome

    parallel_cfg = ParallelCfg(threads=threads)
    log.info(f'Starting using {parallel_cfg.threads} threads')

    with parallel_view(len(all_vcf_by_label), parallel_cfg, work_dir) as parall_view:
        overlap_bed_file_by_genome = dict()
        if bed_files_by_genome:
            overlap_bed_file_by_genome = _prep_bed(work_dir, bed_files_by_genome,
                                                   overlap_bed_file_by_genome)

        log.info('Slicing VCFs to regions in BED files')
        out = parall_view.run(_slice_vcf_fn, [
            [work_dir, label, vcf, overlap_bed_file_by_genome.get(genome_by_label[label])]
            for label, vcf in all_vcf_by_label.items()])
        all_vcf_by_label = dict(out)
        log.info()

        log.info('Calculating fingerprints for individual samples')
        out = parall_view.run(make_fingerprint, [
            [vcf, work_dir, label, fp_size, overlap_bed_file_by_genome.get(genome_by_label[label])]
            for label, vcf in all_vcf_by_label.items()])
        print_label_pairs = dict(out)
        log.info()

        log.info('Comparing fingerprints pairwise')
        pairwise_dict = defaultdict(dict)
        for ((label1, print1), (label2, print2)) in \
                it.combinations_with_replacement(print_label_pairs.items(), 2):
            dist, pvalue = compare(print1, print2)
            if dist:
                log.info(f' {label1} VS {label2}: {dist:.2f}, Pvalue={pvalue:.2f}')
            else:
                log.info(f' {label1} VS {label2}: failed to calculate')
                dist = float('NaN')
            pairwise_dict[label1][label2] = dist
            pairwise_dict[label2][label1] = dist

        log.info('Plotting comparison heatmap')
        plot_heatmap(pairwise_dict, run_dir, title)
def main(output_dir=None, normal_bam=None, tumor_bam=None, snv_vcf=None,
         normal_name=None, tumor_name=None, sample=None, genome=None,
         genomes_dir=None, gridss_ref_dir=None, ref_fa=None, threads=None,
         jvmheap=None):
    gridss_linx_dir = abspath(join(package_path(), '..', 'gridss-purple-linx'))
    gridss_scripts_dir = abspath(join(package_path(), '..', 'gridss/scripts'))

    normal_name = normal_name or splitext_plus(basename(normal_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    tumor_name = tumor_name or splitext_plus(basename(tumor_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    sample = sample or tumor_name

    output_dir = safe_mkdir(abspath(output_dir or 'gridss'))
    logger.init(log_fpath_=join(output_dir, 'gridss.log'), save_previous=True)
    output_vcf = join(output_dir, f'{sample}-gridss-purple-linx.vcf')

    assert genome == 'GRCh37', 'Only GRCh37 is supported for GRIDSS yet'

    if genomes_dir:
        refdata.find_genomes_dir(genomes_dir)
    if not gridss_ref_dir:
        gridss_ref_dir = refdata.get_ref_file(genome, 'gridss_purple_linx_dir')
    if not ref_fa:
        ref_fa = refdata.get_ref_file(genome, 'fa')

    hmf_env_path = conda_utils.secondary_conda_env('hmf')

    gridss_jar = glob.glob(join(hmf_env_path, 'share/gridss-*/gridss.jar'))[0]
    amber_jar = glob.glob(join(hmf_env_path, 'share/hmftools-amber-*/amber.jar'))[0]
    cobalt_jar = glob.glob(join(hmf_env_path, 'share/hmftools-cobalt-*/cobalt.jar'))[0]
    purple_jar = glob.glob(join(hmf_env_path, 'share/hmftools-purple-*/purple.jar'))[0]
    linx_jar = glob.glob(join(hmf_env_path, 'share/hmftools-linx-*/sv-linx.jar'))[0]

    cmd = f"""
PATH={hmf_env_path}/bin:$PATH \
THREADS={threads} \
GRIDSS_JAR={gridss_jar} \
AMBER_JAR={amber_jar} \
COBALT_JAR={cobalt_jar} \
PURPLE_JAR={purple_jar} \
LINX_JAR={linx_jar} \
bash -x {join(gridss_linx_dir, 'gridss-purple-linx.sh')} \
-n {normal_bam} \
-t {tumor_bam} \
-v {output_vcf} \
-s {sample} \
--normal_sample {normal_name} \
--tumour_sample {tumor_name} \
--snvvcf {snv_vcf} \
--ref_dir {gridss_ref_dir} \
--install_dir {gridss_scripts_dir} \
--reference {ref_fa} \
--output_dir {output_dir} \
{f"--jvmheap {jvmheap}" if jvmheap else ""}
""".strip()

    try:
        run_simple(cmd)
    except subprocess.SubprocessError:
        err('--------\n')
        err(f'Error running GRIDSS-PURPLE-LINX.\n')
        raise
def main(prefix, output_bedpe, output_fasta=None, output_json=None,
         min_read_support=None, ensembl_release=None, peptide_flanking_len=None,
         debug=False, no_filtering=False, check_transcript=True,
         pizzly_ref_fa=None, reads=None, min_tpm=None):
    # input_flat_fpath = prefix + '-flat.tsv'
    input_json_fpath = prefix + '.json'
    input_fasta = prefix + '.fusions.fasta'
    output_bedpe = abspath(output_bedpe)

    logger.init(debug)

    global ENSEMBL_RELEASE
    ENSEMBL_RELEASE = ensembl_release
    ebl = EnsemblRelease(ENSEMBL_RELEASE)

    # Reading filtered tsv
    # filt_fusions = set()
    # with open(input_flat_fpath) as f:
    #     for row in csv.DictReader(f, delimiter='\t'):
    #         filt_fusions.add((row['geneA.name'], row['geneB.name']))

    # Read json
    json_data = {'genes': []}
    with open(input_json_fpath) as f:
        data = json.load(f)
        for g_event in data['genes']:
            gene_a, gene_b = g_event['geneA']['name'], g_event['geneB']['name']
            # if (gene_a, gene_b) in filt_fusions:
            json_data['genes'].append(g_event)

    # Read fasta
    fasta_dict = SeqIO.index(input_fasta, 'fasta')

    # First round: genomic coordinates and fasta
    logger.info(f'Round 1: reading {len(json_data["genes"])} gene-pairs events from pizzly JSON')
    fusions = []
    for g_event in json_data['genes']:  # {'geneA', 'geneB', 'paircount', 'splitcount', 'transcripts', 'readpairs'}
        gene_a, gene_b = g_event['geneA']['name'], g_event['geneB']['name']
        # logger.info(f'Processing event {gene_a}>>{gene_b}')

        met_fasta_keys = set()  # collecting to get rid of duplicate transcript events
        for t_event in g_event['transcripts']:
            fusion = Fusion.create_from_pizzly_event(ebl, t_event)

            if check_transcript:
                if not _transcript_is_good(fusion.side_5p.trx) or \
                        not _transcript_is_good(fusion.side_3p.trx):
                    # logger.info(f'Transcripts {fusion.side_5p.trx} and {fusion.side_3p.trx} didn\'t pass check')
                    continue

            if no_filtering is not True and fusion.support < min_read_support:
                continue

            calc_positions_ok = fusion.calc_genomic_positions()
            if not calc_positions_ok:
                continue

            # comparing our fasta to pizzly fasta
            fusion.fasta_rec = fasta_dict[t_event['fasta_record']]
            _check_fusion_fasta(fusion.fasta_rec, fusion)

            # skipping duplicate fastas
            k = fusion.side_5p.trx.id, fusion.side_3p.trx.id, fusion.fasta
            assert k not in met_fasta_keys
            met_fasta_keys.add(k)

            fusions.append(fusion)

        # if not met_fasta_keys:
        #     logger.info('  Filtered all fusions for this gene pair.')
        if met_fasta_keys:
            logger.info(f'Keeping {len(met_fasta_keys)} fusion(s) for the event {gene_a}-{gene_b}')

    if not fusions:
        logger.warn('Finished: no fusions passed filtering')

    # Calculate expression of fused transcripts
    expr_by_fusion = None
    if reads and fusions:
        # filtered fasta for re-calling expression
        work_dir = safe_mkdir(splitext(output_bedpe)[0] + '_quant')
        fasta_path = join(work_dir, 'fusions.fasta')
        fasta_recs = [f.fasta_rec for f in fusions]
        SeqIO.write(fasta_recs, fasta_path, 'fasta')

        if pizzly_ref_fa:
            expr_by_fusion = requanitify_pizzly(pizzly_ref_fa, fasta_path, work_dir, reads)
            # expr_by_fusion = {fusion-fasta-id -> {length eff_length est_counts tpm}}

    # Second round: peptides and expression
    logger.info()
    logger.info(f'Round 2: making peptides for {len(fusions)} events in '
                f'{len(set([(f.side_3p.trx.gene.name, f.side_5p.trx.gene.name) for f in fusions]))} genes pairs')
    met_peptide_keys = set()  # collecting to get rid of duplicate peptides
    bedpe_entries = []
    peptide_fusions = []
    if peptide_flanking_len < 0:
        peptide_flanking_len = None
    for fusion in fusions:
        if fusion.side_3p.trx.contains_start_codon:
            logger.info(f'Translating {fusion.side_5p.trx.gene.name}>>'
                        f'{fusion.side_3p.trx.gene.name} fusion: {fusion}')
            fusion.make_peptide(peptide_flanking_len)
            if fusion.peptide:
                _verify_peptides(fusion.fasta_rec, fusion, peptide_flanking_len)

        # skipping duplicate peptides
        k = fusion.side_5p.trx.gene.name, fusion.side_3p.trx.gene.name, fusion.peptide
        if k in met_peptide_keys:
            logger.debug(f'Skipping peptide {k}: already added')
            continue
        met_peptide_keys.add(k)

        # writing bedpe
        entry = fusion.to_bedpe()

        # add expression
        if expr_by_fusion:
            entry.update(expr_by_fusion[fusion.fasta_rec.id])
            tpm = float(entry['tpm'])
            if no_filtering is not True and tpm < min_tpm:
                logger.debug(f'Skipping peptide {entry}: TPM={tpm} is below {min_tpm}')
                continue

        if fusion.peptide:
            peptide_fusions.append(fusion)
        bedpe_entries.append(entry)

    # Writing bedpe
    with open(output_bedpe, 'w') as bedpe_fh:
        bedpe_header = [
            'chr 5p', 'start 5p', 'end 5p',
            'chr 3p', 'start 3p', 'end 3p',
            'name', 'tier',
            'strand 5p', 'strand 3p',
            'support', 'is canon bndry', 'inframe', 'peptide',
            'fusion pos', 'nt in the break', 'transcripts', 'is canon intron dinuc',
        ]
        if expr_by_fusion:
            bedpe_header.extend(list(expr_by_fusion.values())[0].keys())
        bedpe_writer = csv.DictWriter(bedpe_fh, fieldnames=bedpe_header, delimiter='\t')
        bedpe_writer.writeheader()
        for bedpe_entry in bedpe_entries:
            bedpe_writer.writerow(bedpe_entry)

    # _test_pvac(output_bedpe)

    # Write fasta
    if output_fasta:
        SeqIO.write([f.fasta_rec for f in peptide_fusions], output_fasta, 'fasta')

    logger.info()
    logger.info(f'Written {len(peptide_fusions)} fusions in '
                f'{len(set([(f.side_3p.trx.gene.name, f.side_5p.trx.gene.name) for f in peptide_fusions]))} '
                f'gene pairs good peptides bedpe: {output_bedpe}')
def main(subdirs, output_dir, threads=1, fp_size=Params.L, isdebug=False):
    """Generates a PNG image with a relatedness heatmap.
    """
    datasets = _load_datasets(subdirs)

    title = ', '.join(d.name for d in datasets) + '\nL=' + str(fp_size)
    if not Params.NORMALIZE_VAR:
        title += ', not norm by var'
    if not Params.NORMALIZE_DIST:
        title += ', not norm by dist'
    if Params.SKIP_DAMAGE:
        title += ', skipped damage'
    if Params.SKIP_REJECT:
        title += ', skipped REJECT'
    if Params.SKIP_NOCALL:
        title += ', skipped num called = 0'
    if Params.MIN_AF:
        title += ', min AF=' + str(Params.MIN_AF)
    if Params.MIN_DIST:
        title += ', min dist=' + str(Params.MIN_DIST)
    if Params.INTERREGION_PAIRS:
        title += ', used SNP pairs between regions'
    else:
        title += ', skipped SNP pairs between regions'

    run_id = '__'.join(d.name for d in datasets)
    run_dir = safe_mkdir(join((output_dir or join(basedir, 'runs')), run_id))
    log.init(isdebug, join(run_dir, 'log.txt'), save_previous=True)
    work_dir = safe_mkdir(join(run_dir, 'work'))

    all_vcf_by_label = dict()
    bed_files_by_genome = defaultdict(set)
    for d in datasets:
        all_vcf_by_label.update(d.vcf_by_label)
        bed_files_by_genome[d.genome].add(d.bed_file)  # d.bed_file=None for WGS

    genome_by_label = dict()
    for d in datasets:
        for label in d.vcf_by_label:
            genome_by_label[label] = d.genome

    parallel_cfg = ParallelCfg(threads=threads)
    log.info(f'Starting using {parallel_cfg.threads} threads')

    overlap_bed_file_by_genome = dict()
    with parallel_view(len(all_vcf_by_label), parallel_cfg, work_dir) as parall_view:
        if bed_files_by_genome:
            log.info(f'Found BED files: {bed_files_by_genome}')
            for genome, bed_files in bed_files_by_genome.items():
                bed_files = [b for b in bed_files if b]
                log.info(f'Overlapping BED files for genome {genome}')
                overlap_bed_file_by_genome[genome] = _overlap_bed_files(bed_files, work_dir, genome) \
                    if bed_files else None

            primary_genome = sorted(bed_files_by_genome.items(), key=lambda kv: len(kv[1]))[-1][0]

            lifted_bed_files = []
            for genome, overlap_bed_file in overlap_bed_file_by_genome.items():
                if overlap_bed_file and genome != primary_genome:
                    lifted_bed_file = lift_over(overlap_bed_file, genome, primary_genome)
                    lifted_bed_files.append(lifted_bed_file)
            if lifted_bed_files:
                primary_bed_files = [
                    b for b in lifted_bed_files + [overlap_bed_file_by_genome[primary_genome]] if b]
                overlap_bed_file_by_genome[primary_genome] = _overlap_bed_files(
                    primary_bed_files, work_dir, primary_genome)

                log.info('Lifting BED files back')
                for genome in overlap_bed_file_by_genome:
                    if genome != primary_genome:
                        overlap_bed_file_by_genome[genome] = lift_over(
                            overlap_bed_file_by_genome[primary_genome], primary_genome, genome)

            log.info()
            log.info('Sorting, bgzipping and tabixing BED files')
            for g, bed in overlap_bed_file_by_genome.items():
                overlap_bed_file_by_genome[g] = bgzip_and_tabix(sort_bed(bed, genome=g))
            log.info()

        log.info('Slicing VCFs to regions in BED files')
        out = parall_view.run(_slice_vcf_fn, [
            [work_dir, label, vcf, overlap_bed_file_by_genome.get(genome_by_label[label])]
            for label, vcf in all_vcf_by_label.items()])
        all_vcf_by_label = dict(out)
        log.info()

        log.info('Calculating fingerprints for individual samples')
        out = parall_view.run(make_fingerprint, [
            [vcf, work_dir, label, fp_size, overlap_bed_file_by_genome[genome_by_label[label]]]
            for label, vcf in all_vcf_by_label.items()])
        print_label_pairs = dict(out)
        log.info()

        log.info('Comparing fingerprints pairwise')
        pairwise_dict = defaultdict(dict)
        for ((label1, print1), (label2, print2)) in \
                it.combinations_with_replacement(print_label_pairs.items(), 2):
            dist, pvalue = compare(print1, print2)
            if dist:
                log.info(f' {label1} VS {label2}: {dist:.2f}, Pvalue={pvalue:.2f}')
            else:
                log.info(f' {label1} VS {label2}: failed to calculate')
                dist = float('NaN')
            pairwise_dict[label1][label2] = dist
            pairwise_dict[label2][label1] = dist

        log.info('Plotting comparison heatmap')
        plot_heatmap(pairwise_dict, run_dir, title)
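# Hedged note: pairwise_dict built above is a symmetric label -> label -> distance
# mapping (NaN where a comparison failed). A quick way to eyeball it outside of
# plot_heatmap() -- assuming pandas is available, which the code above does not
# require -- would be:
#
#   import pandas as pd
#   print(pd.DataFrame(pairwise_dict).round(2))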
def main(prefix, output_bedpe, output_fasta=None, output_json=None, support=None,
         ensembl_release=None, peptide_flanking_len=None, debug=False):
    pizzly_flat_filt_fpath = prefix + '-flat-filtered.tsv'
    pizzly_json_fpath = prefix + '.json'
    input_fasta = prefix + '.fusions.fasta'
    output_bedpe = abspath(output_bedpe)

    logger.init(debug)

    ebl = EnsemblRelease(ensembl_release)

    # Reading filtered tsv
    filt_fusions = set()
    with open(pizzly_flat_filt_fpath) as f:
        for row in csv.DictReader(f, delimiter='\t'):
            filt_fusions.add((row['geneA.name'], row['geneB.name']))

    # Read json
    json_data = {'genes': []}
    with open(pizzly_json_fpath) as f:
        data = json.load(f)
        for g_event in data['genes']:
            gene_a, gene_b = g_event['geneA']['name'], g_event['geneB']['name']
            if (gene_a, gene_b) in filt_fusions:
                json_data['genes'].append(g_event)

    # Read fasta
    fasta_dict = SeqIO.index(input_fasta, 'fasta')

    filt_json_data = {'genes': []}
    filt_fasta_records = []
    filt_event_count = 0
    filt_transcript_event_count = 0

    # Write bedpe
    with open(output_bedpe, 'w') as bedpe_fh:
        bedpe_header = [
            'chr 5p', 'start 5p', 'end 5p',
            'chr 3p', 'start 3p', 'end 3p',
            'name', 'tier',
            'strand 5p', 'strand 3p',
            'support', 'is canon bndry', 'inframe', 'peptide',
            'fusion pos', 'nt in the break', 'transcripts', 'is canon intron dinuc',
        ]
        bedpe_writer = csv.DictWriter(bedpe_fh, fieldnames=bedpe_header, delimiter='\t')
        bedpe_writer.writeheader()

        for g_event in json_data['genes']:  # {'geneA', 'geneB', 'paircount', 'splitcount', 'transcripts', 'readpairs'}
            gene_a, gene_b = g_event['geneA']['name'], g_event['geneB']['name']
            logger.info(gene_a + '>>' + gene_b)

            # # first pass to select the longest transcripts
            # def _longest_tx(key):
            #     return max((ebl.transcript_by_id(te[f'transcript{key}']['id'])
            #                 for te in g_event['transcripts']), key=lambda t: len(t))
            # a_tx = _longest_tx('A')
            # b_tx = _longest_tx('B')
            # print(f'Longest transcriptA: {a_tx.id}, Longest transcriptB: {b_tx.id}')
            # try:
            #     t_event = [te for te in g_event['transcripts']
            #                if te['transcriptA']['id'] == a_tx.id and te['transcriptB']['id'] == b_tx.id][0]
            # except:
            #     print(f"No event with 2 longest transcripts. Available events: "
            #           f"{', '.join(te['transcriptA']['id'] + '>>' + te['transcriptB']['id'] for te in g_event['transcripts'])}")
            #     raise

            filt_g_event = {k: v for k, v in g_event.items() if k != 'readpairs'}
            filt_g_event['transcripts'] = []

            met_event_keys = set()  # collecting to get rid of duplicate transcript events
            met_peptide_keys = set()  # collecting to get rid of duplicate peptides
            bedpe_entries = []
            for t_event in g_event['transcripts']:
                if t_event['support'] < support:
                    continue

                fusion = Fusion.create_from_pizzly_event(ebl, t_event)
                if not fusion:  # not a good transcript
                    continue

                # skipping duplicate events
                k = fusion.side_5p.trx.id, fusion.side_3p.trx.id, fusion.side_5p.bp_offset, fusion.side_3p.bp_offset
                if k in met_event_keys:
                    continue
                met_event_keys.add(k)

                # for writing filtered json
                filt_g_event['transcripts'].append(t_event)
                filt_transcript_event_count += 1

                # writing bedpe
                entry = fusion.to_bedpe(peptide_flanking_len)
                if not entry:
                    continue

                # skipping duplicate peptides
                k = entry['name'], entry['peptide']
                if k in met_peptide_keys:
                    continue
                met_peptide_keys.add(k)
                bedpe_entries.append(entry)

                # for writing filtered fasta
                pizzly_fasta_rec = fasta_dict[t_event['fasta_record']]
                _check_fusion_fasta(pizzly_fasta_rec, fusion)
                filt_fasta_records.append(pizzly_fasta_rec)

                if fusion.peptide:
                    _verify_peptides(pizzly_fasta_rec, fusion, peptide_flanking_len)

            if not bedpe_entries:
                logger.warn(f'All transcript events filtered out for fusion {gene_a}>>{gene_b}, skipping')
            else:
                filt_json_data['genes'].append(filt_g_event)
                filt_event_count += 1
                for bedpe_entry in bedpe_entries:
                    bedpe_writer.writerow(bedpe_entry)

    # _test_pvac(output_bedpe)

    # Write filtered json
    if output_json:
        with open(output_json, 'w') as f:
            json.dump(filt_json_data, f, indent=4)

    # Write fasta
    if output_fasta:
        SeqIO.write(filt_fasta_records, output_fasta, 'fasta')

    logger.info()
    logger.info(f'Written {filt_transcript_event_count} transcript events '
                f'for {filt_event_count} fusions into bedpe: {output_bedpe}')