def simulate_read(self):
    """Function that simulates perfect paired-end reads"""
    fastafile = ps.FastaFile(self.genome_fa)
    # left split read
    insert = int(np.random.normal(self.insert_size, (self.insert_size / 12), 1))
    start = int(np.random.randint(self.chr_pos_start, (self.chr_pos_end + 1)))
    left_end = start + self.read_length
    total_end = start + int(np.round(insert))
    right_start = total_end - self.read_length
    if total_end > self.chr_pos_end:
        # split read scenario or insert spanning split read scenario
        if left_end > self.chr_pos_end:
            # left read spanning split read scenario
            # left_read
            left_dntps = self.chr_pos_end - start
            right_dntps = self.read_length - left_dntps
            # the error could be here
            left_split_read = fastafile.fetch(self.chr, start, self.chr_pos_end)
            right_split_read = fastafile.fetch(
                self.chr, self.chr_pos_start, (self.chr_pos_start + right_dntps))
            left_read = left_split_read + right_split_read
            # right_read
            right_start = self.chr_pos_start + int(
                round(self.insert_size - left_dntps - self.read_length))
            right_read = fastafile.fetch(self.chr, right_start,
                                         (right_start + self.read_length))
            # assertion to check the error here
            common_id = "%s|%s|%s:%s-%s:%s|%s:%s|1|%s" % (
                self.read_number, self.chr, start, self.chr_pos_end,
                self.chr_pos_start, (self.chr_pos_start + right_dntps),
                right_start, (right_start + self.read_length), self.circle_id)
        else:
            if right_start > self.chr_pos_end:
                # insert spanning split read scenario
                left_read = fastafile.fetch(self.chr, start, (start + self.read_length))
                right_start = self.chr_pos_start + (right_start - self.chr_pos_end)
                right_read = fastafile.fetch(
                    self.chr, right_start, (right_start + self.read_length))
                common_id = "%s|%s|%s:%s|%s:%s|3|%s" % (
                    self.read_number, self.chr, start, (start + self.read_length),
                    right_start, (right_start + self.read_length), self.circle_id)
            else:
                # right split read scenario
                assert right_start <= self.chr_pos_end
                assert (right_start + self.read_length) > self.chr_pos_end
                left_read = fastafile.fetch(self.chr, start, (start + self.read_length))
                # compute right dntps
                left_dntps = self.chr_pos_end - right_start
                right_dntps = self.read_length - left_dntps
                left_split_read = fastafile.fetch(self.chr, right_start, self.chr_pos_end)
                right_split_read = fastafile.fetch(
                    self.chr, self.chr_pos_start, (self.chr_pos_start + right_dntps))
                right_read = left_split_read + right_split_read
                common_id = "%s|%s|%s:%s|%s:%s-%s:%s|2|%s" % (
                    self.read_number, self.chr, start, (start + self.read_length),
                    right_start, self.chr_pos_end, self.chr_pos_start,
                    (self.chr_pos_start + right_dntps), self.circle_id)
    else:
        # non split read scenario
        left_read = fastafile.fetch(self.chr, start, (start + self.read_length))
        # correct right read start
        right_read = fastafile.fetch(self.chr, right_start, (right_start + self.read_length))
        common_id = "%s|%s|%s:%s|%s:%s|0|%s" % (
            self.read_number, self.chr, start, (start + self.read_length),
            right_start, (right_start + self.read_length), self.circle_id)
    return (right_read, left_read, common_id)
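The common_id built above packs the simulated coordinates together with a scenario code (0 = no split, 1 = left read split across the circle junction, 2 = right read split, 3 = insert spanning the junction) and the circle id as '|'-separated fields. A minimal helper, not part of the original class, that recovers those two trailing fields from such an id:

def split_read_scenario(common_id):
    # fields: read_number|chr|left_coords|right_coords|scenario|circle_id
    fields = common_id.split("|")
    return int(fields[-2]), fields[-1]  # (scenario code, circle id)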
gsnv_pos) in enumerate(probed_variants.items()):
    arguments = " ".join([
        x for x in sys.argv
        if x != '--cluster' and '.bed' not in x and '-ssnv' != x and '-gsnv' != x
    ])
    job_name = f'vstat_{i}'
    out_folder = './variantStats'
    if not os.path.exists(out_folder):
        os.makedirs(out_folder)
    print(
        'submission.py' +
        f' -y --py36 -time 50 -t 1 -m 50 -N {job_name} "{arguments} -ssnv {chrom}:{snv_pos} -gsnv {chrom}:{gsnv_pos} -prefix {out_folder}/{chrom}_{snv_pos}" '
    )
exit()

reference = pysamiterators.CachedFasta(pysam.FastaFile(args.reference))

cell_obs = collections.defaultdict(
    lambda: collections.defaultdict(collections.Counter))
statistics = collections.defaultdict(
    lambda: collections.defaultdict(collections.Counter))
cell_call_data = collections.defaultdict(dict)  # location->cell->haplotype
haplotype_scores = {}

read_groups = set()  # Store unique read groups in this set
with sorted_bam_file(f'{args.prefix}_evidence.bam',
                     origin_bam=pysam.AlignmentFile(
                         paths[0], ignore_truncation=args.ignore_bam_issues),
                     read_groups=read_groups) as out:
def run_tfbscan(args):

    ###### Check input arguments ######
    check_required(args, ["motifs", "fasta"])              #Check input arguments
    check_files([args.motifs, args.fasta, args.regions])   #Check if files exist

    ##Test input
    if args.outdir != None and args.outfile != None:    #Error - both set
        sys.exit("ERROR: Please choose either --outdir or --outfile")
    elif ((args.outdir == None or args.outdir != None) and args.outfile == None):   #Separate files
        args.outdir = "tfbscan_output/" if args.outdir == None else args.outdir
        make_directory(args.outdir)   #Check and create output directory
    elif args.outdir == None and args.outfile != None:   #Joined file
        check_files([args.outfile], "w")

    ###### Create logger and write argument overview ######
    logger = TobiasLogger("TFBScan", args.verbosity)
    logger.begin()

    parser = add_tfbscan_arguments(argparse.ArgumentParser())
    logger.arguments_overview(parser, args)

    if args.outfile != None:
        logger.output_files([args.outfile])

    ######## Read sequences from file and estimate background gc ########

    logger.info("Handling input files")
    logger.info("Reading sequences from fasta")

    fastafile = pysam.FastaFile(args.fasta)
    fasta_chrom_info = dict(zip(fastafile.references, fastafile.lengths))
    fastafile.close()
    logger.stats("- Found {0} sequences in fasta".format(len(fasta_chrom_info)))

    #Create regions available in fasta
    logger.info("Setting up regions")
    fasta_regions = RegionList([OneRegion([header, 0, fasta_chrom_info[header]]) for header in fasta_chrom_info])

    #If subset, setup regions
    if args.regions:
        regions = RegionList().from_bed(args.regions)
    else:   #set up regions from fasta references
        regions = fasta_regions
        regions = regions.apply_method(OneRegion.split_region, 1000000)
        regions = regions.apply_method(OneRegion.extend_reg, 50)   #extend to overlap at junctions

    #Clip regions at chromosome boundaries
    regions = regions.apply_method(OneRegion.check_boundary, fasta_chrom_info, "cut")
    if len(regions) == 0:
        logger.error("No regions found.")
        sys.exit()

    logger.info("- Total of {0} regions (after splitting)".format(len(regions)))

    #Background gc
    if args.gc == None:
        logger.info("Estimating GC content from fasta (set --gc to skip this step)")
        args.gc = get_gc_content(regions, args.fasta)
        logger.info("- GC content: {0}".format(round(args.gc, 5)))

    bg = np.array([(1-args.gc)/2.0, args.gc/2.0, args.gc/2.0, (1-args.gc)/2.0])

    #Split regions
    region_chunks = regions.chunks(args.split)

    #################### Read motifs from file ####################

    logger.info("Reading motifs from file")

    motif_list = MotifList().from_file(args.motifs)
    logger.stats("- Found {0} motifs".format(len(motif_list)))

    logger.debug("Getting motifs ready")
    motif_list.bg = bg

    for motif in motif_list:
        motif.set_prefix(args.naming)
        motif.bg = bg
        motif.get_pssm()

    motif_names = list(set([motif.prefix for motif in motif_list]))

    #Calculate scanning-threshold for each motif
    pool = mp.Pool(processes=args.cores)
    outlist = pool.starmap(OneMotif.get_threshold, itertools.product(motif_list, [args.pvalue]))
    motif_list = MotifList(outlist)

    pool.close()
    pool.join()

    #################### Find TFBS in regions #####################

    logger.comment("")
    logger.info("Scanning for TFBS with all motifs")

    manager = mp.Manager()

    if args.outdir != None:
        writer_cores = max(1, int(args.cores*0.1))
        worker_cores = max(1, args.cores - writer_cores)
    elif args.outfile != None:   #Write to one file
        writer_cores = 1
        worker_cores = max(1, args.cores - writer_cores)

    #Setup pools
    logger.debug("Writer cores: {0}".format(writer_cores))
    logger.debug("Worker cores: {0}".format(worker_cores))

    worker_pool = mp.Pool(processes=worker_cores, maxtasksperchild=1)
    writer_pool = mp.Pool(processes=writer_cores)

    #Setup bed-writers based on --outdir or --outfile
    temp_files = []
    qs = {}
    TF_names_chunks = [motif_names[i::writer_cores] for i in range(writer_cores)]
    for TF_names_sub in TF_names_chunks:

        #Skip over any empty chunks
        if len(TF_names_sub) == 0:
            continue

        logger.debug("Creating writer queue for {0}".format(TF_names_sub))

        if args.outdir != None:
            files = [os.path.join(args.outdir, TF + ".tmp") for TF in TF_names_sub]
            temp_files.extend(files)
        elif args.outfile != None:
            files = [args.outfile + ".tmp" for TF in TF_names_sub]   #write to the same file for all
            temp_files.append(files[0])

        q = manager.Queue()
        TF2files = dict(zip(TF_names_sub, files))
        logger.debug("TF2files dict: {0}".format(TF2files))
        writer_pool.apply_async(file_writer, args=(q, TF2files, args))   #, callback = lambda x: finished.append(x) print("Writing time: {0}".format(x)))
        for TF in TF_names_sub:
            qs[TF] = q
    writer_pool.close()   #no more jobs applied to writer_pool

    args.qs = qs   #qs is a dict

    #Setup scanners pool
    input_arguments = [(chunk, args, motif_list) for chunk in region_chunks]
    task_list = [worker_pool.apply_async(motif_scanning, (chunk, args, motif_list, )) for chunk in region_chunks]
    monitor_progress(task_list, logger)
    results = [task.get() for task in task_list]   #1s

    #Wait for files to write
    for TF in qs:
        qs[TF].put((None, None))
    writer_pool.join()

    #Process each file output and write out
    logger.comment("")
    logger.info("Processing results from scanning")
    logger.debug("Running processing for files: {0}".format(temp_files))

    task_list = [worker_pool.apply_async(process_TFBS, (file, args)) for file in temp_files]
    worker_pool.close()
    monitor_progress(task_list, logger)
    worker_pool.terminate()
    results = [task.get() for task in task_list]

    logger.debug("Joining multiprocessing pools")
    worker_pool.join()
    writer_pool.join()

    logger.end()
def count_transcripts(cargs):
    args, contig = cargs
    if args.alleles is not None:
        allele_resolver = alleleTools.AlleleResolver(
            args.alleles, lazyLoad=(not args.loadAllelesToMem))
    else:
        allele_resolver = None

    contig_mapping = None

    if args.contigmapping == 'danio':
        contig_mapping = {
            '1': 'CM002885.2', '2': 'CM002886.2', '3': 'CM002887.2', '4': 'CM002888.2',
            '5': 'CM002889.2', '6': 'CM002890.2', '7': 'CM002891.2', '8': 'CM002892.2',
            '9': 'CM002893.2', '10': 'CM002894.2', '11': 'CM002895.2', '12': 'CM002896.2',
            '13': 'CM002897.2', '14': 'CM002898.2', '15': 'CM002899.2', '16': 'CM002900.2',
            '17': 'CM002901.2', '18': 'CM002902.2', '19': 'CM002903.2', '20': 'CM002904.2',
            '21': 'CM002905.2', '22': 'CM002906.2', '23': 'CM002907.2', '24': 'CM002908.2',
            '25': 'CM002909.2',
        }

    # Load features
    contig_mapping = None
    #conversion_table = get_gene_id_to_gene_name_conversion_table(args.gtfexon)
    features = singlecellmultiomics.features.FeatureContainer()
    if contig_mapping is not None:
        features.remapKeys = contig_mapping
    features.loadGTF(
        args.gtfexon,
        select_feature_type=['exon'],
        identifierFields=('exon_id', 'transcript_id'),
        store_all=True,
        head=args.hf,
        contig=contig)
    features.loadGTF(
        args.gtfintron,
        select_feature_type=['intron'],
        identifierFields=['transcript_id'],
        store_all=True,
        head=args.hf,
        contig=contig)

    # What is used for assignment of molecules?
    if args.method == 'nla':
        molecule_class = singlecellmultiomics.molecule.AnnotatedNLAIIIMolecule
        fragment_class = singlecellmultiomics.fragment.NlaIIIFragment
        pooling_method = 1  # all data from the same cell can be dealt with separately
        stranded = None  # data is not stranded
    elif args.method == 'vasa' or args.method == 'cs':
        molecule_class = singlecellmultiomics.molecule.VASA
        fragment_class = singlecellmultiomics.fragment.SingleEndTranscript
        pooling_method = 1
        stranded = 1  # data is stranded, mapping to other strand
    else:
        raise ValueError("Supply a valid method")

    # COUNT:
    exon_counts_per_cell = collections.defaultdict(collections.Counter)      # cell->gene->umiCount
    intron_counts_per_cell = collections.defaultdict(collections.Counter)    # cell->gene->umiCount
    junction_counts_per_cell = collections.defaultdict(collections.Counter)  # cell->gene->umiCount
    gene_counts_per_cell = collections.defaultdict(collections.Counter)      # cell->gene->umiCount

    gene_set = set()
    sample_set = set()
    annotated_molecules = 0
    read_molecules = 0
    if args.producebam:
        bam_path_produced = f'{args.o}/output_bam_{contig}.unsorted.bam'
        with pysam.AlignmentFile(args.alignmentfiles[0]) as alignments:
            output_bam = pysam.AlignmentFile(
                bam_path_produced, "wb", header=alignments.header)

    ref = None
    if args.ref is not None:
        ref = pysamiterators.iterators.CachedFasta(pysam.FastaFile(args.ref))

    for alignmentfile_path in args.alignmentfiles:

        i = 0
        with pysam.AlignmentFile(alignmentfile_path) as alignments:
            molecule_iterator = MoleculeIterator(
                alignments=alignments,
                check_eject_every=5000,
                molecule_class=molecule_class,
                molecule_class_args={
                    'features': features,
                    'stranded': stranded,
                    'min_max_mapping_quality': args.minmq,
                    'reference': ref,
                    'allele_resolver': allele_resolver
                },
                fragment_class=fragment_class,
                fragment_class_args={
                    'umi_hamming_distance': args.umi_hamming_distance,
                    'R1_primer_length': 4,
                    'R2_primer_length': 6},
                perform_qflag=True,
                # when the reads have not been tagged yet, this flag is very
                # much required
                pooling_method=pooling_method,
                contig=contig
            )

            for i, molecule in enumerate(molecule_iterator):
                if not molecule.is_valid():
                    if args.producebam:
                        molecule.write_tags()
                        molecule.write_pysam(output_bam)
                    continue

                molecule.annotate(args.annotmethod)
                molecule.set_intron_exon_features()

                if args.producebam:
                    molecule.write_tags()
                    molecule.write_pysam(output_bam)

                allele = None
                if allele_resolver is not None:
                    allele = molecule.allele
                    if allele is None:
                        allele = 'noAllele'

                # Obtain total count introns/exons reduce it so the sum of the
                # count will be 1:
                # len(molecule.introns.union( molecule.exons).difference(molecule.junctions))+len(molecule.junctions)
                total_count_for_molecule = len(molecule.genes)
                if total_count_for_molecule == 0:
                    continue  # we didn't find any gene counts

                # Distribute count over amount of gene hits:
                count_to_add = 1 / total_count_for_molecule
                for gene in molecule.genes:
                    if allele is not None:
                        gene = f'{allele}_{gene}'
                    gene_counts_per_cell[molecule.sample][gene] += count_to_add
                    gene_set.add(gene)
                    sample_set.add(molecule.get_sample())

                # Obtain introns/exons/splice junction information:
                for intron in molecule.introns:
                    gene = intron
                    if allele is not None:
                        gene = f'{allele}_{intron}'
                    intron_counts_per_cell[molecule.sample][gene] += count_to_add
                    gene_set.add(gene)

                for exon in molecule.exons:
                    gene = exon
                    if allele is not None:
                        gene = f'{allele}_{exon}'
                    exon_counts_per_cell[molecule.sample][gene] += count_to_add
                    gene_set.add(gene)

                for junction in molecule.junctions:
                    gene = junction
                    if allele is not None:
                        gene = f'{allele}_{junction}'
                    junction_counts_per_cell[molecule.sample][gene] += count_to_add
                    gene_set.add(gene)

                annotated_molecules += 1
                if args.head and (i + 1) > args.head:
                    print(f"-head was supplied, {i} molecules discovered, stopping")
                    break

        read_molecules += i

    if args.producebam:
        output_bam.close()
        final_bam_path = bam_path_produced.replace('.unsorted', '')
        sort_and_index(bam_path_produced, final_bam_path, remove_unsorted=True)

    return (
        gene_set,
        sample_set,
        gene_counts_per_cell,
        junction_counts_per_cell,
        exon_counts_per_cell,
        intron_counts_per_cell,
        annotated_molecules,
        read_molecules,
        contig
    )
def process_regions(ref_file, regions, out_dir, param_file):
    out_vcf_path = os.path.join(out_dir, "svteaser.sim.vcf")
    out_ref_fa_path = os.path.join(out_dir, "svteaser.ref.fa")
    out_altered_fa_path = os.path.join(out_dir, "svteaser.altered.fa")

    out_vcf_fh = None
    out_ref_fa_fh = open(out_ref_fa_path, "w+")
    out_altered_fa_fh = open(out_altered_fa_path, "w+")

    ref = pysam.FastaFile(ref_file)

    # Define padding in reference region where SVs are not to be inserted.
    padding = 800

    for i, (chrom, start, end) in enumerate(regions):
        # Track status.
        if (i + 1) % 50 == 0:
            logging.info("Processed {}/{} regions...".format(i + 1, len(regions)))

        # Temporary dir.
        temp_dir = os.path.join(out_dir, "temp")
        os.mkdir(temp_dir)

        # Extract ref sequence.
        name = "{}_{}_{}".format(chrom, start, end)
        ref_seq = ref.fetch(chrom, start, end)

        # Remove some buffer from beginning and ending,
        # so that the tails do not contain SVs. These will be added
        # back later on.
        ref_seq_surv = ref_seq[padding:len(ref_seq)-padding]

        # Write ref sequence to temporary fa file.
        temp_ref_fa = os.path.join(temp_dir, "temp_ref.fa")
        with open(temp_ref_fa, "w") as fh:
            add_fasta_entry(name, ref_seq_surv, fh)

        # Run SURVIVOR.
        prefix = os.path.join(temp_dir, "simulated")
        survivor_cmd = " ".join(["SURVIVOR", "simSV", temp_ref_fa, param_file, "0.0", "0", prefix])
        ret = cmd_exe(survivor_cmd)  # should be checking here

        # Read output of SURVIVOR
        altered_fa_path = "{}.fasta".format(prefix)
        insertions_fa_path = "{}.insertions.fa".format(prefix)
        sim_vcf = "{}.vcf".format(prefix)

        # Update VCF
        temp_vcf = os.path.join(temp_dir, "temp.vcf")
        update_vcf(temp_ref_fa, insertions_fa_path, sim_vcf, temp_vcf, pos_padding=padding)

        # Merge seqs and variants entries into single FA/VCF files
        # Add the initial and last 800bp back to the altered fasta
        altered_seq = pysam.FastaFile(altered_fa_path).fetch(name)
        altered_seq = update_altered_fa(ref_seq, altered_seq, padding)
        add_fasta_entry(name, altered_seq, out_altered_fa_fh)
        add_fasta_entry(name, ref_seq, out_ref_fa_fh)

        vcf_reader = pysam.VariantFile(temp_vcf)
        header = vcf_reader.header
        if not out_vcf_fh:
            out_vcf_fh = pysam.VariantFile(out_vcf_path, 'w', header=header)
        for record in vcf_reader:
            out_vcf_fh.write(record)

        # Remove temporary files.
        import shutil
        shutil.rmtree(temp_dir)

    out_altered_fa_fh.close()
    out_ref_fa_fh.close()
    out_vcf_fh.close()

    vcf_compress(out_vcf_path)
def make_sampledata(args):
    if isinstance(args, tuple):
        vcf_subset = args[2]
        args[0].output_file = str(args[1])
        args = args[0]
    else:
        vcf_subset = None
    try:
        git_hash = subprocess.check_output(["git", "rev-parse", "HEAD"])
        git_provenance = {
            "repo": "[email protected]:mcveanlab/treeseq-inference.git",
            "hash": git_hash.decode().strip(),
            "dir": "human-data",
            "notes:": ("Use the Makefile to download and process the upstream data files"),
        }
    except FileNotFoundError:
        git_hash = "Git unavailable"
        git_provenance = "Git unavailable"
    data_provenance = {
        "ancestral_states_url": args.ancestral_states_url,
        "reference_name": args.reference_name,
    }

    # Get the ancestral states.
    fasta = pysam.FastaFile(args.ancestral_states_file)
    # NB! We put in an extra character at the start to convert to 1 based coords.
    ancestral_states = "X" + fasta.fetch(reference=fasta.references[0])
    # The largest possible site position is len(ancestral_states). Positions must
    # be strictly less than sequence_length, so we add 1.
    sequence_length = len(ancestral_states) + 1

    converter_class = {
        "1kg": ThousandGenomesConverter,
        "sgdp": SgdpConverter,
        "hgdp": HgdpConverter,
        "max-planck": MaxPlanckConverter,
        "afanasievo": AfanasievoConverter,
        "1240k": ReichConverter,
    }
    try:
        with tsinfer.SampleData(path=args.output_file, num_flush_threads=1,
                                sequence_length=sequence_length) as samples:
            converter = converter_class[args.source](args.data_file, ancestral_states,
                                                     samples, args.target_samples)
            if args.metadata_file:
                converter.process_metadata(args.metadata_file, args.progress)
            else:
                converter.process_metadata(args.progress)
            if vcf_subset is not None:
                report = converter.process_sites(
                    vcf_subset=vcf_subset,
                    show_progress=args.progress,
                    max_sites=args.max_variants,
                )
            else:
                report = converter.process_sites(show_progress=args.progress,
                                                 max_sites=args.max_variants)
            samples.record_provenance(
                command=sys.argv[0],
                args=sys.argv[1:],
                git=git_provenance,
                data=data_provenance,
            )
        assert np.all(np.diff(samples.sites_position[:]) > 0)
    except Exception as e:
        os.unlink(args.output_file)
        if report["num_sites"] == 0:
            return report
        raise e
    if report["num_sites"] == 0:
        os.unlink(args.output_file)
    return report
def openFile(self, dataFile): return pysam.FastaFile(dataFile)
def test_open_file_with_explicit_index_succeeds(self):
    with pysam.FastaFile(self.filename,
                         filepath_index=self.filename + ".fai") as inf:
        self.assertEqual(len(inf), 2)
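The tests above open a FASTA with and without an explicit .fai index and check the number of sequences. A minimal sketch of the same pysam.FastaFile calls, assuming an indexed file named example.fa (hypothetical path):

import pysam

# index discovered next to the file
with pysam.FastaFile("example.fa") as fa:
    print(fa.references, fa.lengths)  # sequence names and their lengths
    print(fa.fetch(reference=fa.references[0], start=0, end=50))

# index given explicitly, as in the test above
with pysam.FastaFile("example.fa", filepath_index="example.fa.fai") as fa:
    print(len(fa))  # number of sequences in the file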
def run(subcommand):
    args = get_args(subcommand)

    if subcommand == "reclassification":
        nl.filterate_by_panel(
            args.input_vcf,
            args.output_vcf,
            pysam.FastaFile(args.fasta),
            args.non_somatic_panel,
        )
        print("rnaindel reclassification completed successfully.", file=sys.stdout)
        sys.exit(0)

    data_dir = args.data_dir.rstrip("/")
    model_dir = "{}/models".format(data_dir)

    # database check
    path2cosmic = pathlib.Path("{}/cosmic".format(data_dir))
    if not path2cosmic.exists():
        print("Please download the latest database: http://ftp.stjude.org/pub/software/RNAIndel/")
        sys.exit(1)

    if subcommand == "nonsomatic" or subcommand == "recurrence":
        cosmic = pysam.TabixFile(
            "{}/cosmic/CosmicCodingMuts.indel.vcf.gz".format(data_dir))

        if subcommand == "nonsomatic":
            nl.make_non_somatic_panel(
                args.vcf_list,
                args.output_vcf,
                pysam.FastaFile(args.fasta),
                cosmic,
                args.count,
            )
            print("rnaindel nonsomatic completed successfully.", file=sys.stdout)
            sys.exit(0)
        else:
            nl.annotate_recurrence(args.vcf_list, pysam.FastaFile(args.fasta),
                                   cosmic, args.out_dir)
            print("rnaindel recurrence completed successfully.", file=sys.stdout)
            sys.exit(0)

    log_dir = args.log_dir.rstrip("/")
    if subcommand == "training":
        df = tl.input_validator(args.training_data, args.indel_class)

        # downsampling
        artifact_ratio, ds_f_beta, ds_precision = tl.downsampler(
            df, args.k_fold, args.indel_class, args.ds_beta,
            args.process_num, args.downsample_ratio)

        # feature_selection
        selected_features, fs_f_beta, fs_precision = tl.selector(
            df, args.k_fold, args.indel_class, artifact_ratio, args.fs_beta,
            args.process_num, args.feature_names)

        # parameter tuning
        feature_lst = selected_features.split(";")
        max_features, pt_f_beta, pt_precision = tl.tuner(
            df, args.k_fold, args.indel_class, artifact_ratio, feature_lst,
            args.pt_beta, args.process_num, args.auto_param)

        # update models
        tl.updater(df, args.indel_class, artifact_ratio, feature_lst,
                   max_features, model_dir)

        # make report
        tl.reporter(
            args.indel_class, args.ds_beta, ds_f_beta, ds_precision,
            artifact_ratio, args.fs_beta, fs_f_beta, fs_precision,
            selected_features, args.pt_beta, pt_f_beta, pt_precision,
            max_features, args.log_dir)

        msg = ("single-nucleotide indels"
               if args.indel_class == "s" else "multi-nucleotide indels")
        print("rnaindel training for " + msg + " completed successfully.",
              file=sys.stdout)
    else:
        create_logger(log_dir)

        alignments = pysam.AlignmentFile(args.bam)
        genome = pysam.FastaFile(args.fasta)
        refgene = "{}/refgene/refCodingExon.bed.gz".format(data_dir)
        exons = pysam.TabixFile(refgene)
        protein = "{}/protein/proteinConservedDomains.txt".format(data_dir)
        dbsnp = pysam.TabixFile("{}/dbsnp/dbsnp.indel.vcf.gz".format(data_dir))
        clinvar = pysam.TabixFile("{}/clinvar/clinvar.indel.vcf.gz".format(data_dir))
        cosmic = pysam.TabixFile("{}/cosmic/CosmicCodingMuts.indel.vcf.gz".format(data_dir))
        germline_db = pysam.TabixFile(args.germline_db) if args.germline_db else None

        # input validation
        rl.input_validator(alignments, genome, args.uniq_mapq)

        # region analysis
        region = args.region if subcommand == "analysis" else None

        # preprocessing
        # variant calling will be performed if no external VCF is supplied
        if not args.input_vcf:
            with tempfile.TemporaryDirectory() as tmp_dir:
                # indel calling
                bambino_output = os.path.join(tmp_dir, "bambino.txt")
                bl.bambino(args.bam, args.fasta, bambino_output,
                           args.heap_memory, region)

                # preprocess indels from the built-in caller
                df, chr_prefixed = rl.indel_preprocessor(
                    bambino_output, genome, alignments, exons)

                df = rl.indel_rescuer(df, args.fasta, args.bam, chr_prefixed,
                                      args.process_num)
        else:
            # preprocess indels from external VCF
            df, chr_prefixed = rl.indel_vcf_preprocessor(
                args.input_vcf, genome, alignments, exons, region)

            df = rl.indel_rescuer(df, args.fasta, args.bam, chr_prefixed,
                                  args.process_num, external_vcf=True)

        # indel annotation
        df = rl.indel_annotator(df, genome, exons, chr_prefixed)

        # feature calculation
        if subcommand == "feature":
            df, df_filtered_premerge = rl.indel_sequence_processor(
                df, genome, alignments, args.uniq_mapq, chr_prefixed,
                softclip_analysis=args.softclip_analysis)
        else:
            coverage_in_trainingset = "{}/models/coverage.txt".format(data_dir)
            downsample_thresholds = {}
            with open(coverage_in_trainingset) as f:
                for line in f:
                    if line.startswith("s"):
                        downsample_thresholds["single_nuleotide_indels"] = int(
                            line.rstrip().split("\t")[1])
                    else:
                        downsample_thresholds["multi_nuleotide_indels"] = int(
                            line.rstrip().split("\t")[1])

            df, df_filtered_premerge = rl.indel_sequence_processor(
                df, genome, alignments, args.uniq_mapq, chr_prefixed,
                softclip_analysis=args.softclip_analysis,
                downsample_thresholds=downsample_thresholds)

        df = rl.indel_protein_processor(df, refgene, protein)

        # merging equivalent indels
        df, df_filtered_postmerge = rl.indel_equivalence_solver(
            df, genome, refgene, chr_prefixed)

        # SNP annotation
        df = rl.indel_snp_annotator(df, genome, dbsnp, clinvar, germline_db,
                                    chr_prefixed)

        # subcommand "feature" exits here
        if subcommand == "feature":
            df = rl.indel_feature_reporter(df, genome, args.output_tab, chr_prefixed)
            print("rnaindel feature completed successfully.", file=sys.stdout)
            sys.exit(0)

        # prediction
        df = rl.indel_classifier(df, model_dir, args.process_num)

        # concatenating invalid(filtered) entries
        df_filtered = pd.concat(
            [df_filtered_premerge, df_filtered_postmerge],
            axis=0, ignore_index=True, sort=True)

        # panel of non somatic
        default_pons = pysam.TabixFile(
            os.path.join(args.data_dir, "non_somatic/non_somatic.vcf.gz"))
        user_pons = (pysam.TabixFile(args.non_somatic_panel)
                     if args.non_somatic_panel else None)

        df = rl.indel_reclassifier(df, genome, default_pons, user_pons,
                                   cosmic, chr_prefixed)

        # postProcessing & VCF formatting
        df, df_filtered = rl.indel_postprocessor(df, df_filtered, genome,
                                                 exons, chr_prefixed)
        rl.indel_vcf_writer(
            df, df_filtered, args.fasta, genome, alignments, chr_prefixed,
            args.output_vcf, model_dir, __version__)

        print("rnaindel analysis completed successfully.", file=sys.stdout)
def setUp(self): self.file = pysam.FastaFile(os.path.join(BAM_DATADIR, "ex1.fa"))
def test_open_file_without_index_succeeds(self):
    with pysam.FastaFile(self.filename) as inf:
        self.assertEqual(len(inf), 2)
def __call__(self):
    fastaFile = pysam.FastaFile(self.args.fastainput)
    bamFile = pysam.AlignmentFile(self.args.BAMinput, "rb")
    ssl_settings = {'ca': self.args.sslpath}
    con = MySQLdb.connect(self.args.server, self.args.user, self.args.password,
                          self.args.database, ssl=ssl_settings)
    with con:
        cur = con.cursor()
        cur.execute("USE " + self.args.database)

        def batch_gen(data, batch_size):
            for i in range(0, len(data), batch_size):
                yield data[i:i+batch_size]

        # the opened AlignmentFile is "bamFile"; it is used for all fetch() calls below
        references = sorted(set(bamFile.getrname(read.tid) for read in bamFile.fetch()))
        referencesLeng = sorted(set(len(fastaFile.fetch(reference=str(item))) for item in references))

        for ref, leng in zip(references, referencesLeng):
            print ref, leng
            cur.execute('INSERT INTO templates(protein, length) VALUES(%s, %s)', (ref, leng))

        for reference in references:
            returned_position_lines = []
            length = 0
            refcodonpos = 0
            counter = 0
            for codon in batch_gen(fastaFile.fetch(reference=str(reference)), 3):
                length += 3
                markerlist = []
                referenceid = str(reference) + ' '
                refnucpos1 = 0 + (3*refcodonpos)
                refnucpos2 = 1 + (3*refcodonpos)
                refnucpos3 = 2 + (3*refcodonpos)
                if 1 <= (refcodonpos+1) <= 9:
                    refcodonposid = str(refcodonpos+1) + " "
                else:
                    refcodonposid = str(refcodonpos+1)
                refAAid = str(Seq(codon).translate()[0])
                marker_list = []
                for read in bamFile.fetch():
                    read_codon = []
                    for seq, pos in zip(read.seq, AlignedSegment.get_reference_positions(read)):
                        if pos == refnucpos1:
                            read_codon.append(seq)
                        if pos == refnucpos2:
                            read_codon.append(seq)
                        if pos == refnucpos3:
                            read_codon.append(seq)
                    if any(read_codon) is True:
                        if len(read_codon) == 3:
                            counter += 1
                            if ''.join(read_codon) == codon:
                                marker_list.append('.')
                            else:
                                marker_list.append(str(Seq("".join(read_codon)).translate()[0]))
                print (referenceid, refcodonposid, refAAid, counter, ''.join(str(item) for item in marker_list))
                returned_position_lines.append(''.join(str(item) for item in marker_list))
                cur.execute("INSERT INTO sites(template_id, position, wild_type_AA) VALUES((SELECT id from templates WHERE protein=%s), %s, %s)", (reference, refcodonposid, refAAid))
                counter = 0
                refcodonpos += 1
            print returned_position_lines
            AAs = ('A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L',
                   'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', '*')
            for AA in AAs:
                position = 0
                for line in returned_position_lines:
                    position += 1
                    count = 0
                    for readAA in line:
                        if readAA == AA:
                            count += 1
                    if (count >= 1):
                        print count, AA, position
                        cur.execute("INSERT INTO substitutions(site_id, substitution, count) VALUES((SELECT id from sites WHERE position=%s AND template_id=(SELECT id from templates WHERE protein=%s)), %s, %s)", (position, reference, AA, count))
        con.commit()
    fastaFile.close()
    bamFile.close()
def convert_bed_to_vcf(bed_filename, reference_filename, vcf_filename, sample, variant_type):
    # Get variants.
    if variant_type == "sv":
        columns = (0, 1, 2, 3, 4, 5, 7, 9, 10, 12, 14)
        names = ("chrom", "start", "end", "sv_call", "event_size", "sv_sequence",
                 "contig", "contig_start", "contig_end", "genotype", "repeat_type")
        fmt = ["GT"]
    elif variant_type == "indel":
        # chr1  94824  94827  3  Cttttcttttttttt  1  1  29.04  deletion
        columns = (0, 1, 2, 3, 4, 5, 6, 7, 8)
        names = ("chrom", "start", "end", "event_size", "sv_sequence",
                 "contig_support", "contig_depth", "depth", "sv_call")
        fmt = ["GT"]
    elif variant_type == "inversion":
        columns = (0, 1, 2, 3, 4, 5)
        names = ("chrom", "start", "end", "sv_call", "contig_support", "contig_depth")
        fmt = [""]
    else:
        raise Exception("Unsupported variant type: %s" % variant_type)

    calls = pd.read_table(bed_filename, low_memory=False, keep_default_na=False,
                          index_col=False, header=0)  #, header=None, usecols=columns, names=names)

    calls["sample_name"] = sample
    calls["call_id"] = "."
    calls["quality"] = "30"  #calls.apply(calculate_variant_quality, axis=1)
    calls["filter"] = "PASS"

    # Make sure the sv length and sv sequence agree
    # calls["svLen"] = calls.apply(lambda row: len(GetSeq(row["svSeq"])), axis=1)

    pd.to_numeric(calls["tStart"])
    pd.to_numeric(calls["tEnd"])

    # Get the reference base at the position of the variant start.
    reference = pysam.FastaFile(reference_filename)
    calls["reference"] = calls.apply(lambda row: reference.fetch(
        row["#chrom"], row["tStart"], row["tStart"] + 1).upper(), axis=1)

    # Update start position to be 1-based.
    calls["origTStart"] = calls["tStart"]
    calls["CHROM"] = calls["#chrom"]
    calls["POS"] = calls.apply(lambda row: GetStart(row), axis=1)

    if args.addci is not None:
        calls["CIPOS"] = ["-{},{}".format(args.addci, args.addci)] * len(calls)
        calls["CIEND"] = ["-{},{}".format(args.addci, args.addci)] * len(calls)

    # Build an INFO field for each call.
    calls["svShort"] = calls.apply(lambda row: GetType(row["svType"]), axis=1)
    if variant_type == "sv":
        infoKeys = [("END", "tEnd"), ("SVTYPE", "svShort"), ("SVLEN", "svLen"),
                    ("CONTIG", "qName"), ("CONTIG_START", "qStart"),
                    ("CONTIG_END", "qEnd"), ("SEQ", "svSeq")]

        if "is_trf" in calls:
            infoKeys.append(("IS_TRF", "is_trf"))
        if args.addci is not None:
            infoKeys.append(("CIEND", "CIEND"))
            infoKeys.append(("CIPOS", "CIPOS"))
        if len(args.fields) > 0:
            extraKeys = [(args.fields[i], args.fields[i + 1]) for i in range(0, len(args.fields), 2)]
            infoKeys += extraKeys

        if args.seq:
            calls["reference"] = calls.apply(lambda row: GetRefSeq(row, reference), axis=1)
            calls["alt"] = calls.apply(lambda row: GetAltSeq(row, reference), axis=1)
        else:
            calls["reference"] = calls.apply(lambda row: GetRefSeq(row, reference, 1), axis=1)
            calls["alt"] = calls.apply(lambda row: "<%s>" % row.svType[:3].upper(), axis=1)

        calls["svLen"] = calls.apply(lambda row: GetSVLen(row), axis=1)
        calls["info"] = calls.apply(lambda row: ";".join([
            "=".join(map(str, (item[0], row[item[1]]))) for item in (infoKeys)
        ]), axis=1)
        # import pdb
        # pdb.set_trace()
        calls["svLen"] = calls.apply(lambda row: ParseSVLen(row["svLen"]), axis=1)
        calls["format"] = ":".join(fmt)

        if "hap" in calls:
            calls["genotype"] = calls.apply(lambda row: GetGenotype(row.hap), axis=1)
        else:
            calls["genotype"] = ["./."] * len(calls["tEnd"])

    elif variant_type == "indel":
        calls["reference"] = calls.apply(lambda row: GetRefSeq(row, reference), axis=1)
        calls["alt"] = calls.apply(lambda row: GetAltSeq(row, reference), axis=1)
        calls["format"] = ":".join(fmt)
        if "hap" in calls:
            calls["genotype"] = calls.apply(lambda row: GetGenotype(row.hap), axis=1)
        else:
            calls["genotype"] = ["./."] * len(calls["tEnd"])
        calls["svLen"] = calls.apply(lambda row: GetSVLen(row), axis=1)
        calls["info"] = calls.apply(lambda row: ";".join([
            "=".join(map(str, item)) for item in (("END", row["tEnd"]),
                                                  ("SVTYPE", row["svType"]),
                                                  ("SVLEN", row["svLen"]),
                                                  ("SAMPLES", row["sample_name"]),
                                                  ("SEQ", row["svSeq"]))
        ]), axis=1)

    elif variant_type == "inversion":
        calls["alt"] = "<INV>"
        calls["info"] = calls.apply(lambda row: ";".join([
            "=".join(map(str, item)) for item in (("END", row["tEnd"]),
                                                  ("SVTYPE", row["svType"]),
                                                  ("SVLEN", row["svLen"]),
                                                  ("SAMPLES", row["sample_name"]),)
        ]), axis=1)

    simple_calls = calls[[
        "#chrom", "POS", "call_id", "reference", "alt", "quality", "filter",
        "info", "format", "genotype"
    ]].rename(
        {
            "#chrom": "#CHROM",
            "reference": "REF",
            "call_id": "ID",
            "quality": "QUAL",
            "info": "INFO",
            "alt": "ALT",
            "filter": "FILTER",
            "format": "FORMAT",
            "genotype": sample
        },
        axis=1)

    faiFile = open(args.reference + ".fai")
    fai = []
    for line in faiFile:
        vals = line.split()
        fai.append([vals[0], vals[1]])

    # Save genotypes as tab-delimited file.
    with open(vcf_filename, "w") as vcf:
        vcf.write("##fileformat=VCFv4.2\n")
        vcf.write("##fileDate=%s\n" % datetime.date.strftime(datetime.date.today(), "%Y%m%d"))
        vcf.write("##source={}\n".format(args.source))
        vcf.write('##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">' + "\n")
        vcf.write('##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Difference in length between REF and ALT alleles">' + "\n")
        vcf.write('##INFO=<ID=END,Number=1,Type=Integer,Description="End coordinate of this variant">' + "\n")
        vcf.write('##INFO=<ID=CONTIG,Number=1,Type=String,Description="Name of alternate assembly contig">' + "\n")
        vcf.write('##INFO=<ID=CONTIG_START,Number=1,Type=Integer,Description="Start coordinate of this variant in the alternate assembly contig">' + "\n")
        vcf.write('##INFO=<ID=CONTIG_END,Number=1,Type=Integer,Description="End coordinate of this variant in the alternate assembly contig">' + "\n")
        vcf.write('##INFO=<ID=SEQ,Number=1,Type=String,Description="Sequence associated with variant">' + "\n")
        for i in range(0, len(fai)):
            vcf.write("##contig=<ID={},length={}>".format(fai[i][0], fai[i][1]) + "\n")
        vcf.write("##SAMPLE=<ID={}>\n".format(args.sample))
        vcf.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">' + "\n")
        if args.info is not None:
            vcf.write("\n".join(args.info) + "\n")
        simple_calls.to_csv(vcf, sep="\t", index=False)
def main():
    # argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument('-fourfold', help='bed file of fourfold sites with final column listing gene name', required=True)
    parser.add_argument('-ref', help='Reference genome', required=True)
    parser.add_argument('-gc_thres', help=argparse.SUPPRESS, default=72)
    args = parser.parse_args()

    ref_genome = pysam.FastaFile(args.ref)
    trans_data = pysam.TabixFile(args.fourfold)
    chromosomes = trans_data.contigs
    out_stem = args.fourfold.replace('.bed.gz', '')
    bed_out = '{}_maxgc{}.bed'.format(out_stem, args.gc_thres)
    gc_out = '{}_gc_content.txt'.format(out_stem)
    gene_dict = {}

    # loop through all chromosomes in the fourfold bed
    for chromo in chromosomes:
        ref_str = ref_genome.fetch(chromo)

        # process each bed chromosome
        for line in trans_data.fetch(chromo, parser=pysam.asTuple()):
            chromo, start, stop, trans_id = line[0], int(line[1]), int(line[2]), line[3]

            # add relevant keys
            if chromo not in gene_dict.keys():
                gene_dict[chromo] = {trans_id: [0, 0, 0]}
            if trans_id not in gene_dict[chromo].keys():
                gene_dict[chromo][trans_id] = [0, 0, 0]

            # get ref string for region
            ref_seq = ref_str[start: stop].upper()
            at = ref_seq.count('A') + ref_seq.count('T')
            gc = ref_seq.count('G') + ref_seq.count('C')
            gene_dict[chromo][trans_id][0] += gc
            gene_dict[chromo][trans_id][1] += at
            percent_gc = (gene_dict[chromo][trans_id][0] /
                          float(gene_dict[chromo][trans_id][0] + gene_dict[chromo][trans_id][1])) * 100.0
            gene_dict[chromo][trans_id][2] = percent_gc

    trans_data.close()

    # process gc content
    with open(gc_out, 'w') as gc_file:
        failing_trans = []
        print('transcript\tgc\tat\tpercent_gc', file=gc_file)
        for x in gene_dict.keys():
            for transcript in gene_dict[x].keys():
                # values are stored as [gc, at, percent_gc]
                gc_cont, at_cont, gc_percent = gene_dict[x][transcript]
                print(transcript, gc_cont, at_cont, gc_percent, sep='\t', file=gc_file)
                if gc_percent > args.gc_thres:
                    failing_trans.append(transcript)

    # filter bed
    with open(bed_out, 'w') as bed_file:
        # open the gzipped bed in text mode so lines compare as str, not bytes
        for bed_line in gzip.open(args.fourfold, 'rt'):
            trans_id = bed_line.rstrip().split()[-1]
            if trans_id in failing_trans:
                continue
            else:
                print(bed_line.rstrip(), file=bed_file)

    # bgzip and tabix
    subprocess.call('bgzip {}'.format(bed_out), shell=True)
    subprocess.call('tabix -pbed {}.gz'.format(bed_out), shell=True)
def main():
    description = """SplitStrains detects minor/major strains and classifies reads.
    In addition, it produces 2 plots: histogram and scatter plots for visual inspection
    and parameter tuning (see figures in output dir)."""

    parser = argparse.ArgumentParser(description=description, add_help=False)
    arg_required = parser.add_argument_group('required arguments')
    arg_required.add_argument(dest='bamFilePath', metavar='bamFilePath', help='Input bam file')
    arg_required.add_argument('-o', metavar='dir', required=True, dest='outputDir', help='Output directory.')
    arg_required.add_argument('-fd', metavar='n', required=True, default=75, dest='depthThreshold', type=int, help='Do not consider pileup columns with the depth percentage less than n percent. Setting this to 75 means ignore sites with depth coverage less than 75%% of the bam avg depth. Default=75.')

    arg_optional = parser.add_argument_group('optional arguments')
    arg_optional.add_argument("-h", "--help", action="help", help="show this help message and exit")
    arg_optional.add_argument('-c', '--classify', action='store_true', help='If this option is specified then the program will run reads classification, otherwise it will detect means and produce histogram png.')
    arg_optional.add_argument('-z', '--reuse', action='store_true', help='If this flag is specified the program will reuse the csv file from the previous run.')
    arg_optional.add_argument('-mo', metavar='gmm/bmm', dest='model', type=str, help='Specify clustering model: GMM or BMM. Default GMM.', default='gmm')
    arg_optional.add_argument('-f', metavar="plotName", dest='plotName', default='plot', help='Name for the histogram figure.')
    arg_optional.add_argument('-s', metavar='n', dest='regionStart', type=int, help='Specify the start position on the genome. Default=0.')
    arg_optional.add_argument('-e', metavar='n', dest='regionEnd', type=int, help='Specify the end position on the genome. Default is the genome length.')
    arg_optional.add_argument('-r', metavar='ref', dest='ref', help='Genome reference. It is highly recommended to use the default reference file for compatibility with the GFF file.', default='refs/tuberculosis.fna')
    arg_optional.add_argument('-b', metavar='gff', dest='gff', help='Use gff file to process only gff regions. It is highly recommended to use the default GFF file as it takes care of problematic genomic regions.', default='refs/tuberculosis.filtered-intervals.gff')
    arg_optional.add_argument('-i', metavar='n', default=150, dest='step', type=int, help='Step for snp cluster detection. Default=150.')
    arg_optional.add_argument('-g', metavar='n', default=2, type=int, dest='components', help='GMM model components. Default=2.')
    arg_optional.add_argument('-ft', metavar='n', default=1, dest='proportion_count_threshold', help='Filter out proportions which have count less than n. Default=1')
    arg_optional.add_argument('-fe', metavar='n', default=0, dest='entropy_thresh', help='Entropy filtering threshold. Set to 0 to turn off entropy filtering. Default=0.')
    arg_optional.add_argument('-a', metavar='n', default=0.05, dest='alpha_level', help='Significance level alpha. The probability of rejecting a single strain hypothesis when it is true. Default=0.05.')
    arg_optional.add_argument('-fes', metavar='n', type=int, default=70, dest='entropy_step', help='Entropy filtering step. Defines the step length on freqVec.csv for entropy filtering computation. Default=200.')
    arg_optional.add_argument('-u', metavar='n', type=int, default=90, dest='upperLimit', help='Do not consider proportion of bases beyond n value. Default=90.')
    arg_optional.add_argument('-l', metavar='n', type=int, default=10, dest='lowerLimit', help='Do not consider proportion of bases below n value. Default=10.')
    arg_optional.add_argument('-m', metavar='n', type=int, default=20, dest='mapQuality', help='Do not consider reads below n map quality. Default=20.')
    arg_optional.add_argument('-q', metavar='n', type=int, default=10, dest='baseQuality', help='Do not consider bases below n quality. Default=10.')

    args = parser.parse_args()

    components = args.components  # gmm components. For 2 strains 2 components.
    proprtionCountThresh = args.proportion_count_threshold
    depthThreshold = args.depthThreshold  # pileup columns with depth less than filter value are skipped. Helps to reduce noise for gmm fitting
    lowerLimit = args.lowerLimit
    upperLimit = args.upperLimit
    regionStart = args.regionStart
    regionEnd = args.regionEnd
    step = args.step
    baseQuality = args.baseQuality  # samtools default mpileup quality filter is 13
    mapQuality = args.mapQuality
    outputDir = args.outputDir
    plotName = args.plotName
    refFastaPath = args.ref  # path to a ref fasta file
    bamFilePath = args.bamFilePath  # path to bam file
    gffFilePath = args.gff
    entropy_step = args.entropy_step
    ethreshold = float(args.entropy_thresh)
    useModel = args.model
    reuseFreqVec = args.reuse
    alpha_level = float(args.alpha_level)

    logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO, stream=sys.stdout)

    # Ref path
    installed_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    if refFastaPath == "refs/tuberculosis.fna":
        refFastaPath = os.path.join(installed_path, refFastaPath)
    if gffFilePath == "refs/tuberculosis.filtered-intervals.gff":
        gffFilePath = os.path.join(installed_path, gffFilePath)

    try:
        samfile = pysam.AlignmentFile(bamFilePath, "rb")  # read bam file
        refFile = pysam.FastaFile(refFastaPath)  # read reference fasta file
    except FileNotFoundError:
        logging.error(f'{bamFilePath} or {refFastaPath} is not found.')
        exit()

    logging.info('splitStrain.py has started.')

    refName = samfile.references[0]
    refLength = samfile.lengths[0]

    # Parsing interval
    if not regionStart:
        regionStart = 0
    if not regionEnd:
        regionEnd = refLength
    if (regionEnd > refLength):
        logging.warning('regionEnd > reference length.')

    interval = regionEnd - regionStart
    if interval < 1000000:
        logging.warning(f'the interval length {interval} is too small.')

    logging.info(f'sample name: {bamFilePath}')
    logging.info(f'reference name: {refName}, reference length: {refLength}')
    logging.info(f'regionStart: {regionStart}, regionEnd: {regionEnd}')
    logging.info(f'depth threshold percent: {depthThreshold}')
    logging.info(f'entropy threshold: {ethreshold}')

    intervals = []  # list of Interval objects. This will be populated if gff file is provided
    freqVec = []    # vector format [a prop, c prop, t prop, g prop, position, depth]
    freqVecCSV = 'freqVec.csv'

    # Create output directory
    os.makedirs(outputDir, exist_ok=True)

    # compute freqVec
    if reuseFreqVec == False:
        # If gff file is provided, compute on regions specified in a gff file
        if gffFilePath != '':
            logging.info(f'using gff: {gffFilePath}')
            intervals = getIntervals(gffFilePath, regionStart, regionEnd)
            for interval in intervals:
                freqVec = computeDataFromSam(freqVec, samfile, refFile, baseQuality, mapQuality, interval.start, interval.end)
        else:
            freqVec = computeDataFromSam(freqVec, samfile, refFile, baseQuality, mapQuality, regionStart, regionEnd)

        freqVec = np.array(freqVec)

        # terminate if freqVec has less than 2 entries
        if freqVec.size < 2:
            logging.warning('No SNPs found on the given interval.')
            exit()

        # write freqVec to a file
        try:
            np.savetxt(f'{outputDir}/{freqVecCSV}', freqVec, delimiter=',')
            # np.savetxt(f'{outputDir}/{freqVecCSV}', freqVec, delimiter=',', fmt='%i')
        except IOError:
            logging.error(f'failed to save the csv {outputDir}/{freqVecCSV}.')
            exit()
    # if reuse is set then load freqVec
    else:
        try:
            logging.info(f'loading csv {outputDir}/{freqVecCSV} from the previous run')
            freqVec = np.loadtxt(open(f'{outputDir}/{freqVecCSV}', 'rb'), delimiter=',', dtype=float)
            assert len(freqVec) != 0, f'{freqVecCSV} is empty.'
        except IOError:
            logging.error(f'failed to load the csv {outputDir}/{freqVecCSV}. Please check if the file exists.')
            exit()
        except AssertionError as error:
            logging.error(error)
            exit()

    logging.debug('Starting filterVec()')
    originalFreqVec = freqVec.copy()

    # compute avg depth using freqVec
    avgDepth = freqVec[:,-1].mean()
    minDepth = avgDepth * depthThreshold / 100

    freqVec, entropyVec = filterVec(freqVec, minDepth, ethreshold, entropy_step, lowerLimit, upperLimit)
    plotScatter(outputDir, freqVec, originalFreqVec, plotName, entropyVec, regionStart, regionEnd, lowerLimit, upperLimit)

    num_iter = 20
    init_p = 0.7
    init_err = 0.001

    freqVec = freqVec[np.max(freqVec[:,:4], axis=1) < upperLimit]

    # call single strain if not enough variation is found
    if len(freqVec) < 5:
        logging.info(f'Not enough variant sites.')
        writeResult(bamFilePath, 0, 0, alpha_level, [1])
        exit()

    # test null and alt hypothesis
    thresh, LR = likelyhood_ratio_test(freqVec, alpha_level, upperLimit, num_iter, init_p, init_err)

    # if test calls single strain exit
    if LR < thresh:
        # logging.info(f'LR test result: {bamFilePath} Single strain.')
        writeResult(bamFilePath, LR, thresh, alpha_level, [1])
        exit()

    if components == 2:
        # consider reference base frequencies in the histogram and fitting
        freqVecFlat = np.absolute(freqVec[:,:-2].flatten())
    else:
        # do not consider base frequencies. Ref bases frequencies will be filtered out since they are negative
        freqVecFlat = freqVec[:,:-2].flatten()

    freqVecFlat = freqVecFlat[freqVecFlat > lowerLimit]
    freqVecFlat = freqVecFlat[freqVecFlat < upperLimit]

    # TODO change box size to a parameter
    freqVecFlat = convolveVec(freqVecFlat, proprtionCountThresh, [1])

    if freqVecFlat.size < components:
        logging.info(f'Not enough SNP frequencies.')
        writeResult(bamFilePath, LR, thresh, alpha_level, [1])
        exit()

    # Fit data with Gaussian Mixture
    gmm = fitDataGMM(freqVecFlat, components)
    init_proportions = gmm.means_.flatten()/100

    for p in init_proportions:
        if np.isclose(p, 0):
            logging.error('Unable to fit the data. Check if depth filtering, entropy filtering or intervals are reasonable.')
            exit()

    # specify which model to use
    if useModel == 'bmm':
        # Fit data with Binomial Mixture
        avgDepth = int(freqVec[:,-1].mean())
        bmm = fitDataBMM(freqVec, avgDepth, lowerLimit, upperLimit, init_proportions, components)
        bmm.set_prob(bmm.get_proportions()/np.sum(bmm.get_proportions()))
        model = Model(bmm)
    elif useModel == 'gmm':
        model = Model(gmm)
    else:
        logging.error('Wrong model name: Use either gmm or bmm.')
        exit()

    logging.info(f'using the model:{model}')

    means = model.get_strain_proportions()
    means = roundUP(means)

    if components == 2:
        if (means[0] > 50 and means[1] > 50) or (means[0] < 50 and means[1] < 50):
            logging.warning(f'result: Could not fit the data {bamFilePath}. Incorrect means:{means[0]}, {means[1]}. Possibly 50:50 split.')
            exit()

    writeResult(bamFilePath, LR, thresh, alpha_level, means/np.sum(means))

    originalFrecVecFlat = originalFreqVec[:,:-2].flatten()
    originalFrecVecFlat = originalFrecVecFlat[originalFrecVecFlat > 2]
    originalFrecVecFlat = originalFrecVecFlat[originalFrecVecFlat < 98]

    plotHist(outputDir, originalFrecVecFlat, freqVecFlat, gmm, plotName)

    if args.classify == True:
        logging.info('starting strain separation')
        result = bayesClassifyReads(outputDir, originalFreqVec, refName, samfile, refFile,
                                    model, components, baseQuality, mapQuality, step)
        if result == 0:
            logging.info('separation is complete.')
        else:
            logging.error('separation was not completed.')
            seq = refbase + "~" + seq
        # pad to 3 digits b/c ref length is max 3 digits
        mutation = format(pos + 1, "03") + "_" + seq
        mutated_reads[mutation].append(name)
    samfile.close()
    with open(
            "./mutations/{}.mutated_reads.pkl".format(
                os.path.basename(file)), "wb") as f:
        pickle.dump(mutated_reads, f)
    return mutated_reads


# get reference sequence
print("Getting reference sequence...")
reffile = pysam.FastaFile("ref.fa")
ref = reffile.fetch("ref", 0, 150)
reffile.close()

# get counter of all mutations
print("Counting all mutations...")
pathlib.Path('./mutations').mkdir(exist_ok=True)
mutation_counter = Counter()
for file in [f for f in os.listdir("./split") if f.endswith(".bam")]:
    mutation_counter += count_mutations_in_file(ref, os.path.join("./split", file))

# list 10 most common mutations
mutation_list = mutation_counter.most_common(10)
print("Most common mutations: " + str(mutation_list))
def data_processing(self):
    """
    Generate the consensus sequence and find indels. Write the frequency file.
    Called by pathos pool
    :return:
    """

    self.log.info("Begin Processing {}".format(self.index_name))

    """
    Summary_Data List: index_name, total aberrant, left deletions, right deletions, total deletions,
    left insertions, right insertions, total insertions, microhomology, number filtered, target_name
    """
    target_name = self.index_dict[self.index_name][7]
    self.summary_data = [self.index_name, 0, 0, 0, 0, 0, [0, 0], [0, 0], 'junction data', target_name, [0, 0]]
    junction_type_data = [0, 0, 0, 0, 0]
    read_results_list = []
    results_freq_dict = collections.defaultdict(list)
    refseq = pysam.FastaFile(self.args.RefSeq)

    # Get the genomic 5' coordinate of the reference target region.
    try:
        start = int(self.target_dict[target_name][2])
    except IndexError:
        self.log.error("Target file incorrectly formatted for {}".format(target_name))
        return

    # Get the genomic 3' coordinate of the reference target region.
    stop = int(self.target_dict[target_name][3])
    chrm = self.target_dict[target_name][1]

    # Get the sequence of the sgRNA.
    sgrna = self.target_dict[target_name][4]

    # Get the Target Region.  This allows both types of genomic indices.
    try:
        refseq.fetch(chrm, start, stop)
    except KeyError:
        chrm = "chr{}".format(chrm)
        try:
            self.target_region = refseq.fetch(chrm, start, stop)
        except KeyError:
            self.target_region = str(pyfaidx.Fasta(self.args.RefSeq)[0]).upper()

    # Tool_Box.debug_messenger([target_name, self.target_region])

    self.cutsite_search(target_name, sgrna, chrm, start, stop)
    self.window_mapping()
    loop_count = 0
    start_time = time.time()
    split_time = start_time

    # Extract and process read 1 and read 2 from our list of sequences.
    for seq in self.sequence_list:
        loop_count += 1

        if loop_count % 5000 == 0:
            self.log.info("Processed {} reads of {} for {} in {} seconds.  Elapsed time: {} seconds."
                          .format(loop_count, len(self.sequence_list), self.index_name,
                                  time.time() - split_time, time.time() - start_time))
            split_time = time.time()

        consensus_seq = seq

        # No need to attempt an analysis of bad data.
        if consensus_seq.count("N") / len(consensus_seq) > float(self.args.N_Limit):
            self.summary_data[7][0] += 1
            continue

        # No need to analyze sequences that are too short.
        if len(consensus_seq) <= int(self.args.Minimum_Length):
            self.summary_data[7][0] += 1
            continue

        '''
        The summary_data list contains information for a single library.
        [0] index name; [1] reads passing all filters; [2] left junction count; [3] right junction count;
        [4] insertion count; [5] microhomology count; [6] [No junction count, no cut count];
        [7] [consensus N + short filtered count, unused]; [8] junction_type_data list; [9] target name;
        10 [HR left junction count, HR right junction count]

        The junction_type_data list contains the repair type category counts.
        [0] TMEJ, del_size >= 4 and microhomology_size >= 2; [1] NHEJ, del_size < 4 and ins_size < 5;
        [2] insertions >= 5 [3] Junctions with scars not represented by the other categories;
        [4] Non-MH Deletions, del_size >= 4 and microhomology_size < 2 and ins_size < 5
        '''

        # count reads that pass the read filters
        self.summary_data[1] += 1

        # The cutwindow is used to filter out false positives.
        cutwindow = self.target_region[self.cutsite-4:self.cutsite+4]

        sub_list, self.summary_data = \
            SlidingWindow.sliding_window(
                consensus_seq, self.target_region, self.cutsite, self.target_length, self.lower_limit,
                self.upper_limit, self.summary_data, self.left_target_windows, self.right_target_windows,
                cutwindow, self.hr_donor)

        '''
        The sub_list holds the data for a single consensus read.  These data are
        [left deletion, right deletion, insertion, microhomology, consensus sequence].  The list could be
        empty if nothing was found or the consensus was too short.
        '''
        if sub_list:
            read_results_list.append(sub_list)
            freq_key = "{}|{}|{}|{}|{}".format(sub_list[0], sub_list[1], sub_list[2], sub_list[3], sub_list[9])
        else:
            continue

        if freq_key in results_freq_dict:
            results_freq_dict[freq_key][0] += 1
        else:
            results_freq_dict[freq_key] = [1, sub_list]

    self.log.info("Finished Processing {}".format(self.index_name))

    # Write frequency results file
    self.frequency_output(self.index_name, results_freq_dict, junction_type_data)

    # Format and output raw data if user has so chosen.
    if self.args.OutputRawData:
        self.raw_data_output(self.index_name, read_results_list)

    return self.summary_data
#ctg2_left_rc  177718  A  ctg2_100x_PB_L_5164_1_0  None  *
#code based on: http://pysam.readthedocs.org/en/latest/
#argparse info: http://www.cyberciti.biz/faq/python-command-line-arguments-argv-example/

import pysam
import argparse
import csv

parser = argparse.ArgumentParser(
    description='usage: samtools_view.py --bam reads.bam --bed bed_file.bed')
parser.add_argument('--bam', help='Input bam file name', required=True)
parser.add_argument('--bed', help='Input bedfile name', required=True)
parser.add_argument('--fasta', help='Input fasta reference file name', required=True)
args = parser.parse_args()

bamfile = pysam.AlignmentFile(args.bam, "rb")
fastafile = pysam.FastaFile(args.fasta)

with open(args.bed) as bed:
    reader = csv.reader(bed, delimiter="\t")
    sites = list(reader)

for site in sites:
    start = int(site[1])
    end = int(site[2])
    pileup = bamfile.pileup(site[0], start, end, stepper="all", max_depth=500000)
    for pileupColumn in pileup:
        for pileupRead in pileupColumn.pileups:
            if (pileupColumn.pos >= start) and (pileupColumn.pos < end) and (
for annotation in annotations:
    if annotation.consequence:
        cons_annotations.append(annotation)
    else:
        nocons_annotations.append(annotation)

annotationFeatures = []
for annotation in annotations:
    if hasattr(annotation, 'features'):
        annotationFeatures.extend(annotation.features)
    else:
        annotationFeatures.append(annotation.name)
annotationFeatures = OrderedDict.fromkeys(annotationFeatures).keys()  # remove duplicates

genome_index = pysam.FastaFile(reference)

for annotation in annotations:
    if hasattr(annotation, 'load'):
        annotation.load(args)

info_columns = vcf_reader.infos['CSQ'].desc.split('Format: ')[1].split('|')

output_columns = ['Chrom', 'Pos', 'Ref', 'Alt', 'Type']
output_columns.extend(annotationFeatures)
if args.header:
    stdout.write('#' + '\t'.join(output_columns) + '\n')

# processing
for record in vcf_reader:
def _check(self, files, expected):
    for file, exp in zip(files, expected):
        with pysam.FastaFile(file.name) as fh:
            self.assertEqual(exp, fh.references)
def main():
    parser = argparse.ArgumentParser(
        description="Script to convert VCF files into tsinfer input.")
    parser.add_argument("source", choices=["1kg", "sgdp", "ukbb"],
                        help="The source of the input data.")
    parser.add_argument("data_file", help="The input data file pattern.")
    parser.add_argument("ancestral_states_file",
                        help="A vcf file containing ancestral allele states. ")
    parser.add_argument("output_file", help="The tsinfer output file")
    parser.add_argument(
        "-m", "--metadata_file", default=None,
        help="The metadata file containing population and sample data")
    parser.add_argument("-n", "--max-variants", default=None, type=int,
                        help="Keep only the first n variants")
    parser.add_argument(
        "-p", "--progress", action="store_true",
        help="Show progress bars and output extra information when done")
    parser.add_argument(
        "--ancestral-states-url", default=None,
        help="The source of ancestral state information for provenance.")
    parser.add_argument("--reference-name", default=None,
                        help="The name of the reference for provenance.")

    args = parser.parse_args()

    git_hash = subprocess.check_output(["git", "rev-parse", "HEAD"])
    git_provenance = {
        "repo": "[email protected]:mcveanlab/treeseq-inference.git",
        "hash": git_hash.decode().strip(),
        "dir": "human-data",
        "notes:": ("Use the Makefile to download and process the upstream data files")
    }
    data_provenance = {
        "ancestral_states_url": args.ancestral_states_url,
        "reference_name": args.reference_name
    }

    # Get the ancestral states.
    fasta = pysam.FastaFile(args.ancestral_states_file)
    # NB! We put in an extra character at the start to convert to 1 based coords.
    ancestral_states = "X" + fasta.fetch(reference=fasta.references[0])
    # The largest possible site position is len(ancestral_states). Positions must
    # be strictly less than sequence_length, so we add 1.
    sequence_length = len(ancestral_states) + 1

    converter_class = {
        "1kg": ThousandGenomesConverter,
        "sgdp": SgdpConverter,
        "ukbb": UkbbConverter
    }
    try:
        with tsinfer.SampleData(path=args.output_file, num_flush_threads=2,
                                sequence_length=sequence_length) as samples:
            converter = converter_class[args.source](args.data_file, ancestral_states, samples)
            converter.process_metadata(args.metadata_file, args.progress)
            converter.process_sites(args.progress, args.max_variants)
            samples.record_provenance(command=sys.argv[0], args=sys.argv[1:],
                                      git=git_provenance, data=data_provenance)
    except Exception as e:
        os.unlink(args.output_file)
        raise e
    print(samples)
def process_messages(processor_pipe, shared_all_loci): reads_to_save = [] print("Starting Message Processing") if USE_LOCAL: in_file = open( "/Users/siakhnin/data/RMNISTHS_30xdownsample_9999999_11000000.mapped.sr.msgpack", "r") read_source = msgpack.load(in_file, encoding='utf-8') in_file.close() else: read_source = KafkaConsumer( 'mapped_reads', group_id='rheos_common', bootstrap_servers=['localhost:9092'], value_deserializer=lambda m: json.loads(m.decode('utf-8'))) start_time = time.time() updated_loci = {} reference_file = "/Users/siakhnin/data/reference/genome.fa" ref = pysam.FastaFile(reference_file).fetch(region="20") reads_list = [] counter = 1 saver_notified = False for message in read_source: # message value and key are raw bytes -- decode if necessary! # e.g., for unicode: `message.value.decode('utf-8')` #print ("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition, # message.offset, message.key, # message.value)) if processor_pipe.poll(): msg = processor_pipe.recv() if msg == SAVED_MSG: saver_notified = False if not saver_notified: processor_pipe.send(NEW_MSG) saver_notified = True print("Sending message to saver") if USE_LOCAL: my_read = message else: my_read = message.value updated_loci.update( process_read(my_read, ref, 0, len(ref), shared_all_loci, reads_to_save)) shared_all_loci.update(updated_loci) updated_loci = {} if counter % 1000 == 0: print("Processed {} messages. Updating shared dictionary".format( counter)) # ts = time.time() # shared_all_loci.update(updated_loci) # te = time.time() # print("Took {} to update {} loci".format(te - ts, len(updated_loci))) # updated_loci = {} #break counter += 1 if counter > 20000: break
def vcf2tsv(is_vcf=None, is_bed=None, is_pos=None, bam_fn=None, truth=None, cosmic=None, dbsnp=None, mutect=None, varscan=None, vardict=None, lofreq=None, scalpel=None, strelka=None, arbitrary_vcfs=None, dedup=True, min_mq=1, min_bq=5, min_caller=0, ref_fa=None, p_scale=None, outfile=None): if arbitrary_vcfs is None: arbitrary_vcfs = [] # Convert contig_sequence to chrom_seq dict: fai_file = ref_fa + '.fai' chrom_seq = genome.faiordict2contigorder(fai_file, 'fai') # Determine input format: if is_vcf: mysites = is_vcf elif is_bed: mysites = is_bed elif is_pos: mysites = is_pos else: mysites = fai_file logger.info('No position supplied. Will evaluate the whole genome.') # Re-scale output or not: if p_scale == None: logger.info('NO RE-SCALING') elif p_scale.lower() == 'phred': p_scale = 'phred' elif p_scale.lower() == 'fraction': p_scale = 'fraction' else: p_scale = None logger.info('NO RE-SCALING') # Define NaN and Inf: nan = float('nan') inf = float('inf') pattern_chr_position = genome.pattern_chr_position ## Running with genome.open_textfile(mysites) as my_sites, open(outfile, 'w') as outhandle: my_line = my_sites.readline().rstrip() bam = pysam.AlignmentFile(bam_fn, reference_filename=ref_fa) ref_fa = pysam.FastaFile(ref_fa) if truth: truth = genome.open_textfile(truth) truth_line = genome.skip_vcf_header(truth) if cosmic: cosmic = genome.open_textfile(cosmic) cosmic_line = genome.skip_vcf_header(cosmic) if dbsnp: dbsnp = genome.open_textfile(dbsnp) dbsnp_line = genome.skip_vcf_header(dbsnp) # 6 Incorporate callers: get thru the #'s if mutect: mutect = genome.open_textfile(mutect) mutect_line = genome.skip_vcf_header(mutect) if varscan: varscan = genome.open_textfile(varscan) varscan_line = genome.skip_vcf_header(varscan) if vardict: vardict = genome.open_textfile(vardict) vardict_line = genome.skip_vcf_header(vardict) if lofreq: lofreq = genome.open_textfile(lofreq) lofreq_line = genome.skip_vcf_header(lofreq) if scalpel: scalpel = genome.open_textfile(scalpel) scalpel_line = genome.skip_vcf_header(scalpel) if strelka: strelka = genome.open_textfile(strelka) strelka_line = genome.skip_vcf_header(strelka) arbitrary_file_handle = {} arbitrary_line = {} for ith_arbi, arbitrary_vcf_i in enumerate(arbitrary_vcfs): arbitrary_file_handle[ith_arbi] = genome.open_textfile( arbitrary_vcf_i) arbitrary_line[ith_arbi] = genome.skip_vcf_header( arbitrary_file_handle[ith_arbi]) # Get through all the headers: while my_line.startswith('#') or my_line.startswith('track='): my_line = my_sites.readline().rstrip() # First coordinate, for later purpose of making sure the input is sorted properly coordinate_i = re.match(genome.pattern_chr_position, my_line) coordinate_i = coordinate_i.group() if coordinate_i else '' # First line: # First line: header_part_1 = out_header.replace('{', '').replace('}', '') additional_arbi_caller_numbers = sorted(arbitrary_file_handle.keys()) for arbi_caller_num in additional_arbi_caller_numbers: header_part_1 = header_part_1 + '\t' + 'if_Caller_{}'.format( arbi_caller_num) header_last_part = label_header.replace('{', '').replace('}', '') outhandle.write('\t'.join((header_part_1, header_last_part)) + '\n') while my_line: # If VCF, get all the variants with the same coordinate into a list: if is_vcf: my_vcf = genome.Vcf_line(my_line) my_coordinates = [(my_vcf.chromosome, my_vcf.position)] variants_at_my_coordinate = [] alt_bases = my_vcf.altbase.split(',') for alt_i in alt_bases: vcf_i = copy(my_vcf) vcf_i.altbase = alt_i variants_at_my_coordinate.append(vcf_i) # As long as the 
"coordinate" stays the same, it will keep reading until it's different. while my_coordinates[0] == (my_vcf.chromosome, my_vcf.position): my_line = my_sites.readline().rstrip() my_vcf = genome.Vcf_line(my_line) ########## This block is code is to ensure the input VCF file is properly sorted ## coordinate_j = re.match(genome.pattern_chr_position, my_line) coordinate_j = coordinate_j.group() if coordinate_j else '' if genome.whoisbehind(coordinate_i, coordinate_j, chrom_seq) == 1: raise Exception( '{} does not seem to be properly sorted.'.format( mysites)) coordinate_i = coordinate_j ################################################################################### if my_coordinates[0] == (my_vcf.chromosome, my_vcf.position): alt_bases = my_vcf.altbase.split(',') for alt_i in alt_bases: vcf_i = copy(my_vcf) vcf_i.altbase = alt_i variants_at_my_coordinate.append(vcf_i) elif is_bed: bed_item = my_line.split('\t') my_coordinates = genomic_coordinates(bed_item[0], int(bed_item[1]) + 1, int(bed_item[2])) elif is_pos: pos_item = my_line.split('\t') my_coordinates = genomic_coordinates(pos_item[0], int(pos_item[1]), int(pos_item[1])) elif fai_file: fai_item = my_line.split('\t') my_coordinates = genomic_coordinates(fai_item[0], 1, int(fai_item[1])) ##### ##### ##### ##### ##### ##### for my_coordinate in my_coordinates: ######## If VCF, can get ref base, variant base, as well as other identifying information ######## if is_vcf: ref_bases = [] alt_bases = [] indel_lengths = [] all_my_identifiers = [] for variant_i in variants_at_my_coordinate: ref_base = variant_i.refbase first_alt = variant_i.altbase.split(',')[0] indel_length = len(first_alt) - len(ref_base) ref_bases.append(ref_base) alt_bases.append(first_alt) indel_lengths.append(indel_length) # Extract these information if they exist in the VCF file, but they could be re-written if dbSNP/COSMIC are supplied. if_dbsnp = 1 if re.search(r'rs[0-9]+', variant_i.identifier) else 0 if_cosmic = 1 if re.search(r'COS[MN][0-9]+', variant_i.identifier) else 0 if_common = 1 if variant_i.get_info_value( 'COMMON') == '1' else 0 num_cases = variant_i.get_info_value( 'CNT') if variant_i.get_info_value('CNT') else nan if variant_i.identifier == '.': my_identifier_i = set() else: my_identifier_i = variant_i.identifier.split(';') my_identifier_i = set(my_identifier_i) all_my_identifiers.append(my_identifier_i) ## If not, 1) get ref_base, first_alt from other VCF files. # 2) Create placeholders for dbSNP and COSMIC that can be overwritten with dbSNP/COSMIC VCF files (if provided) else: variants_at_my_coordinate = [ None ] # Just to have something to iterate ref_base = first_alt = indel_length = None # Could be re-written if dbSNP/COSMIC are supplied. If not, they will remain NaN. 
if_dbsnp = if_cosmic = if_common = num_cases = nan #################################### Find the same coordinate in those VCF files #################################### if mutect: got_mutect, mutect_variants, mutect_line = genome.find_vcf_at_coordinate( my_coordinate, mutect_line, mutect, chrom_seq) if varscan: got_varscan, varscan_variants, varscan_line = genome.find_vcf_at_coordinate( my_coordinate, varscan_line, varscan, chrom_seq) if vardict: got_vardict, vardict_variants, vardict_line = genome.find_vcf_at_coordinate( my_coordinate, vardict_line, vardict, chrom_seq) if lofreq: got_lofreq, lofreq_variants, lofreq_line = genome.find_vcf_at_coordinate( my_coordinate, lofreq_line, lofreq, chrom_seq) if scalpel: got_scalpel, scalpel_variants, scalpel_line = genome.find_vcf_at_coordinate( my_coordinate, scalpel_line, scalpel, chrom_seq) if strelka: got_strelka, strelka_variants, strelka_line = genome.find_vcf_at_coordinate( my_coordinate, strelka_line, strelka, chrom_seq) if truth: got_truth, truth_variants, truth_line = genome.find_vcf_at_coordinate( my_coordinate, truth_line, truth, chrom_seq) if dbsnp: got_dbsnp, dbsnp_variants, dbsnp_line = genome.find_vcf_at_coordinate( my_coordinate, dbsnp_line, dbsnp, chrom_seq) if cosmic: got_cosmic, cosmic_variants, cosmic_line = genome.find_vcf_at_coordinate( my_coordinate, cosmic_line, cosmic, chrom_seq) got_arbitraries = {} arbitrary_variants = {} for ith_arbi in arbitrary_file_handle: got_arbitraries[ith_arbi], arbitrary_variants[ ith_arbi], arbitrary_line[ ith_arbi] = genome.find_vcf_at_coordinate( my_coordinate, arbitrary_line[ith_arbi], arbitrary_file_handle[ith_arbi], chrom_seq) # Now, use pysam to look into the tBAM file(s), variant by variant from the input: for ith_call, my_call in enumerate(variants_at_my_coordinate): if is_vcf: # The particular line in the input VCF file: variant_id = ((my_call.chromosome, my_call.position), my_call.refbase, my_call.altbase) ref_base = ref_bases[ith_call] first_alt = alt_bases[ith_call] indel_length = indel_lengths[ith_call] my_identifiers = all_my_identifiers[ith_call] else: variant_id = ((my_coordinate[0], my_coordinate[1]), ref_base, first_alt) # Reset num_caller to 0 for each variant in the same coordinate num_callers = 0 #################### Collect Caller Vcf ####################: if mutect: mutect_classification, tlod, ecnt = annotate_caller.ssMuTect( variant_id, mutect_variants) num_callers += mutect_classification else: mutect_classification = tlod = ecnt = nan if varscan: varscan_classification, score_varscan2 = annotate_caller.ssVarScan( variant_id, varscan_variants) num_callers += varscan_classification else: varscan_classification = score_varscan2 = nan if vardict: vardict_classification, msi, msilen, shift3, t_pmean, t_pstd, t_qstd = annotate_caller.ssVarDict( variant_id, vardict_variants) num_callers += vardict_classification else: vardict_classification = msi = msilen = shift3 = t_pmean = t_pstd = t_qstd = nan if lofreq: lofreq_classification = annotate_caller.ssLoFreq( variant_id, lofreq_variants) num_callers += lofreq_classification else: lofreq_classification = nan if scalpel: scalpel_classification = annotate_caller.ssScalpel( variant_id, scalpel_variants) num_callers += scalpel_classification else: scalpel_classification = nan if strelka: strelka_classification = annotate_caller.ssStrelka( variant_id, strelka_variants) num_callers += strelka_classification else: strelka_classification = nan arbitrary_classifications = {} for ith_arbi_var in arbitrary_file_handle: 
arbi_classification_i = annotate_caller.anyInputVcf( variant_id, arbitrary_variants[ith_arbi_var]) arbitrary_classifications[ ith_arbi_var] = arbi_classification_i num_callers += arbi_classification_i # Potentially write the output only if it meets this threshold: if num_callers >= min_caller: ########## Ground truth file ########## if truth: if variant_id in truth_variants.keys(): judgement = 1 my_identifiers.add('TruePositive') else: judgement = 0 my_identifiers.add('FalsePositive') else: judgement = nan ########## dbSNP ########## Will overwrite dbSNP info from input VCF file if dbsnp: if_dbsnp, if_common, rsID = annotate_caller.dbSNP( variant_id, dbsnp_variants) for ID_i in rsID: my_identifiers.add(ID_i) ########## COSMIC ########## Will overwrite COSMIC info from input VCF file if cosmic: if_cosmic, num_cases, cosmicID = annotate_caller.COSMIC( variant_id, cosmic_variants) for ID_i in cosmicID: my_identifiers.add(ID_i) ########## ######### INFO EXTRACTION FROM BAM FILES ########## ######### # Tumor tBAM file: tBamFeatures = sequencing_features.from_bam( bam, my_coordinate, ref_base, first_alt, min_mq, min_bq) # Homopolymer eval: homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference( ref_fa, my_coordinate, ref_base, first_alt) # Linguistic sequence complexity in a +/-80bp window, but substring calculation stops at 20-bp substring. seq_span_80bp = ref_fa.fetch( my_coordinate[0], max(0, my_coordinate[1] - 41), my_coordinate[1] + 40) seq_left_80bp = ref_fa.fetch( my_coordinate[0], max(0, my_coordinate[1] - 81), my_coordinate[1]) seq_right_80bp = ref_fa.fetch(my_coordinate[0], my_coordinate[1], my_coordinate[1] + 81) if len(seq_span_80bp) > 20: LC_spanning = sequencing_features.subLC( seq_span_80bp, 20) else: LC_spanning = math.nan if len(seq_left_80bp) > 20: left_LC = sequencing_features.subLC( seq_left_80bp, 20) else: left_LC = math.nan if len(seq_right_80bp) > 20: right_LC = sequencing_features.subLC( seq_right_80bp, 20) else: right_LC = math.nan LC_adjacent = min(left_LC, right_LC) LC_spanning_phred = genome.p2phred(1 - LC_spanning, 40) LC_adjacent_phred = genome.p2phred(1 - LC_adjacent, 40) # Fill the ID field of the TSV/VCF my_identifiers = ';'.join( my_identifiers) if my_identifiers else '.' 
### out_line_part_1 = out_header.format( \ CHROM = my_coordinate[0], \ POS = my_coordinate[1], \ ID = my_identifiers, \ REF = ref_base, \ ALT = first_alt, \ if_MuTect = mutect_classification, \ if_Strelka = strelka_classification, \ if_VarScan2 = varscan_classification, \ if_VarDict = vardict_classification, \ if_LoFreq = lofreq_classification, \ if_Scalpel = scalpel_classification, \ VarScan2_Score = rescale(score_varscan2, 'phred', p_scale, 1001), \ if_dbsnp = if_dbsnp, \ COMMON = if_common, \ if_COSMIC = if_cosmic, \ COSMIC_CNT = num_cases, \ Consistent_Mates = tBamFeatures['consistent_mates'], \ Inconsistent_Mates = tBamFeatures['inconsistent_mates'], \ Seq_Complexity_Span = LC_spanning_phred, \ Seq_Complexity_Adj = LC_adjacent_phred, \ M2_TLOD = tlod, \ M2_ECNT = ecnt, \ MSI = msi, \ MSILEN = msilen, \ SHIFT3 = shift3, \ MaxHomopolymer_Length = homopolymer_length, \ SiteHomopolymer_Length = site_homopolymer_length, \ T_DP = tBamFeatures['dp'], \ tBAM_REF_MQ = '%g' % tBamFeatures['ref_mq'], \ tBAM_ALT_MQ = '%g' % tBamFeatures['alt_mq'], \ tBAM_p_MannWhitneyU_MQ = '%g' % tBamFeatures['p_mannwhitneyu_mq'], \ tBAM_REF_BQ = '%g' % tBamFeatures['ref_bq'], \ tBAM_ALT_BQ = '%g' % tBamFeatures['alt_bq'], \ tBAM_p_MannWhitneyU_BQ = '%g' % tBamFeatures['p_mannwhitneyu_bq'], \ tBAM_REF_NM = '%g' % tBamFeatures['ref_NM'], \ tBAM_ALT_NM = '%g' % tBamFeatures['alt_NM'], \ tBAM_NM_Diff = '%g' % tBamFeatures['NM_Diff'], \ tBAM_REF_Concordant = tBamFeatures['ref_concordant_reads'], \ tBAM_REF_Discordant = tBamFeatures['ref_discordant_reads'], \ tBAM_ALT_Concordant = tBamFeatures['alt_concordant_reads'], \ tBAM_ALT_Discordant = tBamFeatures['alt_discordant_reads'], \ tBAM_Concordance_FET = rescale(tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001), \ T_REF_FOR = tBamFeatures['ref_for'], \ T_REF_REV = tBamFeatures['ref_rev'], \ T_ALT_FOR = tBamFeatures['alt_for'], \ T_ALT_REV = tBamFeatures['alt_rev'], \ tBAM_StrandBias_FET = rescale(tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001), \ tBAM_p_MannWhitneyU_EndPos = '%g' % tBamFeatures['p_mannwhitneyu_endpos'], \ tBAM_REF_Clipped_Reads = tBamFeatures['ref_SC_reads'], \ tBAM_ALT_Clipped_Reads = tBamFeatures['alt_SC_reads'], \ tBAM_Clipping_FET = rescale(tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001), \ tBAM_MQ0 = tBamFeatures['MQ0'], \ tBAM_Other_Reads = tBamFeatures['noise_read_count'], \ tBAM_Poor_Reads = tBamFeatures['poor_read_count'], \ tBAM_REF_InDel_3bp = tBamFeatures['ref_indel_3bp'], \ tBAM_REF_InDel_2bp = tBamFeatures['ref_indel_2bp'], \ tBAM_REF_InDel_1bp = tBamFeatures['ref_indel_1bp'], \ tBAM_ALT_InDel_3bp = tBamFeatures['alt_indel_3bp'], \ tBAM_ALT_InDel_2bp = tBamFeatures['alt_indel_2bp'], \ tBAM_ALT_InDel_1bp = tBamFeatures['alt_indel_1bp'], \ InDel_Length = indel_length) additional_caller_columns = [] for arbi_key_i in additional_arbi_caller_numbers: additional_caller_columns.append( str(arbitrary_classifications[arbi_key_i])) additional_caller_columns = '\t'.join( additional_caller_columns) label_column = label_header.format( TrueVariant_or_False=judgement) if len(additional_arbi_caller_numbers) > 0: out_line = '\t'.join( (out_line_part_1, additional_caller_columns, label_column)) else: out_line = '\t'.join( (out_line_part_1, label_column)) # Print it out to stdout: outhandle.write(out_line + '\n') # Read into the next line: if not is_vcf: my_line = my_sites.readline().rstrip() ########## Close all open files if they were opened ########## opened_files = [ ref_fa, bam, truth, cosmic, dbsnp, mutect, varscan, 
        vardict, lofreq, scalpel, strelka
    ]
    opened_files.extend(arbitrary_file_handle.values())
    for opened_file in opened_files:
        if opened_file:
            opened_file.close()
def find_saturation(bam, ref, start, end, chrom, rs, re, output):
    """
    Reads the BAM file and counts each base at a specific aligned position.
    Compares those reads to the reference and calculates the frequency of
    the SNPs at each position.

    :param string bam: BAM file pathway.
    :param string ref: Reference file pathway.
    :param int start: Start position.
    :param int end: End position.
    :param string chrom: Chromosome (contig) name to process.
    :param float rs: Range start number.
    :param float re: Range end number.
    :param string output: Output file pathway.
    :return: a dictionary of mutations at each position, plus their positions.
    """
    # Read the BAM file.
    bamfile = pysam.AlignmentFile(bam, 'rb')
    # Read reference FASTA file.
    fastafile = pysam.FastaFile(ref)
    mutations = {}
    total_reads = 0
    # fetch() returns all reads overlapping a region sorted by the first
    # aligned base in the reference sequence. Note that it will also return
    # reads that are only partially overlapping with the region.
    # Create the dictionary of dictionaries for SNPs at each position.
    for read in bamfile.fetch(chrom, start, end):
        # get_reference_positions() gives an array of the reference positions
        # covered by the aligned part of the read.
        positions = read.get_reference_positions()
        sequence = read.query_alignment_sequence  # Don't want soft-clipped bases.
        quality = read.mapping_quality
        q_quality = read.query_alignment_qualities
        # Disregard any reads that don't have high mapping accuracy.
        if quality < 40:
            continue
        for i in range(len(positions) - 1):
            # Check the probability that the base at this position is wrong.
            if q_quality[i] < 30 or sequence[i] == 'N':
                continue
            # Make sure that we compute just the specified region.
            if positions[i] >= end or positions[i] < start:
                break
            # Positions start at index 0, which is fine since the reference
            # also starts at 0. Position numbers will be incremented later
            # for VCF file creation.
            if positions[i] not in mutations:
                atcg = {'A': 0, 'T': 0, 'C': 0, 'G': 0}
                atcg[sequence[i]] += 1
                mutations[positions[i]] = atcg
            else:
                mutations[positions[i]][sequence[i]] += 1
    bamfile.close()

    # No-ref, separate mutation-nucleotide fractions approach.
    # mutations = calculate_fractions(mutations, fastafile)

    # No-ref, collected non-reference mutation fractions approach.
    mutations, positions = calculate_fractions_overall(mutations, fastafile,
                                                       chrom, re, rs, output)
    fastafile.close()
    return mutations, positions
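A hedged usage sketch for `find_saturation`. The file paths, contig name, and window are placeholders; the interpretation of `rs`/`re` as lower and upper non-reference-fraction cut-offs is an assumption based on how they are passed on to `calculate_fractions_overall`, which is assumed to be defined elsewhere in this module.

# Count SNP frequencies for a 10 kb window of a hypothetical sample.
mutations, positions = find_saturation(
    bam="sample.sorted.bam",   # coordinate-sorted, indexed BAM (placeholder)
    ref="reference.fa",        # FASTA with a .fai index (placeholder)
    start=100000,
    end=110000,
    chrom="chr1",
    rs=0.05,                   # assumed: lower bound on non-reference fraction
    re=0.95,                   # assumed: upper bound on non-reference fraction
    output="saturation_chr1.vcf",
)
print("{} positions carried at least one mismatch".format(len(mutations)))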
import pysam
import math
import statistics

bam = pysam.AlignmentFile("/home/minime/Scrivania/TEST/20161213_02_Conn.bam", "rb")
fasta = pysam.FastaFile("/home/minime/NGS_TOOLS/hg19/ucsc.hg19.fasta")

chrom = 'chr2'
start = 21225013
stop = 21225014

# print(bam.count(reference=chrom, start=start, end=stop, until_eof=False, read_callback='nofilter'))
print(bam.count_coverage(reference=chrom, start=start, end=stop))
# print(bam.parse_region(reference=chrom, start=start, end=stop, tid=None))

# for pc in bam.pileup(reference=chrom, start=start, end=stop):
#     for reads in pc.pileups:
#         print(reads)

QB = []  # per-read base qualities at the pileup position
MQ = []  # per-read mapping qualities
BQ = []
for pileupcolumn in bam.pileup(reference=chrom, start=start, end=stop):
    if pileupcolumn.reference_pos >= start and pileupcolumn.reference_pos < stop:
        for pileupread in pileupcolumn.pileups:
            # query_position is None for deletions/reference skips; guard the
            # base-quality lookup to avoid a TypeError on such reads.
            if pileupread.query_position is not None:
                QB += [pileupread.alignment.query_qualities[pileupread.query_position]]
            MQ += [pileupread.alignment.mapping_quality]
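The snippet imports `math` and `statistics` but stops before summarising the collected qualities; a small follow-on sketch of what that summary might look like, assuming the QB and MQ lists built in the loop above:

# Summarise the qualities gathered in the pileup loop above.
if QB:
    print("base quality    mean=%.2f stdev=%.2f" % (statistics.mean(QB), statistics.pstdev(QB)))
if MQ:
    print("mapping quality mean=%.2f stdev=%.2f" % (statistics.mean(MQ), statistics.pstdev(MQ)))

bam.close()
fasta.close()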
def main(): # arguments parser = argparse.ArgumentParser(description='') parser.add_argument('-call_fa', help='Callable sites fasta file', required=True) parser.add_argument('-vcf', help='Vcf file to extract site frequencies from', required=True) parser.add_argument( '-cds_bed', help='Bed file of zerofold sites, in form chr\tstart\tstop\gene_id', required=True) parser.add_argument('-out', help='Output file location and name', required=True) parser.add_argument('-sub', help='If specified will submit script to cluster', action='store_true', default=False) parser.add_argument('-evolgen', help='If specified will submit script to lab queue', default=False, action='store_true') args = parser.parse_args() # submission loop if args.sub is True: command_line = [ ' '.join([x for x in sys.argv if x != '-sub' and x != '-evolgen']) ] q_sub(command_line, out=args.out.replace('.txt', '') + 'gene_pi0_pi4', evolgen=args.evolgen, t=48, mem=15, rmem=15) sys.exit() # variables call_fa = pysam.FastaFile(args.call_fa) vcf = pysam.VariantFile(args.vcf) gene_coords = bed_to_dict(args.cds_bed) number_samples = len(vcf.header.samples) out = open(args.out, 'w') # gene by gene calcs print('trans_id', 'pi_indel', 'theta_indel', 'tajd_indel', sep='\t', file=out) for chromosome in gene_coords.keys(): chr_string = call_fa.fetch(chromosome) for trans in gene_coords[chromosome].keys(): call_sites = '' allele_freqs = [] for pos in gene_coords[chromosome][trans]: # get callable site call_pos = chr_string[pos] call_sites += call_pos # get vcf site (try to) var_record = [x for x in vcf.fetch(chromosome, pos, pos + 1)] if len(var_record) == 1: allele_freq = round( var_record[0].info['AC'][0] / float(number_samples * 2), 3) allele_freqs.append(allele_freq) # count callable sites for transcript n_callable = call_sites.upper().count('K') # calc pi if len(allele_freqs) == 0: pie = 0 theta = 0 tajd = 0 else: pie = pi(number_samples, allele_freqs) theta = theta_w(number_samples, len(allele_freqs)) tajd = tajimas_d(number_samples, allele_freqs) if n_callable != 0: pie_per_site = pie / float(n_callable) theta_per_site = theta / float(n_callable) else: pie_per_site, theta_per_site = 0.0, 0.0 print(trans, pie_per_site, theta_per_site, tajd, sep='\t', file=out) out.close()
def __init__(self, snps, fastaFile, filepath_index):
    self.snps = snps
    self.faFile = pysam.FastaFile(fastaFile, filepath_index)
    self.chromosomesWithSNPs = {}
    # IUPAC ambiguity codes keyed by the set of observed bases.
    self.dico = {'AG': 'R', 'CT': 'Y', 'GC': 'S', 'AT': 'W', 'GT': 'K',
                 'AC': 'M', 'CGT': 'B', 'AGT': 'D', 'ACT': 'H', 'ACG': 'V',
                 'ACTG': 'N'}
    print('getTranscriptInformation OK')
def annotate_vcf_n_reads(args): """Entry point to annotate a vcf with read depth and supporting reads.""" ref_fasta = pysam.FastaFile(args.ref_fasta) vcf = VCFReader(args.vcf) chrom = None pref = 'Depth of reads ' suff = ' by strand (fwd, rev)' g_open = 5 g_ext = 3 # use parasail.dnafull (match 5, mismatch -4) # change INFO below if you change this. matrix = parasail.dnafull # check it is indeed a symmetric match = matrix.matrix[0, 0] mismatch = matrix.matrix[0, 1] assert dict( zip(*np.unique(matrix.matrix[:4, :4], return_counts=True))) == { mismatch: 12, match: 4 } assert np.unique(matrix.matrix.diagonal()[:4])[0] == match ann_meta = [ ('INFO', 'DP', 1, 'Integer', pref + 'at pos'), ('INFO', 'DPS', 2, 'Integer', pref + 'at pos' + suff), ('INFO', 'DPSP', 1, 'Integer', pref + 'spanning pos +-{}'.format(args.pad)), ('INFO', 'SR', '.', 'Integer', 'Depth of spanning reads by strand ' + 'which best align to each allele ' + '(ref fwd, ref rev, alt1 fwd, alt1 rev, etc.)'), ('INFO', 'AR', 2, 'Integer', 'Depth of ambiguous spanning reads by ' + 'strand which align equally well to all alleles (fwd, rev)'), ('INFO', 'SC', '.', 'Integer', 'Total alignment score to each allele' + ' of spanning reads by strand ' + '(ref fwd, ref rev, alt1 fwd, alt1 rev, etc.) aligned with parasail' + ' match {}, mismatch {}, open {}, extend {}'.format( match, mismatch, g_open, g_ext)), ] meta_info = vcf.meta + [str(MetaInfo(*m)) for m in ann_meta] with VCFWriter(args.vcfout, 'w', version='4.1', contigs=vcf.chroms, meta_info=meta_info) as vcf_writer: for v in vcf.fetch(): if chrom is None or chrom != v.chrom: chrom = v.chrom ref_seq = ref_fasta.fetch(chrom) # get read depth by strand at the variant (without padding) depth_by_strand = collections.Counter() # medaka.features.get_trimmed_reads seems to behave oddly if the # region only spans 1 base, hence v.pos + 2 var_reg = medaka.common.Region(chrom, v.pos, v.pos + 2) reads = get_trimmed_reads(args.bam, var_reg, partial=True, read_group=args.RG) for is_rev, _ in reads: depth_by_strand[is_rev] += 1 v.info['DP'] = str(sum(depth_by_strand.values())) v.info['DPS'] = '{},{}'.format(depth_by_strand[False], depth_by_strand[True]) # get read depth by strand at the variant (with padding) padded_haps, pad_reg = get_padded_haplotypes(v, ref_seq, args.pad) reads = get_trimmed_reads(args.bam, pad_reg, partial=False, read_group=args.RG) counts, scores = align_reads_to_haps(reads, padded_haps, g_open, g_ext, matrix) v.info['DPSP'] = sum(counts.values()) sr = [] # counts of supporting reads for each hap by strand sc = [] # total scores for each hap by strand haps = list(range(1 + len(v.alt))) # ref and alts is_revs = [False, True] for hap in haps: for is_rev in is_revs: sr.append(counts[(is_rev, hap)]) sc.append(scores[(is_rev, hap)]) v.info['SR'] = ','.join(map(str, sr)) v.info['SC'] = ','.join(map(str, sc)) v.info['AR'] = '{},{}'.format( *[counts[(is_rev, None)] for is_rev in is_revs]) vcf_writer.write_variant(v)
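A small, hedged sketch of the per-allele scoring idea used above: align one made-up spanning read against a reference haplotype and an alternative haplotype with parasail's dnafull matrix and the same gap penalties, then pick the allele with the higher score. This is an illustration only, not medaka's `align_reads_to_haps`; the sequences are hypothetical.

import parasail

g_open, g_ext = 5, 3
matrix = parasail.dnafull  # match 5, mismatch -4 for A/C/G/T

ref_hap = "ACGTACGTTACGGA"   # hypothetical padded reference haplotype
alt_hap = "ACGTACGATTACGGA"  # hypothetical padded alt haplotype (1 bp insertion)
read = "GTACGATTAC"          # hypothetical spanning read

# Local (Smith-Waterman) score of the read against each haplotype.
scores = [
    parasail.sw(read, hap, g_open, g_ext, matrix).score
    for hap in (ref_hap, alt_hap)
]
best = scores.index(max(scores))  # 0 = ref, 1 = alt; a tie would count as ambiguous
print(scores, "-> read supports haplotype", best)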
def variants_from_hdf(args):
    """Entry point for variant calling from HDF5 files.

    A `LabelScheme` read from HDF must define both a `decode_variants`
    and a `decode_consensus` method. The latter is used with
    `join_samples` to detect multi-locus variants spanning `Sample`
    slice boundaries.
    """
    logger = medaka.common.get_named_logger('Variants')
    index = medaka.datastore.DataIndex(args.inputs)
    if args.regions is None:
        args.regions = index.regions

    # lookup LabelScheme stored in HDF5
    try:
        label_scheme = index.metadata['label_scheme']
    except KeyError:
        logger.debug("Could not find `label_scheme` metadata in input file, "
                     "assuming HaploidLabelScheme.")
        label_scheme = medaka.labels.HaploidLabelScheme()

    logger.debug("Label decoding is:\n{}".format('\n'.join(
        '{}: {}'.format(k, v) for k, v in label_scheme._decoding.items())))

    if not hasattr(label_scheme, 'decode_variants'):
        raise AttributeError(
            '{} does not support decoding of variants'.format(label_scheme))

    if not hasattr(label_scheme, 'decode_consensus'):
        raise AttributeError('{} does not support consensus decoding required '
                             'for variant calling.'.format(label_scheme))

    # tell label_scheme whether we want verbose info fields
    label_scheme.verbose = args.verbose

    meta_info = label_scheme.variant_metainfo

    with pysam.FastaFile(args.ref_fasta) as fa:
        lengths = dict(zip(fa.references, fa.lengths))

    with medaka.vcf.VCFWriter(args.output, 'w', version='4.1',
                              contigs=['{},length={}'.format(
                                  r.ref_name, lengths[r.ref_name])
                                  for r in args.regions],
                              meta_info=meta_info) as vcf_writer:
        for reg in args.regions:
            logger.info("Processing {}.".format(reg))
            ref_seq = pysam.FastaFile(
                args.ref_fasta).fetch(reference=reg.ref_name).upper()

            samples = index.yield_from_feature_files([reg])
            trimmed_samples = medaka.common.Sample.trim_samples(samples)
            joined_samples = join_samples(trimmed_samples, ref_seq,
                                          label_scheme)

            for sample in joined_samples:
                variants = label_scheme.decode_variants(
                    sample, ref_seq, ambig_ref=args.ambig_ref)
                vcf_writer.write_variants(variants, sort=True)
def main(): args = supply_args() handle_in_vcf = open(args.input, 'rU') handle_out_vcf = open(args.output, 'w') # broke_samp = [] with handle_in_vcf as vcf: for line in vcf: if not line.startswith('#'): new_line = line.rstrip('\n').split('\t') chrom = new_line[0] pos = new_line[1] rsid = new_line[2] ref = new_line[3] alts = new_line[4] qual = new_line[5] filter = new_line[6] info = new_line[7] samples = new_line[9:] if ',' in alts: alt_allele = alts.split(',') genos = geno_prob_parse(len(alt_allele)) for i in range(1, len(alt_allele)+1): # Assess alt allele here, if asterisk. if alt_allele[i-1] == '*': extra_base = pysam.FastaFile(args.ref).fetch(chrom, int(pos)-2, int(pos)-1) pos = str(int(pos) - 1) ref = extra_base + ref alt_allele[i-1] = extra_base to_write = [chrom, pos, rsid, ref] gl_ind = include_gl(genos, i) to_write.append(alt_allele[i-1]) to_write.extend([qual, filter]) to_write.append(info_break(info, i-1)) try: format = new_line[8] to_write.append(format) for sample in samples: broke_samp = sample_break(format, sample) # Work up the SAMPLE section. # GT:DP:AD:RO:QR:AO:QA:GL # 0/1:1206:597,608:597:23045:608:23566:-1753.83,0,-1708.68:0.5045643:0.542 new_samp = [] for field in format.split(':'): if field == 'GT': if 'GL' in broke_samp: new_field = new_gt(broke_samp, gl_ind, 'GL') elif 'PL' in broke_samp: new_field = new_gt(broke_samp, gl_ind, 'PL') else: if broke_samp['GT'] != '1/1': new_field = '0/1' else: new_field = '1/1' elif field == 'AD' or field == 'F1R2' or field == 'F2R1' or field == 'MBQ' or field == 'MFRL': this_samp_ad_ref = broke_samp[field].split(',')[0] this_samp_ad_alt = broke_samp[field].split(',')[i] new_field = ','.join([this_samp_ad_ref, this_samp_ad_alt]) elif field == 'AO' or field == 'QA' or field == 'AF': new_field = broke_samp[field].split(',')[i-1] elif field == 'GL' or field == 'PL': new_field = collect_gls(gl_ind, broke_samp, field) else: new_field = broke_samp[field] new_samp.append(new_field) to_write.append(':'.join(new_samp)) except: print("Something went wrong") handle_out_vcf.write('\t'.join(to_write)) handle_out_vcf.write('\n') else: handle_out_vcf.write(line) else: handle_out_vcf.write(line) handle_out_vcf.close()
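A small, hedged illustration of the '*' re-anchoring step above, isolated from the rest of the record handling: when an ALT allele is the spanning-deletion placeholder '*', the record is shifted one base left and both REF and ALT are anchored on the preceding reference base. The reference file name and coordinates are hypothetical.

import pysam

ref_fa = pysam.FastaFile("reference.fa")  # hypothetical reference FASTA

chrom, pos, ref, alt = "chr1", "1000", "A", "*"
if alt == '*':
    # fetch() is 0-based while VCF POS is 1-based, hence the -2/-1 window
    # for the base immediately to the left of POS.
    extra_base = ref_fa.fetch(chrom, int(pos) - 2, int(pos) - 1)
    pos = str(int(pos) - 1)   # shift the record one base left
    ref = extra_base + ref    # left-anchor REF on that base
    alt = extra_base          # the '*' allele becomes the anchor base alone

print(chrom, pos, ref, alt)   # e.g. chr1 999 <base>A <base>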