def getphastcons(kmerpos, phastconsbed):
    # kmerpos = {age : {location : [[chrm, kmerstart, kmerstop, strand]]}}
    phastconsdict = {}  # {age : {location : {binnumber : [phastconsvalue1, phastconsvalue2, ...]}}}
    # Bin 1 is 100 bp upstream of kmerstart. For a 4mer, the kmer occupies bins 101-104,
    # and bins 105-205 are the 100 bp downstream of kmerstop.
    phastconstabix = pysam.Tabixfile(phastconsbed)
    for age in kmerpos:
        phastconsdict[age] = {}
        for location in kmerpos[age]:
            phastconsdict[age][location] = {}
            for kmer in kmerpos[age][location]:
                chrm, strand = kmer[0], kmer[3]
                kmerstart, kmerstop = int(kmer[1]), int(kmer[2])
                phastconsscores = {}  # {windowbin : score}
                windowstart = kmerstart - 100
                windowend = kmerstop + 100
                try:
                    for bed in phastconstabix.fetch(chrm, windowstart, windowend,
                                                    parser=pysam.asBed()):
                        # bin numbering runs 5' to 3' relative to the kmer strand
                        if strand == '+':
                            windowbin = str(int(bed.start) - windowstart)
                        elif strand == '-':
                            windowbin = str(windowend - int(bed.start))
                        else:
                            continue
                        phastconsscores[windowbin] = float(bed.name)
                except ValueError:
                    print('WARNING: problem with {0}:{1}-{2}:{3}.'.format(
                        chrm, kmerstart, kmerstop, strand))
                # record any bases in the UTR that had phastcons scores
                for windowbin in phastconsscores:
                    if windowbin not in phastconsdict[age][location]:
                        phastconsdict[age][location][windowbin] = [phastconsscores[windowbin]]
                    else:
                        phastconsdict[age][location][windowbin].append(phastconsscores[windowbin])
    return phastconsdict
def getphastcons(phastconsbed, bpprobdict):
    phastconsdict = {}  # {oligoname : {five_prime_offset : [bpprob, phastconsscore]}}
    # bpprobdict = {oligoname : [{five_prime_offset : bpprob}, [chrm, start, stop, strand]]}
    phastconstabix = pysam.Tabixfile(phastconsbed)
    for oligo in bpprobdict:
        if len(bpprobdict[oligo]) == 1:
            # no match for it in the gff (so it has no start/stop coords); skip it
            continue
        chrm, start, stop, strand = bpprobdict[oligo][1]
        try:
            for bed in phastconstabix.fetch(chrm, start - 1, stop - 1,
                                            parser=pysam.asBed()):
                if not (bed.start >= start and bed.end <= stop):
                    continue
                phastconsscore = float(bed.name)
                if strand == '+':
                    # the first nt of the gff region is five_prime_offset 4
                    fpo = int((bed.start - start) + 4)
                elif strand == '-':
                    fpo = int((stop - bed.start) + 3)
                else:
                    continue
                if oligo not in phastconsdict:
                    phastconsdict[oligo] = {}
                # discount anything near a splice site
                if fpo not in phastconsdict[oligo] and 10 <= fpo <= 90:
                    bpprob = bpprobdict[oligo][0][fpo]
                    phastconsdict[oligo][fpo] = [bpprob, phastconsscore]
        except ValueError:
            print('WARNING: problem with {0}.'.format(oligo))
    return phastconsdict
def main():
    print(pileup_file)
    # open the pileup once; re-opening it for every VCF hit is needlessly slow
    pileup_tbx = pysam.TabixFile(pileup_file)
    # cycle through each gene in region file
    for region_file in os.listdir(region_dir):
        with open(region_dir + region_file, 'r') as region_bed:
            for line in region_bed:
                line = line.rstrip().split()
                try:
                    line_chrm = int(line[0])
                except ValueError:
                    line_chrm = line[0]
                line_start = int(line[1])
                line_end = int(line[2])
                # cycle through each VCF per gene
                for vcf_file in os.listdir(VCF_dir):
                    # only target the bgzipped VCF itself, not its index
                    if ".gz" in vcf_file and ".tbi" not in vcf_file:
                        vcf_tbx = pysam.TabixFile(VCF_dir + vcf_file)
                        for vcf_hit in vcf_tbx.fetch(line_chrm, line_start, line_end,
                                                     parser=pysam.asBed()):
                            # store interesting values from vcf file
                            try:
                                vcf_chrm = int(vcf_hit[0])
                            except ValueError:
                                vcf_chrm = vcf_hit[0]
                            vcf_pos = int(vcf_hit[1])
                            vcf_ref = vcf_hit[3]
                            vcf_alt = vcf_hit[4]
                            vcf_genotype = vcf_hit[9].split(':')[0]
                            # convert '0/1' to ['A','G']
                            allele_genotype = get_allele_genotype(
                                vcf_genotype, vcf_ref, vcf_alt)
                            # find matching hits in pileup file
                            for pileup_hit in pileup_tbx.fetch(vcf_chrm, vcf_pos,
                                                               vcf_pos + 1,
                                                               parser=pysam.asBed()):
                                pileup_ref = pileup_hit[3]
                                pileup_read_count = float(pileup_hit[4])
                                pileup_base_read = pileup_hit[5]
                                ref_matches = find_ref_matches(pileup_base_read)
                                alt_matches = find_alt_matches(pileup_base_read,
                                                               allele_genotype)
                                try:
                                    match_contribution = (ref_matches + alt_matches) / pileup_read_count
                                except ZeroDivisionError:
                                    continue
                                print('\t'.join([str(vcf_chrm), str(vcf_pos),
                                                 str(vcf_pos + 1),
                                                 str(match_contribution),
                                                 region_file.split('.')[0]]))
def slidingwindowmedian(coords, tbx):
    # Given a set of exonic coordinates (one or more exons), slide a window across the
    # joined exons, taking the median score of each window and recording it.
    # coords is the value of exoniccoords for one gene key,
    # e.g. {chr2 : [[123123, 131232], [134343, 145223]]}
    medianwindowscores = []
    chrm = list(coords.keys())[0]
    joinedexoncoords = []  # every nt that is exonic, joined together
    windowsize = 100
    slidesize = 20
    for exon in coords[chrm]:
        joinedexoncoords += list(range(exon[0], exon[1] + 1))
    if len(joinedexoncoords) < windowsize:
        # too short for a sliding window; take one median over all exonic nt
        scores = []
        for coord in joinedexoncoords:
            for row in tbx.fetch(chrm, coord, coord + 1, parser=pysam.asBed()):
                scores.append(float(row.score))
        return [np.median(scores)]
    currentind = 0
    while currentind + windowsize <= len(joinedexoncoords):
        currentwindowcoords = joinedexoncoords[currentind:currentind + windowsize]
        # Break this window up into chunks of consecutive integers
        # https://stackoverflow.com/questions/2361945/detecting-consecutive-integers-in-a-list
        consecutivechunks = []
        for k, g in groupby(enumerate(currentwindowcoords), lambda ix: ix[0] - ix[1]):
            consecutivechunk = list(map(itemgetter(1), g))
            consecutivechunks.append([consecutivechunk[0], consecutivechunk[-1]])
        currentwindowscores = []
        for chunk in consecutivechunks:
            for row in tbx.fetch(chrm, chunk[0], chunk[1], parser=pysam.asBed()):
                currentwindowscores.append(float(row.score))
        if len(currentwindowscores) == 0:
            # no coords in this window had a score
            medianwindowscores.append('NA')
        else:
            medianwindowscores.append(np.median(currentwindowscores))
        currentind += slidesize
    return medianwindowscores
def iterate_bed(bed_file, merge_intervals): if merge_intervals: contig, start, end = None, None, None for bed in bed_file.fetch(parser=pysam.asBed()): if contig != bed.contig: if contig is not None: yield contig, start, end contig = bed.contig start, end = bed.start, bed.end end = bed.end yield contig, start, end else: for bed in bed_file.fetch(parser=pysam.asBed()): yield bed.contig, bed.start, bed.end
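# A minimal way to drive iterate_bed(); "regions.bed.gz" is a hypothetical
# bgzip-compressed, tabix-indexed BED. With merge_intervals=True the generator
# yields one interval per contig, spanning the first start to the last end
# seen for that contig.
import pysam

bed = pysam.TabixFile("regions.bed.gz")
for contig, start, end in iterate_bed(bed, merge_intervals=True):
    print(contig, start, end)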
def _run(self, _config, temp): def keyfunc(bed): return (bed.contig, bed.name, bed.start) fastafile = pysam.Fastafile(self._reference) seqs = collections.defaultdict(list) with open(self._intervals) as bedfile: intervals = text.parse_lines_by_contig(bedfile, pysam.asBed()).items() for (contig, beds) in sorted(intervals): beds.sort(key=keyfunc) for (gene, gene_beds) in itertools.groupby(beds, lambda x: x.name): gene_beds = tuple(gene_beds) for bed in gene_beds: seqs[(contig, gene)].append( fastafile.fetch(contig, bed.start, bed.end)) seq = "".join(seqs[(contig, gene)]) if any((bed.strand == "-") for bed in gene_beds): assert all((bed.strand == "-") for bed in gene_beds) seq = sequences.reverse_complement(seq) seqs[(contig, gene)] = seq temp_file = os.path.join(temp, "sequences.fasta") with open(temp_file, "w") as out_file: for ((_, gene), sequence) in sorted(seqs.items()): fasta.print_fasta(gene, sequence, out_file) move_file(temp_file, self._outfile)
def _stat_areas_of_interest(cls, prefixes):
    """Returns (size, number of named intervals, total number of intervals)
    for a set of areas of interest."""
    areas_of_interest = {}
    for (prefix_name, prefix) in prefixes.items():
        prefix_label = prefix.get("Label", prefix_name)
        for (aoi_name, aoi_filename) in prefix.get("AreasOfInterest", {}).items():
            count, names, size = 0, set(), 0
            with open(aoi_filename) as handle:
                parser = pysam.asBed()
                for line in handle:
                    bed = parser(line, len(line))
                    names.add(bed.name if len(bed) >= 4 else (bed.contig + "*"))
                    size += bed.end - bed.start
                    count += 1
            areas_of_interest[(prefix_name, aoi_name)] = {
                "Size": size,
                "NFeatures": len(names),
                "NIntervals": count,
                "Genome": prefix["Name"],
                "Name": aoi_name,
                "Label": "%s:%s" % (prefix_label, aoi_name),
                "Path": aoi_filename
            }
    return areas_of_interest
def main():
    pileup_tbx = pysam.TabixFile(pileup_file)
    # cycle through each gene in region file
    for region_file in os.listdir(region_dir):
        with open(region_dir + region_file, 'r') as region_bed:
            for line in region_bed:
                line = line.rstrip().split()
                try:
                    line_chrm = int(line[0])
                except ValueError:
                    line_chrm = line[0]
                line_start = int(line[1])
                line_end = int(line[2])
                for pileup_hit in pileup_tbx.fetch(line_chrm, line_start, line_end,
                                                   parser=pysam.asBed()):
                    pileup_chrm = pileup_hit[0]
                    pileup_start = pileup_hit[1]
                    pileup_end = pileup_hit[2]
                    pileup_base_read = pileup_hit[6]
                    quality_score = calculate_quality(pileup_base_read)
                    print('\t'.join([str(pileup_chrm), str(pileup_start),
                                     str(pileup_end), str(quality_score),
                                     region_file.split('.')[0]]))
def combineMergedIntervals(bedfiles): '''combine intervals in a collection of bed files. Overlapping intervals between tracks are merged. Algorithm: 1. collect all intervals in all tracks into a single track 2. merge overlapping intervals 3. report all intervals that overlap with an interval in each track. ''' # get all intervals data_per_contig = collections.defaultdict(list) for bedfile in bedfiles: for contig in bedfile.contigs: i = [] for bed in bedfile.fetch(contig, parser=pysam.asBed()): i.append((bed.start, bed.end)) data_per_contig[contig].extend(i) # merge intervals for contig in list(data_per_contig.keys()): data_per_contig[contig] = Intervals.combine(data_per_contig[contig]) # filter intervals - take only those present in all bedfiles for contig, data in sorted(data_per_contig.items()): for start, end in data: if isContainedInAll(contig, start, end, bedfiles): yield contig, start, end
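# Both combineMergedIntervals (above) and combineUnmergedIntervals (below)
# rely on an isContainedInAll() helper that is not shown in these examples.
# This is only a plausible sketch, assuming "contained" means overlapping at
# least one record in every tabix-indexed track:
def isContainedInAll(contig, start, end, bedfiles):
    for bedfile in bedfiles:
        try:
            overlaps = bedfile.fetch(contig, start, end, parser=pysam.asBed())
            if not any(True for _ in overlaps):
                return False
        except ValueError:  # contig absent from this track's index
            return False
    return True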
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("--genotype", help="Tabix indexed pileup file.",
                        required=True)
    parser.add_argument("--intervals", help="BED file.", required=True)
    parser.add_argument("--padding", type=int, default=10,
                        help="Number of bases to expand intervals, when "
                             "filtering based on adjacent indels "
                             "[%(default)s]")
    parser.add_argument("--min-distance-to-indels", type=int, default=5,
                        help="Variants closer than this distance from indels "
                             "are filtered [%(default)s].")
    args = parser.parse_args(argv)

    genotype = pysam.Tabixfile(args.genotype)
    with open(args.intervals) as bed_file:
        intervals = text.parse_lines_by_contig(bed_file, pysam.asBed())

    for (_, beds) in sorted(intervals.items()):
        for (name, sequence) in build_genes(args, genotype, beds):
            FASTA(name, None, sequence).write(sys.stdout)

    return 0
def fetchProbeFragments(probe_bed, digest_bed, outfile, lookup_out): digest_fragments = pysam.TabixFile(digest_bed) bed = Bed.Bed() with IOTools.openFile(outfile, "w") as outf, \ IOTools.openFile(lookup_out,"w") as lookup: lookup.write("probe\tfragment\n") for probe in Bed.iterator(IOTools.openFile(probe_bed)): frag = digest_fragments.fetch(probe.contig, probe.start, probe.end, parser=pysam.asBed()) frag = list(frag) if not len(frag) == 1: E.warn("%i fragments found for probe %s, skipping" % (len(frag), probe.name)) continue frag = frag[0] bed.start = frag.start bed.end = frag.end bed.contig = frag.contig bed["name"] = probe.name bed["score"] = "." bed["strand"] = "+" lookup.write("%s\t%s\n" % (probe.name, frag.name)) outf.write(str(bed) + "\n")
def load_gap_intervals(gap_file): if gap_file is None: return [] logger.info("Loading the gaps in the genome from %s" % gap_file) with open(gap_file) as gap_file_fd: gap_intervals = [SVInterval(it.contig, it.start, it.end, it.name, "gap") for it in pysam.tabix_file_iterator(gap_file_fd, parser=pysam.asBed())] return merge_intervals(gap_intervals)
def intersect(gff, tbx): phastconsvalues = [] coords = [] #nested list of [chrm, start, stop] for each line in tempgff utrexonlengths = [] #lengths of all utr exons with open(gff, 'r') as infh: for line in infh: line = line.strip().split('\t') chrm, start, stop = line[0], int(line[3]), int(line[4]) coords.append([chrm, start, stop]) for coord in coords: utrexonlengths.append(coord[2] - coord[1]) for row in tbx.fetch(coord[0], coord[1], coord[2], parser=pysam.asBed()): score = float(row.score) phastconsvalues.append(score) medphastcons = np.median(phastconsvalues) #Check to see if we had scores for at least some fraction of the exonic nt utrlength = sum(utrexonlengths) if len(phastconsvalues) >= (utrlength * 0.5): return medphastcons else: return None
def _bed_getter(bedfile, contig, start=0, end=None, strand=".", dtype="uint16"):
    '''Get crosslink profiles from tabix indexed bedGraph/Bed'''

    # check the file contains some data for the requested contig
    if contig not in bedfile.contigs:
        return pd.Series(dict(), dtype=dtype)

    # fetch the records from the specified region
    crosslinks = bedfile.fetch(contig, start, end, parser=pysam.asBed())

    profile = dict()
    check_sum = 0

    for base in crosslinks:
        try:
            correct_strand = strand == "." or base.strand == strand
        except AttributeError:
            correct_strand = True

        if correct_strand:
            profile[float(base.start)] = int(base.score)
            check_sum += int(base.score)

    profile = pd.Series(dict(profile), dtype=dtype)

    # if not check_sum == profile.sum():
    #     raise OverflowError("Check sum failed (%i = %i). Possibly counts "
    #                         "exceed specified dtype. Use bigger dtype"
    #                         % (check_sum, profile.sum()))

    return profile
def getProbeFragments(probe_bed, digest_bed, outfile, lookup_out):
    # First find the length of the restriction enzyme cut, required to obtain
    # the start and end coordinates from the pregenerated file.
    # First iteration, no comparison
    first_iteration = True
    length_RE_cut = 0
    last_bed = None

    for bed_digest in Bed.iterator(IOTools.openFile(digest_bed)):
        if first_iteration:
            first_iteration = False
        else:
            # If they are in the same contig they can be compared
            if bed_digest.contig == last_bed.contig:
                length_RE_cut = bed_digest.start - last_bed.end
                break
        last_bed = bed_digest

    digest_fragments = pysam.TabixFile(digest_bed)
    bed = Bed.Bed()
    with IOTools.openFile(outfile, "w") as outf, \
         IOTools.openFile(lookup_out, "w") as lookup:

        lookup.write("probe\tfragment\n")

        for probe in Bed.iterator(IOTools.openFile(probe_bed)):
            frag = digest_fragments.fetch(probe.contig,
                                          probe.start,
                                          probe.end,
                                          parser=pysam.asBed())
            frag = list(frag)
            if len(frag) != 1:
                E.warn("%i fragments found for probe %s, skipping" %
                       (len(frag), probe.name))
                continue

            frag = frag[0]
            # The restriction enzyme cut on the left side of the fragment
            # is the end site of the last restriction enzyme fragment + 1
            # (+1 because, according to the manual, coordinates are specified
            # in 1-origin for the bed start.)
            bed.start = frag.start - length_RE_cut + 1
            bed.end = frag.end + length_RE_cut
            bed.contig = frag.contig
            bed["name"] = probe.name
            bed["score"] = "."
            bed["strand"] = "+"

            lookup.write("%s\t%s\n" % (probe.name, frag.name))
            outf.write(str(bed) + "\n")
def generate_from_bed(bam_file, bed_file, stepper="nofilter"): for bed in bed_file.fetch(parser=pysam.asBed()): for v in bam_file.pileup(bed.contig, bed.start, bed.end, stepper=stepper, truncate=True): yield v
def getoligoscore(tbx, chrm, start, stop): scores = [] for row in tbx.fetch(chrm, start, stop, parser=pysam.asBed()): score = float(row.score) scores.append(score) return len(scores), scores
def generate_from_bed(bam_file, bed_file, **kwargs): for bed in bed_file.fetch(parser=pysam.asBed()): for v in bam_file.pileup(bed.contig, bed.start, bed.end, **kwargs, truncate=True): yield v
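# A usage sketch for either generate_from_bed() variant above; the file names
# are hypothetical, and the BAM and BED are assumed to be indexed. Each
# yielded value is a pysam PileupColumn, truncated to the BED interval.
import pysam

bam = pysam.AlignmentFile("sample.bam")
bed = pysam.TabixFile("regions.bed.gz")
for column in generate_from_bed(bam, bed, stepper="nofilter"):
    print(column.reference_pos, column.nsegments)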
def filter_bam(args, bcd): with open(args.output, 'w') as o: with gzip.open(args.fragments) as f: tbx = pysam.tabix_iterator(f, pysam.asBed()) for line in tbx: if line.name in bcd: o.write("{}\n".format(str(line))) return 0
def __init__(self, filename, **kwargs): filename = str(filename) if not filename.endswith('.gz'): if os.path.exists(filename + '.gz'): filename += '.gz' else: filename = self.compress(filename, create_index=True) super().__init__(filename, parser=pysam.asBed(), **kwargs)
def __init__(self, file_path): file_path = str(file_path) if not file_path.endswith('.gz'): if os.path.exists(file_path + '.gz'): file_path += '.gz' else: file_path = self.compress(file_path) super().__init__(file_path, parser=pysam.asBed())
def testRead(self): for x, r in enumerate(self.tabix.fetch(parser=pysam.asBed())): c = self.compare[x] self.assertEqual("\t".join(c), str(r)) self.assertEqual(list(c), list(r)) self.assertEqual(c[0], r.contig) self.assertEqual(int(c[1]), r.start) self.assertEqual(int(c[2]), r.end)
def threeUTRmetagene(gff, clusters):
    gff_fn = gff
    db_fn = os.path.basename(gff_fn) + '.db'
    if not os.path.isfile(db_fn):
        gffutils.create_db(gff_fn, db_fn)
    db = gffutils.FeatureDB(db_fn)
    clustertabix = pysam.Tabixfile(clusters)
    number_of_UTRs = float(len(list(db.features_of_type('3\'UTR'))))
    three_prime_UTRs = db.features_of_type('3\'UTR')
    binhits = {}
    bindensities = {}
    for UTR in three_prime_UTRs:
        binnumber = 1
        lowerpercent = 0
        upperpercent = 1
        UTRlength = float(UTR.stop - UTR.start)
        while upperpercent <= 100:
            windowstart = int(round((UTRlength / 100) * lowerpercent))
            windowstop = int(round((UTRlength / 100) * upperpercent))
            # count every time a cluster bed entry overlaps this sequence window
            for bed in clustertabix.fetch(str(UTR.chrom),
                                          UTR.start + windowstart,
                                          UTR.start + windowstop,
                                          parser=pysam.asBed()):
                if UTR.strand == bed.strand:
                    if binnumber in binhits:
                        binhits[binnumber] += 1
                    else:
                        binhits[binnumber] = 1
            lowerpercent += 1
            upperpercent += 1
            binnumber += 1
    os.remove(db_fn)
    # number of hits in every unpopulated bin is 0
    for x in range(1, 101):
        if x not in binhits:
            binhits[x] = 0
    for binnumber in binhits:
        bindensities[binnumber] = binhits[binnumber] / number_of_UTRs
    return bindensities
def locate_fragments(data: Union[AnnData, MuData], fragments: str, return_fragments: bool = False):
    """
    Parse fragments file and add a variable to access it to the .uns["files"]["fragments"]

    Fragments file is never read into memory, and the connection to the file is
    closed upon function completion (unless return_fragments is True).

    Parameters
    ----------
    data
        AnnData object with peak counts or multimodal MuData object with 'atac' modality.
    fragments
        A path to the compressed tab-separated fragments file (e.g. atac_fragments.tsv.gz).
    return_fragments
        Whether to return the Tabix connection to the fragments file. False by default.
    """
    frag = None
    try:
        if isinstance(data, AnnData):
            adata = data
        elif isinstance(data, MuData) and "atac" in data.mod:
            adata = data.mod["atac"]
        else:
            raise TypeError("Expected AnnData or MuData object with 'atac' modality")

        try:
            import pysam
        except ImportError:
            raise ImportError(
                "pysam is not available. It is required to work with the fragments file. \
Install pysam from PyPI (`pip install pysam`) \
or from GitHub (`pip install git+https://github.com/pysam-developers/pysam`)"
            )

        # Here we make sure we can create a connection to the fragments file
        frag = pysam.TabixFile(fragments, parser=pysam.asBed())

        if "files" not in adata.uns:
            adata.uns["files"] = OrderedDict()
        adata.uns["files"]["fragments"] = fragments

        if return_fragments:
            return frag

    except Exception as e:
        print(e)

    finally:
        if frag is not None and not return_fragments:
            # The connection has to be closed
            frag.close()
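# A brief usage sketch; the path, the region, and `adata` (an AnnData object
# with peak counts) are assumptions. With the asBed parser, the fourth column
# of a 10x-style fragments file (the cell barcode) is exposed as .name.
frag = locate_fragments(adata, "atac_fragments.tsv.gz", return_fragments=True)
for fr in frag.fetch("chr1", 0, 10000):
    print(fr.contig, fr.start, fr.end, fr.name)
frag.close()  # with return_fragments=True, the caller closes the connection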
def main(path_input): # read the original fragments file df = pd.read_csv(path_input, sep="\t", header=None, names=["chr", "start", "end", "cb", "counts"]) # generate a table describing each chromosome's length chrs = "chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chrX chrY chrM" ends = "197195432 181748087 159599783 155630120 152537259 149517037 152524553 131738871 124076172 129993255 121843856 121257530 120284312 125194864 103494974 98319150 95272651 90772031 61342430 166650296 15902555 16299" chrs = chrs.split(" ") ends = ends.split(" ") df_contigs = pd.DataFrame({"chromosome": chrs, "len": ends}) df_contigs.len = df_contigs.len.astype(int) # find problematic fragments that exceed contig size (due to cell ranger bug) problematics = [] with pysam.Tabixfile(path_input) as tbx: for i, contig in df_contigs[ df_contigs.chromosome != "chrM"].iterrows(): for row in tbx.fetch(contig.chromosome, contig.len - 1, contig.len, parser=pysam.asBed()): problematics.append(row) # write to problematics.csv df_problematics = pd.DataFrame( map(lambda item: (item.contig, item.start, item.end, item.name), problematics)) df_problematics.to_csv("problematics.csv", sep="\t", header=False, index=False) # iterate through problematic fragments and correct the coordinates for p in bar(problematics): # retrieve the proper end of the contig proper_end = df_contigs.loc[df_contigs.chromosome == p.contig, "len"].values[0] # correct df.loc[(df.chr == p.contig) & (df.start == p.start) & (df.cb == p.name), "end"] = proper_end # write to disk df.to_csv("fragments.tsv", sep="\t", header=False, index=False)
def _collect_and_validate_regions(regions):
    contigs = _collect_fasta_contigs(regions)
    parser = pysam.asBed()
    sequences = set()
    with open(regions["BED"]) as bedhandle:
        for (line_num, line) in enumerate(bedhandle):
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            try:
                bed = parser(line, len(line))
                # Force evaluation of (lazily parsed) properties
                bed_start = bed.start
                bed_end = bed.end
            except ValueError as error:
                raise MakefileError(
                    ("Error parsing line %i in regions file:\n"
                     "  Path = %r\n  Line = %r\n\n%s")
                    % (line_num + 1, regions["BED"], line, error))

            if len(bed) < 6:
                url = "http://genome.ucsc.edu/FAQ/FAQformat.html#format1"
                name = repr(bed.name) if len(bed) > 3 else "unnamed record"
                raise MakefileError(("Region at line #%i (%s) does not "
                                     "contain the expected number of fields; "
                                     "the first 6 fields are required. C.f. "
                                     "definition at\n   %s\n\nPath = %r")
                                    % (line_num, name, url, regions["BED"]))

            contig_len = contigs.get(bed.contig)
            if contig_len is None:
                raise MakefileError(("Regions file contains contig not found "
                                     "in reference:\n  Path = %r\n  Contig = "
                                     "%r\n\nPlease ensure that all contig "
                                     "names match the reference names!")
                                    % (regions["BED"], bed.contig))
            elif not (0 <= int(bed_start) < int(bed_end) <= contig_len):
                raise MakefileError(("Regions file contains invalid region:\n"
                                     "  Path = %r\n  Contig = %r\n"
                                     "  Start = %s\n  End = %s\n\n"
                                     "Expected 0 <= Start < End <= %i!")
                                    % (regions["BED"], bed.contig, bed.start,
                                       bed.end, contig_len))
            elif bed.strand not in "+-":
                raise MakefileError(
                    ("Regions file contains invalid region: "
                     "  Path = %r\n  Line = %i\n  Name = %r"
                     "\nStrand is %r, expected '+' or '-'.")
                    % (regions["BED"], line_num, bed.name, bed.strand))

            sequences.add(bed.name)
def getKmerPhastcons(fastadict, k, phastconsbed):
    k = int(k)
    phastconsdict = {}  # {kmer : [mean phastcons score of the bases in each kmer instance]}
    phastconsaveragedict = {}  # {kmer : mean of scores in phastconsdict}
    UTRcounter = 0
    phastconstabix = pysam.Tabixfile(phastconsbed)
    for UTR in fastadict:
        UTRcounter += 1
        if UTRcounter % 50 == 0:
            sys.stderr.write(
                'Determining motif conservation in UTR {0} of {1}...\n'.format(
                    UTRcounter, len(fastadict)))
        UTRsequence = fastadict[UTR]
        UTR = UTR.replace(';', '\t').split('\t')
        ID, chrm, strand = UTR[0], UTR[1], UTR[4]
        start, stop = int(UTR[2]), int(UTR[3])
        for i in range(len(UTRsequence) - k + 1):
            mousekmer = UTRsequence[i:i + k]
            if strand == '+':
                mousekmerstart = start + i
                mousekmerstop = start + i + k - 1
            elif strand == '-':
                mousekmerstart = stop - i - k + 1
                mousekmerstop = stop - i
            kmerscores = []  # phastcons scores for every bp of this kmer
            # Remember... input gff coords are 1-based and the phastcons bed is 0-based
            for bed in phastconstabix.fetch(str(chrm), mousekmerstart - 1,
                                            mousekmerstop, parser=pysam.asBed()):
                kmerscores.append(float(bed.name))
            if len(kmerscores) == k:  # every base in the kmer had a score
                kmeraveragescore = sum(kmerscores) / float(len(kmerscores))
                if mousekmer not in phastconsdict:
                    # kmer not yet in dictionary; initialize its entry
                    phastconsdict[mousekmer] = [kmeraveragescore]
                else:
                    phastconsdict[mousekmer].append(kmeraveragescore)
    for kmer in phastconsdict:
        phastconsaveragedict[kmer] = np.mean(phastconsdict[kmer])
    return phastconsaveragedict
def read_bed_records(filename):
    """Reads a BED file (i.e. for a set of regions of interest), and returns
    a list of records in file order; each record exposes the contig name,
    the start position, and the end position."""
    regions = []
    bed_parser = pysam.asBed()
    with open(filename) as bed_file:
        for line in bed_file:
            line = line.strip()
            if not line or line.startswith('#'):
                continue

            regions.append(bed_parser(line, len(line)))

    return regions
def filter_bam(args, bcd):
    reads = {}
    with gzip.open(args.fragments) as f:
        tbx = pysam.tabix_iterator(f, pysam.asBed())
        for line in tbx:
            if line.name in bcd:
                # group fragments by barcode and replicate
                key = bcd[line.name] + "_rep" + line.name.split("_")[-2]
                reads.setdefault(key, []).append(str(line))
    return reads
def get_reads(tbx, chrm, start, end): reads = [] # pull out the quality scores for row in tbx.fetch(chrm, start, end, parser=pysam.asBed()): c = row[0] s = row[1] e = row[2] r = row[5] cse = [c, s, e] curr_row = [cse, r] reads.append(curr_row) #for r in reads: print (r) return reads
def combineUnmergedIntervals(foreground, background):
    '''combine intervals in a collection of bed files.

    Only intervals in the first track are reported.

    Algorithm:

    1. report all intervals in the first track that overlap with an
       interval in every other track.
    '''
    for bed in foreground.fetch(parser=pysam.asBed()):
        if isContainedInAll(bed.contig, bed.start, bed.end, background):
            yield bed
def getphastcons(kmerpos, phastconsbed, outfile, protein, RBNSstate):
    # kmerpos = {age : {location : [[chrm, kmerstart, kmerstop, strand]]}}
    phastconsdict = {}  # {age : {location : [meanphastcons of oligo1 around motif, meanphastcons of oligo2 around motif]}}
    phastconstabix = pysam.Tabixfile(phastconsbed)
    for age in kmerpos:
        phastconsdict[age] = {}
        for location in kmerpos[age]:
            phastconsdict[age][location] = []
            for kmer in kmerpos[age][location]:
                chrm, strand = kmer[0], kmer[3]
                kmerstart, kmerstop = int(kmer[1]), int(kmer[2])
                phastconsscores = []
                windowstart = kmerstart - 25
                windowend = kmerstop + 25
                try:
                    for bed in phastconstabix.fetch(chrm, windowstart, windowend,
                                                    parser=pysam.asBed()):
                        phastconsscores.append(float(bed.name))
                except ValueError:
                    print('WARNING: problem with {0}:{1}-{2}:{3}.'.format(
                        chrm, kmerstart, kmerstop, strand))
                if phastconsscores:
                    # some bases in the region had phastcons scores
                    phastconsdict[age][location].append(mean(phastconsscores))
    if not os.path.isfile(outfile):
        with open(outfile, 'w') as f:
            f.write('\t'.join(['age', 'location', 'protein', 'RBNSstate',
                               'meanphastcons']) + '\n')
    with open(outfile, 'a') as f:
        for age in phastconsdict:
            for location in phastconsdict[age]:
                for score in phastconsdict[age][location]:
                    f.write('\t'.join([age, location, protein, RBNSstate,
                                       str(score)]) + '\n')
def _bed_getter(bedfile, contig, start=0, end=None, strand=".", dtype="uint16"):
    '''Get crosslink profiles from tabix indexed bedGraph/Bed'''

    # check the file contains some data for the requested contig
    if contig not in bedfile.contigs:
        return pd.Series(dict(), dtype=dtype)

    # fetch the records from the specified region
    crosslinks = bedfile.fetch(contig, start, end, parser=pysam.asBed())

    profile = dict()
    check_sum = 0

    for base in crosslinks:
        try:
            correct_strand = strand == "." or base.strand == strand
        except (AttributeError, KeyError):
            correct_strand = True

        if correct_strand:
            try:
                profile[float(base.start)] = int(base.score)
                check_sum += int(base.score)
            except (AttributeError, KeyError):
                # records without a score column count as 1
                profile[float(base.start)] = 1
                check_sum += 1

    if len(profile) == 0:
        profile = pd.Series(profile, dtype=dtype, index=pd.Index([], dtype="float"))
    else:
        profile = pd.Series(dict(profile), dtype=dtype)

    # if not check_sum == profile.sum():
    #     raise OverflowError("Check sum failed (%i = %i). Possibly counts "
    #                         "exceed specified dtype. Use bigger dtype"
    #                         % (check_sum, profile.sum()))

    return profile
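# A usage sketch for _bed_getter(); "crosslinks.bed.gz" is a hypothetical
# bgzip-compressed, tabix-indexed crosslink track.
import pysam

crosslinks = pysam.TabixFile("crosslinks.bed.gz")
profile = _bed_getter(crosslinks, "chr1", start=0, end=100000, strand="+")
print(profile.sum())  # total crosslink count over the requested window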
def read_bed_file(filename): """Parses a (gzip/bzip2 compressed) BED file, and yields a sequence of records. Comments and empty lines are skipped.""" handle = None try: handle = fileutils.open_ro(filename) parser = pysam.asBed() for record in text.parse_lines(handle, parser): # Force evaluation of (lazily parsed) properties _ = record.start _ = record.end yield record finally: if handle: handle.close()
def testWrite(self): for x, r in enumerate(self.tabix.fetch(parser=pysam.asBed())): c = self.compare[x] self.assertEqual(c, str(r).split("\t")) self.assertEqual(list(c), list(r)) r.contig = "test" self.assertEqual("test", r.contig) self.assertEqual("test", r[0]) r.start += 1 self.assertEqual(int(c[1]) + 1, r.start) self.assertEqual(str(int(c[1]) + 1), r[1]) r.end += 1 self.assertEqual(int(c[2]) + 1, r.end) self.assertEqual(str(int(c[2]) + 1), r[2])
def _get_hits(coords, annotation, parser_type):
    """Retrieve BED information, recovering if BED annotation file
    does not have a chromosome.
    """
    if parser_type == "bed":
        parser = pysam.asBed()
    elif parser_type == "vcf":
        parser = pysam.asVCF()
    elif parser_type == "tuple":
        parser = pysam.asTuple()
    elif parser_type is None:
        parser = None
    else:
        raise ValueError("Unexpected parser type: %s" % parser_type)
    chrom, start, end = coords
    try:
        hit_iter = annotation.fetch(str(chrom), start, end, parser=parser)
    # catch invalid region errors raised by ctabix
    except ValueError:
        hit_iter = []
    return hit_iter
def _run(self, _config, temp):
    def _by_name(bed):
        return bed.name

    fastafile = pysam.Fastafile(self._reference)
    seqs = collections.defaultdict(list)
    with open(self._bedfile) as bedfile:
        bedrecords = text.parse_lines_by_contig(bedfile, pysam.asBed())

    for (contig, beds) in sorted(bedrecords.items()):
        beds.sort(key=lambda bed: (bed.contig, bed.name, bed.start))

        for (gene, gene_beds) in itertools.groupby(beds, _by_name):
            gene_beds = tuple(gene_beds)
            sequence = self._collect_sequence(fastafile, gene_beds)
            seqs[(contig, gene)] = sequence

    temp_file = os.path.join(temp, "sequences.fasta")
    with open(temp_file, "w") as out_file:
        for ((_, gene), sequence) in sorted(seqs.items()):
            FASTA(gene, None, sequence).write(out_file)

    fileutils.move_file(temp_file, self._outfile)
def _stat_areas_of_interest(cls, prefixes):
    """Returns (size, number of named intervals, total number of intervals)
    for a set of areas of interest."""
    areas_of_interest = {}
    for (prefix_name, prefix) in prefixes.items():
        prefix_label = prefix.get("Label", prefix_name)
        for (roi_name, roi_filename) in prefix.get("RegionsOfInterest", {}).items():
            count, names, size = 0, set(), 0
            with open(roi_filename) as handle:
                parser = pysam.asBed()
                for line in handle:
                    bed = parser(line, len(line))
                    names.add(bed.name if len(bed) >= 4 else (bed.contig + "*"))
                    size += bed.end - bed.start
                    count += 1
            areas_of_interest[(prefix_name, roi_name)] = {
                "Size": size,
                "NFeatures": len(names),
                "NIntervals": count,
                "Genome": prefix["Name"],
                "Name": roi_name,
                "Label": "%s:%s" % (prefix_label, roi_name),
                "Path": roi_filename
            }
    return areas_of_interest
def read_intervals(filename):
    with open(filename) as bed_file:
        intervals = text.parse_lines_by_contig(bed_file, pysam.asBed())

    for (key, beds) in intervals.items():
        bed_tuples = []
        for bed in beds:
            if len(bed) < 6:
                sys.stderr.write(("ERROR: Invalid BED record '%s', must "
                                  "have at least 6 fields ...\n")
                                 % ("\\t".join(bed),))
                return None

            # Transform to a named tuple, as Pysam has a tendency to
            # segfault if you do anything wrong
            bed = list(bed)[:6]   # BED6 only
            bed[1] = int(bed[1])  # start
            bed[2] = int(bed[2])  # end
            bed[4] = int(bed[4])  # score
            bed_tuples.append(BEDTuple(*bed))
        intervals[key] = bed_tuples

    return intervals
def fetch_parsed(fn): with pysam.Tabixfile(fn) as f: return len(list(f.fetch(parser=pysam.asBed())))
def test_fetch_parsed(): """Stupid test function""" f = pysam.Tabixfile(fn_compressed) l = len( list(f.fetch( parser = pysam.asBed())) )
def iterate_file_uncompressed(fn): with open(fn) as f: return len(list(pysam.tabix_file_iterator(f, parser=pysam.asBed())))
def iterate_parsed_compressed(fn): with gzip.open(fn) as f: return len(list(pysam.tabix_iterator(f, parser=pysam.asBed())))
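# A small timeit harness to compare the three access patterns above;
# "example.bed"/"example.bed.gz" are hypothetical paths, and the compressed
# copy must be tabix-indexed for fetch_parsed() to work.
import timeit

for label, call in [
        ("fetch (indexed)", lambda: fetch_parsed("example.bed.gz")),
        ("iterate uncompressed", lambda: iterate_file_uncompressed("example.bed")),
        ("iterate compressed", lambda: iterate_parsed_compressed("example.bed.gz"))]:
    print(label, timeit.timeit(call, number=3))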
        from bx.bbi.bigwig_file import BigWigFile
        annos[anno] = BigWigFile(open(anno_files[anno]))
    except IOError:
        sys.exit("Gemini cannot open this annotation file: %s. \n"
                 "Have you installed the annotation files? If so, "
                 "have they been moved or deleted? Exiting...\n\n"
                 "For more details:\n\t"
                 "http://gemini.readthedocs.org/en/latest/content/"
                 "#installation.html\#installing-annotation-files\n"
                 % anno_files[anno])

# ## Standard access to Tabix indexed files
PARSERS = {"bed": pysam.asBed(),
           "vcf": pysam.asVCF(),
           "tuple": pysam.asTuple(),
           None: None}


def _get_hits(coords, annotation, parser_type, _parsers=PARSERS):
    """Retrieve BED information, recovering if BED annotation file
    does not have a chromosome.
    """
    try:
        parser = _parsers[parser_type]
    except KeyError:
        raise ValueError("Unexpected parser type: %s" % parser_type)
    chrom, start, end = coords
    try:
        hit_iter = annotation.fetch(str(chrom), start, end, parser=parser)
    # catch invalid region errors raised by ctabix
    except ValueError:
        hit_iter = []
    return hit_iter
def test_iterator_file_uncompressed(): f = open("windows_small.bed") l = len( list( pysam.tabix_file_iterator( f, parser = pysam.asBed() )))
def test_iterator_parsed_compressed(): f = gzip.open(fn_compressed) l = len( list( pysam.tabix_iterator( f, parser = pysam.asBed() )))