def getphastcons(kmerpos, phastconsbed):
    #kmerpos = {} # {age : {location : [[chrm, kmerstart, kmerstop, strand]]}}
    phastconsdict = {
    }  # {age : {location : {binnumber : [phastconsvalue1, phastconsvalue2, ...]}}}
    #Bin1 is 100 bp upstream of kmerstart. For a 4mer, the kmer would be bins 101-104, and bins 105-205 would be 100 bp downstream of kmerstop.
    phastconstabix = pysam.Tabixfile(phastconsbed)
    for age in kmerpos:
        phastconsdict[age] = {}
        for location in kmerpos[age]:
            phastconsdict[age][location] = {}
            for kmer in kmerpos[age][location]:
                chrm = kmer[0]
                kmerstart = int(kmer[1])
                kmerstop = int(kmer[2])
                strand = kmer[3]
                phastconsscores = {}  # {windowbin : score}
                if strand == '+':
                    windowstart = kmerstart - 100
                    windowend = kmerstop + 100
                    try:
                        for bed in phastconstabix.fetch(chrm,
                                                        windowstart,
                                                        windowend,
                                                        parser=pysam.asBed()):
                            windowbin = str(int(bed.start) - windowstart)
                            phastconsscore = float(bed.name)
                            phastconsscores[windowbin] = phastconsscore
                    except ValueError:
                        print('WARNING: problem with {0}:{1}-{2}:{3}.'.format(
                            chrm, kmerstart, kmerstop, strand))

                elif strand == '-':
                    windowstart = kmerstart - 100
                    windowend = kmerstop + 100
                    try:
                        for bed in phastconstabix.fetch(chrm,
                                                        windowstart,
                                                        windowend,
                                                        parser=pysam.asBed()):
                            windowbin = str(windowend - int(bed.start))
                            phastconsscore = float(bed.name)
                            phastconsscores[windowbin] = phastconsscore
                    except ValueError:
                        print('WARNING: problem with {0}:{1}-{2}:{3}.'.format(
                            chrm, kmerstart, kmerstop, strand))

                if phastconsscores:  #if there were any bases in the UTR that had phastcons scores
                    for windowbin in phastconsscores:
                        if windowbin not in phastconsdict[age][location]:
                            phastconsdict[age][location][windowbin] = [
                                phastconsscores[windowbin]
                            ]
                        else:
                            phastconsdict[age][location][windowbin].append(
                                phastconsscores[windowbin])

    return phastconsdict
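A minimal usage sketch for the kmer-window getphastcons above; the file name and the kmerpos contents are hypothetical, and the phastCons BED is assumed to be bgzip-compressed and tabix-indexed so pysam.Tabixfile can fetch from it.

import pysam  # assumed available, as in the function above

# Hypothetical input: one 4-mer on chr1 at positions 1000-1003 on the + strand,
# grouped under an "age" and "location" label as the function expects.
kmerpos = {'ancient': {'UTR3': [['chr1', 1000, 1003, '+']]}}

# 'phastcons.bed.gz' is a placeholder for a bgzipped, tabix-indexed phastCons BED
scores_by_bin = getphastcons(kmerpos, 'phastcons.bed.gz')
for windowbin, scores in sorted(scores_by_bin['ancient']['UTR3'].items(),
                                key=lambda kv: int(kv[0])):
    print(windowbin, scores)
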
def getphastcons(phastconsbed, bpprobdict):
    phastconsdict = {
    }  # {oligoname : {five_prime_offset : [bpprob, phastconsscore]}}
    #bpprobdict = {} # {oligoname : [{five_prime_offset : bpprob}, [chrm, start, stop, strand]]}
    phastconstabix = pysam.Tabixfile(phastconsbed)
    for oligo in bpprobdict:
        if len(bpprobdict[oligo]) == 1:  #if there was no match for it in the gff (so it has no start/stop coords), skip it
            continue
        chrm, start, stop, strand = bpprobdict[oligo][1]
        if strand == '+':
            try:
                for bed in phastconstabix.fetch(chrm,
                                                start - 1,
                                                stop - 1,
                                                parser=pysam.asBed()):
                    if not (bed.start >= start and bed.end <= stop):
                        continue
                    phastconsscore = float(bed.name)
                    fpo = int((bed.start - start) + 4)  #the first nt of the gff region is five_prime_offset 4
                    #print chrm, start, stop, bed.start, bed.end, fpo
                    if oligo not in phastconsdict:
                        phastconsdict[oligo] = {}
                    if fpo not in phastconsdict[oligo] and 10 <= fpo <= 90:  #discount anything near a splice site
                        bpprob = bpprobdict[oligo][0][fpo]
                        phastconsdict[oligo][fpo] = [bpprob, phastconsscore]
            except ValueError:
                print('WARNING: problem with {0}.'.format(oligo))

        elif strand == '-':
            try:
                for bed in phastconstabix.fetch(chrm,
                                                start - 1,
                                                stop - 1,
                                                parser=pysam.asBed()):
                    if not (bed.start >= start and bed.end <= stop):
                        continue
                    phastconsscore = float(bed.name)
                    fpo = int((stop - bed.start) + 3)
                    #print chrm, start, stop, bed.start, bed.end, fpo
                    if oligo not in phastconsdict:
                        phastconsdict[oligo] = {}
                    if fpo not in phastconsdict[oligo] and 10 <= fpo <= 90:  #discount anything near a splice site
                        bpprob = bpprobdict[oligo][0][fpo]
                        phastconsdict[oligo][fpo] = [bpprob, phastconsscore]
            except ValueError:
                print('WARNING: problem with {0}.'.format(oligo))

    return phastconsdict
def main():
    print(pileup_file)
    # cycle through each gene in region file
    for region_file in os.listdir(region_dir):
        region_bed = open(region_dir + region_file, 'r')
        for line in region_bed:
            line = line.rstrip().split()
            try:
                line_chrm = int(line[0])
            except ValueError:
                line_chrm = line[0]
            line_start = int(line[1])
            line_end = int(line[2])

        # cycle through each VCF per gene
        for vcf_file in os.listdir(VCF_dir):
            # make sure to only target the proper vcf file
            if ".gz" in vcf_file and ".tbi" not in vcf_file:
                vcf_tbx = pysam.TabixFile(VCF_dir + vcf_file)
                for vcf_hit in vcf_tbx.fetch(line_chrm,
                                             line_start,
                                             line_end,
                                             parser=pysam.asBed()):
                    # store interesting values from vcf file
                    try:
                        vcf_chrm = int(vcf_hit[0])
                    except ValueError:
                        vcf_chrm = vcf_hit[0]
                    vcf_pos = int(vcf_hit[1])
                    vcf_ref = vcf_hit[3]
                    vcf_alt = vcf_hit[4]
                    vcf_genotype = vcf_hit[9].split(':')[0]
                    # convert '0/1' to ['A','G']
                    allele_genotype = get_allele_genotype(
                        vcf_genotype, vcf_ref, vcf_alt)

                    # find matching hits in pileup file
                    pileup_tbx = pysam.TabixFile(pileup_file)
                    for pileup_hit in pileup_tbx.fetch(vcf_chrm,
                                                       vcf_pos,
                                                       vcf_pos + 1,
                                                       parser=pysam.asBed()):
                        pileup_ref = pileup_hit[3]
                        pileup_read_count = float(pileup_hit[4])
                        pileup_base_read = pileup_hit[5]
                        ref_matches = find_ref_matches(pileup_base_read)
                        alt_matches = find_alt_matches(pileup_base_read,
                                                       allele_genotype)

                        try:
                            match_contribution = float(ref_matches +
                                                       alt_matches) / float(
                                                           pileup_read_count)
                        except ZeroDivisionError:
                            continue
                        print(
                            str(vcf_chrm) + '\t' + str(vcf_pos) + '\t' +
                            str(vcf_pos + 1) + '\t' + str(match_contribution) +
                            '\t' + str(region_file.split('.')[0]))
def slidingwindowmedian(coords, tbx):
    #Given a set of exonic coordinates (could be one or more exons), slide a window across the joined exons,
    #taking the median score of that window and recording it.
    #coords is the value of exoniccoords for one gene key
    #e.g. {chr2 : [[123123, 131232], [134343, 145223]]}

    medianwindowscores = []
    chrm = list(coords.keys())[0]
    joinedexoncoords = []  #every nt that is exonic, joined together
    windowsize = 100
    slidesize = 20
    for exon in coords[chrm]:
        joinedexoncoords += list(range(exon[0], exon[1] + 1))

    if len(joinedexoncoords) < windowsize:
        scores = []
        for coord in joinedexoncoords:
            for row in tbx.fetch(chrm, coord, coord + 1, parser=pysam.asBed()):
                scores.append(float(row.score))
        medianscore = np.mean(scores)

        return [medianscore]

    elif len(joinedexoncoords) >= windowsize:
        currentind = 0
        for ind, coord in enumerate(joinedexoncoords):
            while currentind + windowsize <= len(joinedexoncoords):
                currentwindowcoords = joinedexoncoords[currentind:currentind +
                                                       windowsize]
                #print('Current window: {0}, {1}'.format(currentwindowcoords[0], currentwindowcoords[-1]))
                #OK now break this window up into chunks of consecutive integers
                #https://stackoverflow.com/questions/2361945/detecting-consecutive-integers-in-a-list
                consecutivechunks = []
                for k, g in groupby(enumerate(currentwindowcoords),
                                    lambda ix: ix[0] - ix[1]):
                    consecutivechunk = list(map(itemgetter(1), g))
                    consecutivechunks.append(
                        [consecutivechunk[0], consecutivechunk[-1]])

                currentwindowscores = []
                for chunk in consecutivechunks:
                    #print('Current chunk: {0}, {1}'.format(chunk[0], chunk[-1]))
                    for row in tbx.fetch(chrm,
                                         chunk[0],
                                         chunk[-1],
                                         parser=pysam.asBed()):
                        currentwindowscores.append(float(row.score))

                if not currentwindowscores:  #if there were no coords in this window that had a score
                    medianwindowscores.append('NA')
                else:
                    medianwindowscore = np.mean(currentwindowscores)
                    medianwindowscores.append(medianwindowscore)
                currentind += slidesize

        return medianwindowscores
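A hedged usage sketch for slidingwindowmedian above; the score file path and coordinates are placeholders, and the function's own module is assumed to import numpy as np, itertools.groupby, and operator.itemgetter.

import pysam

# Placeholder tabix-indexed per-base score BED and two hypothetical exons on chr2
tbx = pysam.TabixFile('phastcons.scores.bed.gz')
coords = {'chr2': [[123123, 131232], [134343, 145223]]}

windowscores = slidingwindowmedian(coords, tbx)
print(len(windowscores), windowscores[:5])
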
Example #5
def iterate_bed(bed_file, merge_intervals):
    if merge_intervals:
        contig, start, end = None, None, None
        for bed in bed_file.fetch(parser=pysam.asBed()):
            if contig != bed.contig:
                if contig is not None:
                    yield contig, start, end
                contig = bed.contig
                start, end = bed.start, bed.end
            end = bed.end
        yield contig, start, end
    else:
        for bed in bed_file.fetch(parser=pysam.asBed()):
            yield bed.contig, bed.start, bed.end
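A small sketch of how iterate_bed might be driven (the BED path is a placeholder); bed_file is expected to be a pysam.TabixFile so that fetch(parser=pysam.asBed()) yields parsed records.

import pysam

# 'regions.bed.gz' stands in for any bgzipped, tabix-indexed BED file
bed_file = pysam.TabixFile('regions.bed.gz')

# With merge_intervals=True, one (contig, start, end) span is reported per contig
for contig, start, end in iterate_bed(bed_file, merge_intervals=True):
    print(contig, start, end)
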
Example #6
    def _run(self, _config, temp):
        def keyfunc(bed):
            return (bed.contig, bed.name, bed.start)

        fastafile = pysam.Fastafile(self._reference)
        seqs = collections.defaultdict(list)
        with open(self._intervals) as bedfile:
            intervals = text.parse_lines_by_contig(bedfile,
                                                   pysam.asBed()).items()
            for (contig, beds) in sorted(intervals):
                beds.sort(key=keyfunc)

                for (gene,
                     gene_beds) in itertools.groupby(beds, lambda x: x.name):
                    gene_beds = tuple(gene_beds)
                    for bed in gene_beds:
                        seqs[(contig, gene)].append(
                            fastafile.fetch(contig, bed.start, bed.end))

                    seq = "".join(seqs[(contig, gene)])
                    if any((bed.strand == "-") for bed in gene_beds):
                        assert all((bed.strand == "-") for bed in gene_beds)
                        seq = sequences.reverse_complement(seq)
                    seqs[(contig, gene)] = seq

        temp_file = os.path.join(temp, "sequences.fasta")
        with open(temp_file, "w") as out_file:
            for ((_, gene), sequence) in sorted(seqs.items()):
                fasta.print_fasta(gene, sequence, out_file)

        move_file(temp_file, self._outfile)
Example #7
def combineMergedIntervals(bedfiles):
    '''combine intervals in a collection of bed files.

    Overlapping intervals between tracks are merged.

    Algorithm:

    1. collect all intervals in all tracks into a single track
    2. merge overlapping intervals 
    3. report all intervals that overlap with an interval in each track.

    '''

    # get all intervals
    data_per_contig = collections.defaultdict(list)
    for bedfile in bedfiles:
        for contig in bedfile.contigs:
            i = []
            for bed in bedfile.fetch(contig, parser=pysam.asBed()):
                i.append((bed.start, bed.end))
            data_per_contig[contig].extend(i)

    # merge intervals
    for contig in data_per_contig.keys():
        data_per_contig[contig] = Intervals.combine(data_per_contig[contig])

    # filter intervals - take only those present in all bedfiles
    for contig, data in data_per_contig.items():
        for start, end in data:
            if isContainedInAll(contig, start, end, bedfiles):
                yield contig, start, end
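The docstring's step 2 (merging overlapping intervals) is delegated to Intervals.combine here; the standalone sketch below illustrates the same idea for (start, end) pairs, independent of that helper and not taken from the original code.

def merge_overlapping(intervals):
    """Merge overlapping or touching (start, end) intervals; input need not be sorted."""
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            # extend the previous interval instead of starting a new one
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

# e.g. merge_overlapping([(10, 20), (15, 30), (40, 50)]) -> [(10, 30), (40, 50)]
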
Example #8
 def _stat_areas_of_interest(cls, prefixes):
     """Returns (size, number of named intervals, total number of intervals)
     for a set of areas of interest."""
     areas_of_interest = {}
     for (prefix_name, prefix) in prefixes.items():
         prefix_label = prefix.get("Label", prefix_name)
         for (aoi_name, aoi_filename) in prefix.get("AreasOfInterest",
                                                    {}).items():
             count, names, size = 0, set(), 0
             with open(aoi_filename) as handle:
                 parser = pysam.asBed()
                 for line in handle:
                     bed = parser(line, len(line))
                     names.add(bed.name if len(bed) >= 4 else (bed.contig +
                                                               "*"))
                     size += (bed.end - bed.start)
                     count += 1
             areas_of_interest[(prefix_name, aoi_name)] = {
                 "Size": size,
                 "NFeatures": len(names),
                 "NIntervals": count,
                 "Genome": prefix["Name"],
                 "Name": aoi_name,
                 "Label": "%s:%s" % (prefix_label, aoi_name),
                 "Path": aoi_filename
             }
     return areas_of_interest
def main():
    pileup_tbx = pysam.TabixFile(pileup_file)

    # cycle through each gene in region file
    for region_file in os.listdir(region_dir):
        region_bed = open(region_dir + region_file, 'r')
        for line in region_bed:
            line = line.rstrip().split()
            try:
                line_chrm = int(line[0])
            except ValueError:
                line_chrm = line[0]
            line_start = int(line[1])
            line_end = int(line[2])

            for pileup_hit in pileup_tbx.fetch(line_chrm,
                                               line_start,
                                               line_end,
                                               parser=pysam.asBed()):
                pileup_chrm = pileup_hit[0]
                pileup_start = pileup_hit[1]
                pileup_end = pileup_hit[2]
                pileup_base_read = pileup_hit[6]

                quality_score = calculate_quality(pileup_base_read)
                print(
                    str(pileup_chrm) + '\t' + str(pileup_start) + '\t' +
                    str(pileup_end) + '\t' + str(quality_score) + '\t' +
                    str(region_file.split('.')[0]))
Example #10
def combineMergedIntervals(bedfiles):
    '''combine intervals in a collection of bed files.

    Overlapping intervals between tracks are merged.

    Algorithm:

    1. collect all intervals in all tracks into a single track
    2. merge overlapping intervals
    3. report all intervals that overlap with an interval in each track.

    '''

    # get all intervals
    data_per_contig = collections.defaultdict(list)

    for bedfile in bedfiles:
        for contig in bedfile.contigs:
            i = []
            for bed in bedfile.fetch(contig, parser=pysam.asBed()):
                i.append((bed.start, bed.end))
            data_per_contig[contig].extend(i)

    # merge intervals
    for contig in list(data_per_contig.keys()):
        data_per_contig[contig] = Intervals.combine(data_per_contig[contig])

    # filter intervals - take only those present in all bedfiles
    for contig, data in sorted(data_per_contig.items()):
        for start, end in data:
            if isContainedInAll(contig, start, end, bedfiles):
                yield contig, start, end
Example #11
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("--genotype",
                        help="Tabix indexed pileup file.",
                        required=True)
    parser.add_argument("--intervals", help="BED file.", required=True)
    parser.add_argument("--padding",
                        type=int,
                        default=10,
                        help="Number of bases to expand intervals, when "
                        "filtering based on adjacent indels [%default]")
    parser.add_argument("--min-distance-to-indels",
                        type=int,
                        default=5,
                        help="Variants closer than this distance from indels "
                        "are filtered [%default].")
    args = parser.parse_args(argv)

    genotype = pysam.Tabixfile(args.genotype)
    with open(args.intervals) as bed_file:
        intervals = text.parse_lines_by_contig(bed_file, pysam.asBed())

    for (_, beds) in sorted(intervals.items()):
        for (name, sequence) in build_genes(args, genotype, beds):
            FASTA(name, None, sequence).write(sys.stdout)

    return 0
Example #12
def fetchProbeFragments(probe_bed, digest_bed, outfile, lookup_out):

    digest_fragments = pysam.TabixFile(digest_bed)
    bed = Bed.Bed()
    with IOTools.openFile(outfile, "w") as outf, \
         IOTools.openFile(lookup_out,"w") as lookup:

        lookup.write("probe\tfragment\n")
        for probe in Bed.iterator(IOTools.openFile(probe_bed)):

            frag = digest_fragments.fetch(probe.contig,
                                          probe.start,
                                          probe.end,
                                          parser=pysam.asBed())
            frag = list(frag)
            if not len(frag) == 1:
                E.warn("%i fragments found for probe %s, skipping" %
                       (len(frag), probe.name))
                continue

            frag = frag[0]
            bed.start = frag.start
            bed.end = frag.end
            bed.contig = frag.contig
            bed["name"] = probe.name
            bed["score"] = "."
            bed["strand"] = "+"

            lookup.write("%s\t%s\n" % (probe.name, frag.name))
            outf.write(str(bed) + "\n")
Example #13
def load_gap_intervals(gap_file):
    if gap_file is None: return []
    logger.info("Loading the gaps in the genome from %s" % gap_file)
    with open(gap_file) as gap_file_fd:
        gap_intervals = [SVInterval(it.contig, it.start, it.end, it.name, "gap") for it in
                         pysam.tabix_file_iterator(gap_file_fd, parser=pysam.asBed())]
    return merge_intervals(gap_intervals)
def intersect(gff, tbx):
    phastconsvalues = []
    coords = []  #nested list of [chrm, start, stop] for each line in tempgff
    utrexonlengths = []  #lengths of all utr exons

    with open(gff, 'r') as infh:
        for line in infh:
            line = line.strip().split('\t')
            chrm, start, stop = line[0], int(line[3]), int(line[4])
            coords.append([chrm, start, stop])

    for coord in coords:
        utrexonlengths.append(coord[2] - coord[1])
        for row in tbx.fetch(coord[0],
                             coord[1],
                             coord[2],
                             parser=pysam.asBed()):
            score = float(row.score)
            phastconsvalues.append(score)

    medphastcons = np.median(phastconsvalues)

    #Check to see if we had scores for at least some fraction of the exonic nt
    utrlength = sum(utrexonlengths)

    if len(phastconsvalues) >= (utrlength * 0.5):
        return medphastcons
    else:
        return None
Example #16
    def _run(self, _config, temp):
        def keyfunc(bed):
            return (bed.contig, bed.name, bed.start)

        fastafile = pysam.Fastafile(self._reference)
        seqs = collections.defaultdict(list)
        with open(self._intervals) as bedfile:
            intervals = text.parse_lines_by_contig(bedfile, pysam.asBed()).items()
            for (contig, beds) in sorted(intervals):
                beds.sort(key = keyfunc)

                for (gene, gene_beds) in itertools.groupby(beds, lambda x: x.name):
                    gene_beds = tuple(gene_beds)
                    for bed in gene_beds:
                        seqs[(contig, gene)].append(fastafile.fetch(contig, bed.start, bed.end))

                    seq = "".join(seqs[(contig, gene)])
                    if any((bed.strand == "-") for bed in gene_beds):
                        assert all((bed.strand == "-") for bed in gene_beds)
                        seq = sequences.reverse_complement(seq)
                    seqs[(contig, gene)] = seq

        temp_file = os.path.join(temp, "sequences.fasta")
        with open(temp_file, "w") as out_file:
            for ((_, gene), sequence) in sorted(seqs.items()):
                fasta.print_fasta(gene, sequence, out_file)

        move_file(temp_file, self._outfile)
Example #17
def fetchProbeFragments(probe_bed, digest_bed, outfile,
                        lookup_out):

    digest_fragments = pysam.TabixFile(digest_bed)
    bed = Bed.Bed()
    with IOTools.openFile(outfile, "w") as outf, \
         IOTools.openFile(lookup_out,"w") as lookup:

        lookup.write("probe\tfragment\n")
        for probe in Bed.iterator(IOTools.openFile(probe_bed)):
            
            frag = digest_fragments.fetch(probe.contig,
                                          probe.start,
                                          probe.end,
                                          parser=pysam.asBed())
            frag = list(frag)
            if not len(frag) == 1:
                E.warn("%i fragments found for probe %s, skipping" %
                       (len(frag), probe.name))
                continue

            frag = frag[0]
            bed.start = frag.start
            bed.end = frag.end
            bed.contig = frag.contig
            bed["name"] = probe.name
            bed["score"] = "."
            bed["strand"] = "+"

            lookup.write("%s\t%s\n" % (probe.name, frag.name))
            outf.write(str(bed) + "\n")
Example #18
def _bed_getter(bedfile, contig, start=0, end=None, strand=".", dtype="uint16"):
    '''Get crosslink profiles from tabix indexed bedGraph/Bed'''

    # check the file contains some data for the requested contig
    if contig not in bedfile.contigs:
        #print "%s not in bedfile" % contig
        return pd.Series(dict(), dtype=dtype)

    # fetch the records from the specified region
    crosslinks = bedfile.fetch(contig, start, end, parser=pysam.asBed())
    
    profile = dict()

    check_sum = 0
    
    for base in crosslinks:
        try:
            correct_strand = strand == "." or base.strand == strand
        except AttributeError:
            correct_strand = True
            
        if correct_strand:
            profile[float(base.start)] = int(base.score)
            check_sum += int(base.score)

    profile = pd.Series(dict(profile), dtype=dtype)

            
    #if not check_sum == profile.sum():
    #    raise OverflowError("Check sum failed (%i = %i). Possibly counts exceed specified dtype. Use bigger dtype"
    #                        % (check_sum, profile.sum()))

    return profile
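A hedged usage sketch for _bed_getter above (the bedGraph path and region are placeholders); the file is assumed to be bgzipped and tabix-indexed, and the function's module is assumed to import pandas as pd.

import pysam

# Placeholder crosslink bedGraph, bgzipped and indexed with tabix
bedfile = pysam.TabixFile('crosslinks.bedgraph.gz')

profile = _bed_getter(bedfile, contig='chr1', start=10000, end=20000, strand='+')
print(profile.sum())
print(profile.head())
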
def getProbeFragments(probe_bed, digest_bed, outfile,
                        lookup_out):
    
    # First find the length of the restriction enzyme cut, required to obtain the start and end coordinates
    # from the pregenerated file.
    # First iteration, no comparison
    first_iteration = True
    
    length_RE_cut = 0
    
    last_bed = None
    
    for bed_digest in Bed.iterator(IOTools.openFile(digest_bed)):
                
        if first_iteration:
            first_iteration = False
        else:
            # If they are in the same contig they can be compared
            if bed_digest.contig == last_bed.contig:
                length_RE_cut = bed_digest.start - last_bed.end
                break
        
        last_bed = bed_digest
    
    
    digest_fragments = pysam.TabixFile(digest_bed)
    bed = Bed.Bed()
    with IOTools.openFile(outfile, "w") as outf, \
         IOTools.openFile(lookup_out,"w") as lookup:

        lookup.write("probe\tfragment\n")
        for probe in Bed.iterator(IOTools.openFile(probe_bed)):
            
            frag = digest_fragments.fetch(probe.contig,
                                          probe.start,
                                          probe.end,
                                          parser=pysam.asBed())
            frag = list(frag)
            if not len(frag) == 1:
                E.warn("%i fragments found for probe %s, skipping" %
                       (len(frag), probe.name))
                continue

            frag = frag[0]
            
            # The restriction enzyme cut on the left side of the fragment
            # is the end site of the last restriction enzyme fragment + 1
            # (+1 because according to the manual coordinates are specified
            # in 1-origin for the bed start.)
            
            bed.start = frag.start-length_RE_cut+1
            bed.end = frag.end+length_RE_cut
            bed.contig = frag.contig
            bed["name"] = probe.name
            bed["score"] = "."
            bed["strand"] = "+"

            lookup.write("%s\t%s\n" % (probe.name, frag.name))
            outf.write(str(bed) + "\n")
Example #20
def generate_from_bed(bam_file, bed_file, stepper="nofilter"):
    for bed in bed_file.fetch(parser=pysam.asBed()):
        for v in bam_file.pileup(bed.contig,
                                 bed.start,
                                 bed.end,
                                 stepper=stepper,
                                 truncate=True):
            yield v
def getoligoscore(tbx, chrm, start, stop):
    scores = []

    for row in tbx.fetch(chrm, start, stop, parser=pysam.asBed()):
        score = float(row.score)
        scores.append(score)

    return len(scores), scores
Example #22
def generate_from_bed(bam_file, bed_file, **kwargs):
    for bed in bed_file.fetch(parser=pysam.asBed()):
        for v in bam_file.pileup(bed.contig,
                                 bed.start,
                                 bed.end,
                                 **kwargs,
                                 truncate=True):
            yield v
Example #23
def filter_bam(args, bcd):
    with open(args.output, 'w') as o:
        with gzip.open(args.fragments) as f:
            tbx = pysam.tabix_iterator(f, pysam.asBed())
            for line in tbx:
                if line.name in bcd:
                    o.write("{}\n".format(str(line)))
    return 0
Example #24
    def __init__(self, filename, **kwargs):
        filename = str(filename)
        if not filename.endswith('.gz'):
            if os.path.exists(filename + '.gz'):
                filename += '.gz'
            else:
                filename = self.compress(filename, create_index=True)

        super().__init__(filename, parser=pysam.asBed(), **kwargs)
Example #25
    def __init__(self, file_path):
        file_path = str(file_path)
        if not file_path.endswith('.gz'):
            if os.path.exists(file_path + '.gz'):
                file_path += '.gz'
            else:
                file_path = self.compress(file_path)

        super().__init__(file_path, parser=pysam.asBed())
Example #26
    def testRead( self ):

        for x, r in enumerate(self.tabix.fetch( parser = pysam.asBed() )):
            c = self.compare[x]
            self.assertEqual( "\t".join( c ), str(r) )
            self.assertEqual( list(c), list(r) )
            self.assertEqual( c[0], r.contig)
            self.assertEqual( int(c[1]), r.start)
            self.assertEqual( int(c[2]), r.end)
Example #27
    def testRead(self):

        for x, r in enumerate(self.tabix.fetch(parser=pysam.asBed())):
            c = self.compare[x]
            self.assertEqual("\t".join(c), str(r))
            self.assertEqual(list(c), list(r))
            self.assertEqual(c[0], r.contig)
            self.assertEqual(int(c[1]), r.start)
            self.assertEqual(int(c[2]), r.end)
Example #28
def threeUTRmetagene(gff, clusters):
    gff_fn = gff
    db_fn = os.path.basename(gff_fn) + '.db'

    if not os.path.isfile(db_fn):
        gffutils.create_db(gff_fn, db_fn)

    db = gffutils.FeatureDB(db_fn)

    clustertabix = pysam.Tabixfile(clusters)
    number_of_UTRs = float(len(list(db.features_of_type('3\'UTR'))))
    three_prime_UTRs = db.features_of_type('3\'UTR')
    binhits = {}
    bindensities = {}

    for UTR in three_prime_UTRs:
        binnumber = 1
        lowerpercent = 0
        upperpercent = 1
        UTRlength = float((UTR.stop - UTR.start))

        while upperpercent <= 100:
            nt_window = [
                int(round((UTRlength / 100) * lowerpercent)),
                int(round((UTRlength / 100) * upperpercent))
            ]
            windowstart = nt_window[0]
            windowstop = nt_window[1]
            #For every time a cluster bed entry overlaps this particular sequence window
            for bed in clustertabix.fetch(str(UTR.chrom),
                                          UTR.start + windowstart,
                                          UTR.start + windowstop,
                                          parser=pysam.asBed()):
                if UTR.strand == bed.strand:
                    if binnumber in binhits:
                        binhits[binnumber] += 1
                    else:
                        binhits[binnumber] = 1
            lowerpercent += 1
            upperpercent += 1
            binnumber += 1

    os.remove(db_fn)

    #Number of hits in every unpopulated bin is 0
    for x in range(1, 101):
        if x not in binhits:
            binhits[x] = 0

    for binnumber in binhits:
        rawhits = binhits[binnumber]
        density = float(rawhits / number_of_UTRs)
        bindensities[binnumber] = density

    return bindensities
Example #29
def locate_fragments(data: Union[AnnData, MuData],
                     fragments: str,
                     return_fragments: bool = False):
    """
    Parse the fragments file and store the path to it in .uns["files"]["fragments"].

    The fragments file is never read into memory, and the connection to the file is
    closed upon function completion.

    Parameters
    ----------
    data
            AnnData object with peak counts or multimodal MuData object with 'atac' modality.
    fragments
            A path to the compressed tab-separated fragments file (e.g. atac_fragments.tsv.gz).
    return_fragments
            If True, return the Tabix connection to the fragments file. False by default.
    """
    frag = None
    try:
        if isinstance(data, AnnData):
            adata = data
        elif isinstance(data, MuData) and "atac" in data.mod:
            adata = data.mod["atac"]
        else:
            raise TypeError(
                "Expected AnnData or MuData object with 'atac' modality")

        try:
            import pysam
        except ImportError:
            raise ImportError(
                "pysam is not available. It is required to work with the fragments file. \
                Install pysam from PyPI (`pip install pysam`) \
                or from GitHub (`pip install git+https://github.com/pysam-developers/pysam`)"
            )

        # Here we make sure we can create a connection to the fragments file
        frag = pysam.TabixFile(fragments, parser=pysam.asBed())

        if "files" not in adata.uns:
            adata.uns["files"] = OrderedDict()
        adata.uns["files"]["fragments"] = fragments

        if return_fragments:
            return frag

    except Exception as e:
        print(e)

    finally:
        if frag is not None and not return_fragments:
            # The connection has to be closed
            frag.close()
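A minimal sketch of calling locate_fragments above; the file names are placeholders and an AnnData object with ATAC peak counts is assumed to exist on disk, per the docstring.

import anndata as ad

# 'atac_peaks.h5ad' and 'atac_fragments.tsv.gz' are placeholder file names
adata = ad.read_h5ad('atac_peaks.h5ad')
frag = locate_fragments(adata, 'atac_fragments.tsv.gz', return_fragments=True)
try:
    # frag is a pysam.TabixFile opened with parser=pysam.asBed(),
    # so fetched records expose .contig/.start/.end
    for rec in frag.fetch('chr1', 1000000, 1001000):
        print(rec.contig, rec.start, rec.end)
finally:
    frag.close()
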
Example #30
def main(path_input):

    # read the original fragments file
    df = pd.read_csv(path_input,
                     sep="\t",
                     header=None,
                     names=["chr", "start", "end", "cb", "counts"])

    # generate a table describing each chromosome's length
    chrs = "chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chrX chrY chrM"
    ends = "197195432 181748087 159599783 155630120 152537259 149517037 152524553 131738871 124076172 129993255 121843856 121257530 120284312 125194864 103494974 98319150 95272651 90772031 61342430 166650296 15902555 16299"

    chrs = chrs.split(" ")
    ends = ends.split(" ")

    df_contigs = pd.DataFrame({"chromosome": chrs, "len": ends})

    df_contigs.len = df_contigs.len.astype(int)

    # find problematic fragments that exceed contig size (due to cell ranger bug)
    problematics = []

    with pysam.Tabixfile(path_input) as tbx:
        for i, contig in df_contigs[
                df_contigs.chromosome != "chrM"].iterrows():

            for row in tbx.fetch(contig.chromosome,
                                 contig.len - 1,
                                 contig.len,
                                 parser=pysam.asBed()):
                problematics.append(row)

    # write to problematics.csv
    df_problematics = pd.DataFrame(
        map(lambda item: (item.contig, item.start, item.end, item.name),
            problematics))
    df_problematics.to_csv("problematics.csv",
                           sep="\t",
                           header=False,
                           index=False)

    # iterate through problematic fragments and correct the coordinates
    for p in bar(problematics):

        # retrieve the proper end of the contig
        proper_end = df_contigs.loc[df_contigs.chromosome == p.contig,
                                    "len"].values[0]

        # correct
        df.loc[(df.chr == p.contig) & (df.start == p.start)
               & (df.cb == p.name), "end"] = proper_end

    # write to disk
    df.to_csv("fragments.tsv", sep="\t", header=False, index=False)
Example #31
def _collect_and_validate_regions(regions):
    contigs = _collect_fasta_contigs(regions)
    parser = pysam.asBed()
    sequences = set()
    with open(regions["BED"]) as bedhandle:
        for (line_num, line) in enumerate(bedhandle):
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            try:
                bed = parser(line, len(line))
                # Force evaluation of (lazily parsed) properties
                bed_start = bed.start
                bed_end = bed.end
            except ValueError as error:
                raise MakefileError(("Error parsing line %i in regions file:\n"
                                     "  Path = %r\n  Line = %r\n\n%s")
                                    % (line_num + 1, regions["BED"],
                                       line, error))

            if len(bed) < 6:
                url = "http://genome.ucsc.edu/FAQ/FAQformat.html#format1"
                name = repr(bed.name) if len(bed) > 3 else "unnamed record"
                raise MakefileError(("Region at line #%i (%s) does not "
                                     "contain the expected number of fields; "
                                     "the first 6 fields are required. C.f. "
                                     "defination at\n   %s\n\nPath = %r")
                                    % (line_num, name, url, regions["BED"]))

            contig_len = contigs.get(bed.contig)
            if contig_len is None:
                raise MakefileError(("Regions file contains contig not found "
                                     "in reference:\n  Path = %r\n  Contig = "
                                     "%r\n\nPlease ensure that all contig "
                                     "names match the reference names!")
                                    % (regions["BED"], bed.contig))
            elif not (0 <= int(bed_start) < int(bed_end) <= contig_len):
                raise MakefileError(("Regions file contains invalid region:\n"
                                     "  Path   = %r\n  Contig = %r\n"
                                     "  Start  = %s\n  End    = %s\n\n"
                                     "Expected 0 <= Start < End <= %i!")
                                    % (regions["BED"], bed.contig, bed.start,
                                       bed.end, contig_len))
            elif bed.strand not in "+-":
                raise MakefileError(("Regions file contains invalid region: "
                                     "  Path   = %r\n  Line = %i\n  Name = %r"
                                     "\nStrand is %r, expected '+' or '-'.")
                                    % (regions["BED"], line_num, bed.name,
                                       bed.strand))

            sequences.add(bed.name)
Example #32
def _collect_and_validate_regions(regions):
    contigs = _collect_fasta_contigs(regions)
    parser = pysam.asBed()
    sequences = set()
    with open(regions["BED"]) as bedhandle:
        for (line_num, line) in enumerate(bedhandle):
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            try:
                bed = parser(line, len(line))
                # Force evaluation of (lazily parsed) properties
                bed_start = bed.start
                bed_end = bed.end
            except ValueError as error:
                raise MakefileError(
                    ("Error parsing line %i in regions file:\n"
                     "  Path = %r\n  Line = %r\n\n%s") %
                    (line_num + 1, regions["BED"], line, error))

            if len(bed) < 6:
                url = "http://genome.ucsc.edu/FAQ/FAQformat.html#format1"
                name = repr(bed.name) if len(bed) > 3 else "unnamed record"
                raise MakefileError(("Region at line #%i (%s) does not "
                                     "contain the expected number of fields; "
                                     "the first 6 fields are required. C.f. "
                                     "defination at\n   %s\n\nPath = %r") %
                                    (line_num, name, url, regions["BED"]))

            contig_len = contigs.get(bed.contig)
            if contig_len is None:
                raise MakefileError(("Regions file contains contig not found "
                                     "in reference:\n  Path = %r\n  Contig = "
                                     "%r\n\nPlease ensure that all contig "
                                     "names match the reference names!") %
                                    (regions["BED"], bed.contig))
            elif not (0 <= int(bed_start) < int(bed_end) <= contig_len):
                raise MakefileError(("Regions file contains invalid region:\n"
                                     "  Path   = %r\n  Contig = %r\n"
                                     "  Start  = %s\n  End    = %s\n\n"
                                     "Expected 0 <= Start < End <= %i!") %
                                    (regions["BED"], bed.contig, bed.start,
                                     bed.end, contig_len))
            elif bed.strand not in "+-":
                raise MakefileError(
                    ("Regions file contains invalid region: "
                     "  Path   = %r\n  Line = %i\n  Name = %r"
                     "\nStrand is %r, expected '+' or '-'.") %
                    (regions["BED"], line_num, bed.name, bed.strand))

            sequences.add(bed.name)
Example #33
def getKmerPhastcons(fastadict, k, phastconsbed):
    k = int(k)
    phastconsdict = {}  #{kmer:mean_phastcons_score_of_bases_in_kmer}
    phastconsaveragedict = {}  #{kmer:mean of scores in phastconsdict}
    UTRcounter = 0
    phastconstabix = pysam.Tabixfile(phastconsbed)

    for UTR in fastadict:
        UTRcounter += 1
        if UTRcounter % 50 == 0:
            sys.stderr.write(
                'Determining motif conservation in UTR {0} of {1}...\n'.format(
                    UTRcounter, len(fastadict)))
        UTRsequence = fastadict[UTR]
        UTR = UTR.replace(';', '\t').split('\t')
        ID = UTR[0]
        chrm = UTR[1]
        start = int(UTR[2])
        stop = int(UTR[3])
        strand = UTR[4]
        for i in range(len(UTRsequence) - k + 1):
            if strand == '+':
                mousekmer = UTRsequence[i:i + k]
                mousekmerstart = start + i
                mousekmerstop = start + i + k - 1
            elif strand == '-':
                mousekmer = UTRsequence[i:i + k]
                mousekmerstart = stop - i - k + 1
                mousekmerstop = stop - i
            kmerscores = []  #list of phastcons scores for every bp of this kmer
            #Remember...input gff coords are 1-based and the phastconsbed is 0-based
            for bed in phastconstabix.fetch(str(chrm),
                                            mousekmerstart - 1,
                                            mousekmerstop,
                                            parser=pysam.asBed()):
                kmerscores.append(float(bed.name))
            if len(kmerscores) == k:  #if every base in the kmer had a score
                kmeraveragescore = sum(kmerscores) / float(len(kmerscores))
                if mousekmer not in phastconsdict:
                    phastconsdict[mousekmer] = [kmeraveragescore]  #if kmer not in dictionary, initialize entry
                else:
                    phastconsdict[mousekmer].append(kmeraveragescore)
            else:  #if not every base in the kmer had a score
                continue

    for kmer in phastconsdict:
        phastconsaveragedict[kmer] = np.mean(phastconsdict[kmer])
    return phastconsaveragedict
def read_bed_records(filename):
    """Reads a bed-file (i.e. for a set of regions of interest), and returns
    a sorted list containing each line as a tuple containing the contig name,
    the start position, and the end position."""
    regions = []
    bed_parser = pysam.asBed()
    with open(filename) as bed_file:
        for line in bed_file:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            regions.append(bed_parser(line, len(line)))
    return regions
def filter_bam(args,bcd):
  reads     = {}
  replicate = {}
  with gzip.open(args.fragments) as f:
    tbx = pysam.tabix_iterator(f,pysam.asBed())
    for line in tbx:
      if line.name in bcd:
        try:
          reads[bcd[line.name] + "_rep" + line.name.split("_")[-2]].append(str(line))
        except KeyError:
          reads[bcd[line.name] + "_rep" + line.name.split("_")[-2]]     = [str(line)]
          
  return(reads)
def get_reads(tbx, chrm, start, end):

    reads = []

    # pull out the quality scores
    for row in tbx.fetch(chrm, start, end, parser=pysam.asBed()):
        c = row[0]
        s = row[1]
        e = row[2]
        r = row[5]
        cse = [c, s, e]
        curr_row = [cse, r]
        reads.append(curr_row)
    #for r in reads: print (r)
    return reads
Example #38
def combineUnmergedIntervals(foreground, background):
    '''combine intervals in a collection of bed files.

    Only intervals in the first track are reported.

    Algorithm:

    1. report all intervals in the first track that overlap with an interval in every other track.
    '''

    intervals = []
    c = 0
    for bed in foreground.fetch(parser=pysam.asBed()):
        c += 1
        if isContainedInAll(bed.contig, bed.start, bed.end, background):
            yield bed
Example #40
def getphastcons(kmerpos, phastconsbed, outfile, protein, RBNSstate):
    #kmerpos = {} # {age : {location : [[chrm, kmerstart, kmerstop, strand]]}}
    phastconsdict = {
    }  # {age : {location : [meanphastcons of oligo1 around motif, meanphastcons of oligo2 around motif]}}
    phastconstabix = pysam.Tabixfile(phastconsbed)
    for age in kmerpos:
        phastconsdict[age] = {}
        for location in kmerpos[age]:
            phastconsdict[age][location] = []
            for kmer in kmerpos[age][location]:
                chrm = kmer[0]
                kmerstart = int(kmer[1])
                kmerstop = int(kmer[2])
                strand = kmer[3]
                phastconsscores = []
                windowstart = kmerstart - 25
                windowend = kmerstop + 25
                try:
                    for bed in phastconstabix.fetch(chrm,
                                                    windowstart,
                                                    windowend,
                                                    parser=pysam.asBed()):
                        phastconsscore = float(bed.name)
                        phastconsscores.append(phastconsscore)
                except ValueError:
                    print('WARNING: problem with {0}:{1}-{2}:{3}.'.format(
                        chrm, kmerstart, kmerstop, strand))

                if phastconsscores:  #if there were any bases in the region that had phastcons scores
                    meanphastcons = mean(phastconsscores)
                    phastconsdict[age][location].append(meanphastcons)

    if not os.path.isfile(outfile):
        with open(outfile, 'w') as f:
            f.write(('\t').join(
                ['age', 'location', 'protein', 'RBNSstate', 'meanphastcons']) +
                    '\n')

    for age in phastconsdict:
        for location in phastconsdict[age]:
            for score in phastconsdict[age][location]:
                with open(outfile, 'a') as f:
                    f.write(('\t').join(
                        [age, location, protein, RBNSstate,
                         str(score)]) + '\n')
Example #41
def _bed_getter(bedfile, contig, start=0, end=None, strand=".", dtype="uint16"):
    '''Get crosslink profiles from tabix indexed bedGraph/Bed'''

    # check the file contains some data for the requested contig
    if contig not in bedfile.contigs:
        #print "%s not in bedfile" % contig
        return pd.Series(dict(), dtype=dtype)

    # fetch the records from the specified region
    crosslinks = bedfile.fetch(contig, start, end, parser=pysam.asBed())
    
    profile = dict()

    check_sum = 0
    
    for base in crosslinks:
        try:
            correct_strand = strand == "." or base.strand == strand
        except AttributeError:
            correct_strand = True
        except KeyError:
            correct_strand = True
            
        if correct_strand:

            try:
                profile[float(base.start)] = int(base.score)
                check_sum += int(base.score)
            except AttributeError:
                profile[float(base.start)] = 1
                check_sum += 1
            except KeyError:
                profile[float(base.start)] = 1
                check_sum += 1

    if len(profile.keys())==0:
        profile = pd.Series(profile, dtype=dtype, index=pd.Index([], dtype="float"))
        
    profile = pd.Series(dict(profile), dtype=dtype)

            
    #if not check_sum == profile.sum():
    #    raise OverflowError("Check sum failed (%i = %i). Possibly counts exceed specified dtype. Use bigger dtype"
    #                        % (check_sum, profile.sum()))

    return profile
Example #42
def read_bed_file(filename):
    """Parses a (gzip/bzip2 compressed) BED file, and yields
    a sequence of records. Comments and empty lines are skipped."""
    handle = None
    try:
        handle = fileutils.open_ro(filename)
        parser = pysam.asBed()

        for record in text.parse_lines(handle, parser):
            # Force evaluation of (lazily parsed) properties
            _ = record.start
            _ = record.end

            yield record

    finally:
        if handle:
            handle.close()
Example #43
    def testWrite(self):

        for x, r in enumerate(self.tabix.fetch(parser=pysam.asBed())):
            c = self.compare[x]
            self.assertEqual(c, str(r).split("\t"))
            self.assertEqual(list(c), list(r))

            r.contig = "test"
            self.assertEqual("test", r.contig)
            self.assertEqual("test", r[0])

            r.start += 1
            self.assertEqual(int(c[1]) + 1, r.start)
            self.assertEqual(str(int(c[1]) + 1), r[1])

            r.end += 1
            self.assertEqual(int(c[2]) + 1, r.end)
            self.assertEqual(str(int(c[2]) + 1), r[2])
Example #44
def _get_hits(coords, annotation, parser_type):
    """Retrieve BED information, recovering if BED annotation file does have a chromosome.
    """
    if parser_type == "bed":
        parser = pysam.asBed()
    elif parser_type == "vcf":
        parser = pysam.asVCF()
    elif parser_type == "tuple":
        parser = pysam.asTuple()
    elif parser_type is None:
        parser = None
    else:
        raise ValueError("Unexpected parser type: %s" % parser_type)
    chrom, start, end = coords
    try:
        hit_iter = annotation.fetch(str(chrom), start, end, parser=parser)
    # catch invalid region errors raised by ctabix
    except ValueError:
        hit_iter = []
    return hit_iter
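A usage sketch for _get_hits above under stated assumptions: the annotation path is a placeholder, the annotation is a pysam.Tabixfile, and parser_type selects how each hit is parsed.

import pysam

# Placeholder bgzipped, tabix-indexed annotation BED
annotation = pysam.Tabixfile('annotation.bed.gz')

for hit in _get_hits(('chr1', 100000, 200000), annotation, 'bed'):
    # each hit is parsed with pysam.asBed(), so .contig/.start/.end are available
    print(hit.contig, hit.start, hit.end)
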
Example #45
    def _run(self, _config, temp):
        def _by_name(bed):
            return bed.name

        fastafile = pysam.Fastafile(self._reference)
        seqs = collections.defaultdict(list)
        with open(self._bedfile) as bedfile:
            bedrecords = text.parse_lines_by_contig(bedfile, pysam.asBed())
            for (contig, beds) in sorted(bedrecords.items()):
                beds.sort(key=lambda bed: (bed.contig, bed.name, bed.start))

                for (gene, gene_beds) in itertools.groupby(beds, _by_name):
                    gene_beds = tuple(gene_beds)
                    sequence = self._collect_sequence(fastafile, gene_beds)
                    seqs[(contig, gene)] = sequence

        temp_file = os.path.join(temp, "sequences.fasta")
        with open(temp_file, "w") as out_file:
            for ((_, gene), sequence) in sorted(seqs.items()):
                FASTA(gene, None, sequence).write(out_file)

        fileutils.move_file(temp_file, self._outfile)
Example #46
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("--genotype", help="Tabix indexed pileup file.",
                        required=True)
    parser.add_argument("--intervals", help="BED file.", required=True)
    parser.add_argument("--padding", type=int, default=10,
                        help="Number of bases to expand intervals, when "
                             "filtering based on adjacent indels [%default]")
    parser.add_argument("--min-distance-to-indels", type=int, default=5,
                        help="Variants closer than this distance from indels "
                             "are filtered [%default].")
    args = parser.parse_args(argv)

    genotype = pysam.Tabixfile(args.genotype)
    with open(args.intervals) as bed_file:
        intervals = text.parse_lines_by_contig(bed_file, pysam.asBed())

    for (_, beds) in sorted(intervals.items()):
        for (name, sequence) in build_genes(args, genotype, beds):
            FASTA(name, None, sequence).write(sys.stdout)

    return 0
Example #47
 def _stat_areas_of_interest(cls, prefixes):
     """Returns (size, number of named intervals, total number of intervals)
     for a set of areas of interest."""
     areas_of_interest = {}
     for (prefix_name, prefix) in prefixes.items():
         prefix_label = prefix.get("Label", prefix_name)
         for (roi_name, roi_filename) in prefix.get("RegionsOfInterest", {}).items():
             count, names, size = 0, set(), 0
             with open(roi_filename) as handle:
                 parser = pysam.asBed()
                 for line in handle:
                     bed = parser(line, len(line))
                     names.add(bed.name if len(bed) >= 4 else (bed.contig + "*"))
                     size += (bed.end - bed.start)
                     count += 1
             areas_of_interest[(prefix_name, roi_name)] = {"Size"       : size,
                                                           "NFeatures"  : len(names),
                                                           "NIntervals" : count,
                                                           "Genome"     : prefix["Name"],
                                                           "Name"       : roi_name,
                                                           "Label"      : "%s:%s" % (prefix_label, roi_name),
                                                           "Path"       : roi_filename}
     return areas_of_interest
Example #48
def read_intervals(filename):
    with open(filename) as bed_file:
        intervals = text.parse_lines_by_contig(bed_file, pysam.asBed())

        for (key, beds) in intervals.items():
            bed_tuples = []
            for bed in beds:
                if len(bed) < 6:
                    sys.stderr.write(("ERROR: Invalid BED record '%s', must "
                                      "have at least 6 fields ...\n") %
                                     ("\\t".join(bed),))
                    return None

                # Transform to a named tuple, as Pysam has a tendency to
                # segfault if you do anything wrong
                bed = list(bed)[:6]   # BED6 only
                bed[1] = int(bed[1])  # start
                bed[2] = int(bed[2])  # end
                bed[4] = int(bed[4])  # score

                bed_tuples.append(BEDTuple(*bed))
            intervals[key] = bed_tuples

    return intervals
Example #49
def fetch_parsed(fn):
    with pysam.Tabixfile(fn) as f:
        return len(list(f.fetch(parser=pysam.asBed())))
Example #50
def test_fetch_parsed():
    """Stupid test function"""
    f = pysam.Tabixfile(fn_compressed)
    l = len( list(f.fetch( parser = pysam.asBed())) )
Example #51
def iterate_file_uncompressed(fn):
    with open(fn) as f:
        return len(list(pysam.tabix_file_iterator(f, parser=pysam.asBed())))
Example #52
def iterate_parsed_compressed(fn):
    with gzip.open(fn) as f:
        return len(list(pysam.tabix_iterator(f, parser=pysam.asBed())))
Example #53
                from bx.bbi.bigwig_file import BigWigFile
                annos[anno] = BigWigFile(open(anno_files[anno]))

        except IOError:
            sys.exit("Gemini cannot open this annotation file: %s. \n"
                     "Have you installed the annotation files?  If so, "
                     "have they been moved or deleted? Exiting...\n\n"
                     "For more details:\n\t"
                     "http://gemini.readthedocs.org/en/latest/content/"
                     "#installation.html\#installing-annotation-files\n"
                     % anno_files[anno])

# ## Standard access to Tabix indexed files


PARSERS = {"bed": pysam.asBed(),
           "vcf": pysam.asVCF(),
           "tuple": pysam.asTuple(),
           None: None}

def _get_hits(coords, annotation, parser_type, _parsers=PARSERS):
    """Retrieve BED information, recovering if BED annotation file does have a chromosome.
    """
    try:
        parser = _parsers[parser_type]
    except KeyError:
        raise ValueError("Unexpected parser type: %s" % parser_type)
    chrom, start, end = coords
    try:
        hit_iter = annotation.fetch(str(chrom), start, end, parser=parser)
    # catch invalid region errors raised by ctabix
Example #54
def test_iterator_file_uncompressed():
    f = open("windows_small.bed")
    l = len( list( pysam.tabix_file_iterator( f, parser = pysam.asBed() )))
Example #55
def test_iterator_parsed_compressed():
    f = gzip.open(fn_compressed)
    l = len( list( pysam.tabix_iterator( f, parser = pysam.asBed() )))