예제 #1
0
def getFeatureCount(f,todir="./data",mode="cN"):
    """
    Count fragments of one BEDPE file per annotated feature set and write the
    counts to ``<todir>/<sample>_<mode>.txt`` (tab separated).

    Only intra-chromosomal, non-chrM pairs are used. The fragment spans from
    the smallest start to the largest end of the two mates; it is kept when
    its length d satisfies 140 <= d <= 180 (mode "cN") or d <= 80 (mode
    "sP"). Any other mode keeps nothing. For each kept fragment the value
    found at its midpoint in the module-level ``model`` is counted.

    Nothing is done (and None is returned) when the output file already
    exists.

    Parameters
    ----------
    f : str
        Path to a gzip-compressed BEDPE file; ".bedpe.gz" is stripped from
        the base name to build the output name.
    todir : str
        Output directory.
    mode : str
        "cN" or "sP", see above.
    """
    n = f.split("/")[-1].replace(".bedpe.gz", "")
    fout = todir + "/" + n + '_%s.txt'%mode
    if os.path.isfile(fout):
        return
    ss = {}
    # Text mode ("rt") so that every line is a str that can be split on
    # "\n"/"\t"; the context manager closes the file handle.
    with gzip.open(f, "rt") as fin:
        for i, line in enumerate(fin):
            if i % 10000 == 0:
                cFlush("%s read from %s" % (i, f))
            line = line.split("\n")[0].split("\t")
            if line[0] != line[3] or line[0]=="chrM":
                continue
            s = min(int(line[1]), int(line[4]))
            e = max(int(line[2]), int(line[5]))
            d = e - s
            if mode == "cN":
                keep = 140 <= d <= 180
            elif mode == "sP":
                keep = d <= 80
            else:
                keep = False
            if not keep:
                continue
            # Floor division keeps the midpoint an integer on Python 3 too.
            m = (s + e) // 2
            iv = HTSeq.GenomicInterval(line[0], m, m + 1)
            # NOTE(review): nv is used as a dict key, so the values stored in
            # `model` must be hashable - confirm against where model is built.
            for niv, nv in model[iv].steps():
                if nv != set([]):
                    ss[nv] = ss.get(nv, 0) + 1
    print()
    ss = pd.Series(ss)
    ss.to_csv(fout,sep="\t")
    print(f, "finished")
    logger.info("file:%s,mode:%s,features:%s"%(f,mode,len(ss)))
예제 #2
0
파일: RNA.py 프로젝트: dfporter/easyCLIP
    def find_introns(self):
        """Fill ``self.elements['intron']`` with the gaps between exons.

        Exons are ordered by start; every pair of neighbours yields one
        intron running from the left exon's end to the right exon's start,
        on the left exon's chromosome and strand.

        Returns False when no error was encountered. Returns True (after
        printing the offending pair) when two neighbouring exons touch or
        overlap; introns appended before that point are left in place.
        """
        elements = self.elements

        # Introns that already exist are never recomputed.
        if 'intron' in elements and len(elements['intron']):
            return False

        elements['intron'] = []

        # No exon entry, or exactly one exon: there is nothing to derive.
        if 'exon' not in elements or len(elements['exon']) == 1:
            return False

        ordered = sorted(elements['exon'], key=lambda x: x.start)

        for left, right in zip(ordered, ordered[1:]):
            if left.end >= right.start:
                print(f"Exons not intrepretable:", left, right, '\n-\n')
                return True

            elements['intron'].append(
                HTSeq.GenomicInterval(left.chrom, left.end, right.start,
                                      left.strand))

        return False
def VariantCallTabReader(filepath, chrom_size):
    """
    Read a tab-separated table of variant calls and group them by variant
    type.

    The table must have the columns 'var_type' and 'chr'; the first column
    is used as the accession of the call. Spaces in the variant type are
    replaced by underscores and 'chr' is prepended to the chromosome name.
    Calls whose chromosome is not a key of ``chrom_size`` are skipped, but
    their variant type is still registered (with possibly empty lists).

    Returns a tuple ``(var_types_ga, var_types_id)``: two dicts keyed by
    variant type, holding the HTSeq.GenomicInterval objects and the matching
    accessions in file order.
    """
    infile = pd.read_csv(filepath, sep="\t")

    var_types_ga = {}
    var_types_id = {}

    for _, line in infile.iterrows():
        var_type = str(line['var_type']).replace(" ", "_")

        intervals = var_types_ga.setdefault(var_type, [])
        accessions = var_types_id.setdefault(var_type, [])

        chrom = 'chr' + str(line['chr'])
        if chrom not in chrom_size:
            continue

        # Explicit positional access: integer keys on a label-indexed Series
        # no longer fall back to positions in recent pandas.
        accession = line.iloc[0]

        # Coordinates are chosen by the project helper inner_outer_pref.
        start = inner_outer_pref(line, 'start')
        end = inner_outer_pref(line, 'stop')

        intervals.append(HTSeq.GenomicInterval(chrom, start, end, "."))
        accessions.append(accession)
    return (var_types_ga, var_types_id)
예제 #4
0
def Get_IPAsite_IPUI(input_tuple):
    """Compare coverage of one IPA event with the coverage of a flanking exon.

    ``input_tuple`` is unpacked into ``(IPAevent, curr_label_all_ga, gas)``:

    * ``IPAevent`` is a ";"-separated string ``label;intron_rank;IPA_inf;IPAtype``.
      The intron number is the part of ``intron_rank`` after "_", the two
      positions are the "-"-separated integers after ":" in ``IPA_inf``, and
      the symbol is the text between ":" and "|" in ``label``.
    * ``curr_label_all_ga`` is iterated once per sample and indexed with
      genomic intervals to obtain per-position coverage.
    * ``gas`` is stepped through to find the interval whose value equals
      ``{('exon', rank)}``.

    The module-level names ``annot`` (rows per label) and ``all_bamfiles``
    (only its length is used) are read as well.

    Returns ``[SYMBOL, intron_rank, IPA_inf, IPAtype] + ratios +
    [ratio difference, Fisher P value]`` when the thresholds below are met,
    otherwise an empty list. If several rows of ``annot[label]`` match, the
    last one that passes wins.
    """
    IPAevent,curr_label_all_ga,gas = input_tuple
    label,intron_rank,IPA_inf,IPAtype = IPAevent.split(";")
    intronrank = int(intron_rank.split("_")[1])
    position_list = list(map(int,IPA_inf.split(":")[1].split('-')))
    SYMBOL = label.split(":")[1].split("|")[0]
    result = []
    for feature,rank,chrom,start,end,strand,length,exon_rank_left,exon_rank_right in annot[label]:
        if feature == "intron" and int(rank) == intronrank:
            iv = HTSeq.GenomicInterval(chrom,start,end,strand)
            # Offsets of the two IPA positions relative to the intron start.
            IPAstart = position_list[0]-int(start)
            IPA_location = position_list[1]-int(start)
            curr_label_all_cov = []
            for ga in curr_label_all_ga:
                if strand == "-":
                    # Minus strand: reverse the coverage vector and measure
                    # the offsets from the intron end instead.
                    curr_label_all_cov.append(list(ga[iv])[::-1])
                    IPAstart = int(end)-position_list[1]
                    IPA_location = int(end)-position_list[0]
                else:
                    curr_label_all_cov.append(list(ga[iv]))
            # Mean coverage over the first (IPA_location-IPAstart)/1.5
            # positions (truncated to int) starting at IPAstart, per sample.
            IPA_isoform_abundance = [np.mean(cvg_region[IPAstart:(IPAstart+int(int((IPA_location-IPAstart)/1.5)))]) for cvg_region in curr_label_all_cov]
            # "+" uses the left exon rank; every other strand value uses the
            # right exon rank.
            if strand == "+":
                exon_iv = tuple(i[0] for i in gas.steps() if i[1] == {('exon',int(exon_rank_left))})
            else:
                exon_iv = tuple(i[0] for i in gas.steps() if i[1] == {('exon',int(exon_rank_right))})
            # Only continue when exactly one step carries that exon alone.
            if len(exon_iv) == 1:
                # Mean of the 30 highest per-position values of the exon.
                exon_abundance = [np.mean(sorted(list(ga[exon_iv[0]]),reverse=True)[:30]) for ga in curr_label_all_ga]
                # Require exon abundance > 10 in len(all_bamfiles) samples and
                # exon > IPA abundance in more than half of len(all_bamfiles).
                if sum(np.array(exon_abundance)>10) == len(all_bamfiles) and sum(np.array([x-y for x,y in zip(exon_abundance,IPA_isoform_abundance)])>0)>len(all_bamfiles)*0.5:
                    IPARatio_list = [round(x/y,3) for x,y in zip(IPA_isoform_abundance,exon_abundance)]
                    # NOTE(review): indices 0 and 1 are hard-coded here and in
                    # the Fisher table, so at least two samples are assumed -
                    # confirm against the caller.
                    IPUI_condition_diff = round(IPARatio_list[1] - IPARatio_list[0],3)
                    ratio_val,P_val = sp.stats.fisher_exact([[IPA_isoform_abundance[0],exon_abundance[0]-IPA_isoform_abundance[0]],[IPA_isoform_abundance[1],exon_abundance[1]-IPA_isoform_abundance[1]]])
                    result = [SYMBOL,intron_rank,IPA_inf,IPAtype]+IPARatio_list+[IPUI_condition_diff,P_val]
    return result
예제 #5
0
def getCov(f, paired=True):
    """
    Build an unstranded HTSeq.GenomicArrayOfSets from a gzip-compressed,
    tab-separated read file.

    With ``paired=True`` columns 0-5 are read as a BEDPE pair: pairs on two
    different chromosomes are skipped and the fragment spans from the
    smallest start to the largest end. With ``paired=False`` columns 1 and 2
    are used directly. Each distinct (chrom, start, end) is added once, with
    the line index (as a string) as the value.

    Returns ``(number of unique fragments, model)``, or ``(0, None)`` when
    the file contains no lines.
    """
    logger.info("Building coverage model for %s, paired=%s" % (f, paired))
    model = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    i = None
    uniqs = set()
    # Text mode ("rt") so that lines are str and can be split on "\n"/"\t";
    # the context manager closes the file handle.
    with gzip.open(f, "rt") as fin:
        for i, line in enumerate(fin):
            if i % 10000 == 0:
                cFlush("%s read from %s" % (i, f))
            line = line.split("\n")[0].split("\t")
            if paired and line[0] != line[3]:
                continue
            if paired:
                s = min(int(line[1]), int(line[4]))
                e = max(int(line[2]), int(line[5]))
            else:
                s = int(line[1])
                e = int(line[2])
            r = (line[0], s, e)
            if r not in uniqs:
                iv = HTSeq.GenomicInterval(line[0], s, e)
                model[iv] += str(i)
                uniqs.add(r)
    # i is still None only when the loop body never ran (empty file).
    if i is None:
        logger.error("ERROR! No read in %s." % f)
        return 0, None
    logger.info("%s read from %s, unique %s" % (i, f, len(uniqs)))
    return len(uniqs), model
    def get_annotations(self, chromosome, start, end):
        """Return the union of all values stored in ``self.gas`` on the
        unstranded interval ``chromosome:start-end``."""
        region = HTSeq.GenomicInterval(chromosome, start, end, '.')
        entries = set()
        for annotation in self.gas[region]:
            entries.update(annotation)
        return entries
예제 #7
0
    def __init__(self,
                 bam_filename,
                 chrom,
                 start,
                 stop,
                 strand,
                 log_base,
                 color,
                 bad_cigar=INSERTION_DELETIONS,
                 coverage_cigar=COVERAGE_CIGAR,
                 junction_cigar=JUNCTION_CIGAR,
                 warn_skipped=True):
        """Store the settings, open the BAM file and count coverage/junctions.

        All arguments are stored unchanged on the instance under the same
        names. In addition ``length`` (stop - start + 1), ``coordinates``
        (chrom, start, stop, strand) and ``interval`` (an
        HTSeq.GenomicInterval built from the coordinates) are derived, the
        BAM file is opened with HTSeq.BAM_Reader, and ``coverage`` and
        ``junctions`` are filled from ``count_coverage()`` and
        ``count_junctions()``.
        """
        # Source file and display settings.
        self.bam_filename = bam_filename
        self.log_base = log_base
        self.color = color

        # CIGAR handling options.
        self.bad_cigar = bad_cigar
        self.coverage_cigar = coverage_cigar
        self.junction_cigar = junction_cigar
        self.warn_skipped = warn_skipped

        # Genomic region and the values derived from it.
        self.coordinates = (chrom, start, stop, strand)
        self.chrom, self.start, self.stop, self.strand = self.coordinates
        self.length = stop - start + 1
        self.interval = HTSeq.GenomicInterval(chrom, start, stop, strand)

        # The reader must exist before the two counting passes run.
        self.bam = HTSeq.BAM_Reader(bam_filename)
        self.coverage = self.count_coverage()
        self.junctions = self.count_junctions()
예제 #8
0
def extract_splice_sites(file, bin):
    """Mark a window of +/- ``bin`` around both edges of every exon.

    ``file`` is read with HTSeq.GFF_Reader. For each feature of type 'exon'
    the unstranded intervals [start-bin, start+bin) and [end-bin, end+bin)
    are set to 1 in an integer HTSeq.GenomicArray, which is returned. An
    IndexError raised while writing either window skips the rest of that
    exon.
    """
    cvg = HTSeq.GenomicArray("auto", stranded=False, typecode='i')
    for feature in HTSeq.GFF_Reader(file):
        if feature.type != 'exon':
            continue
        chrom = feature.iv.chrom
        # Both windows are built before anything is written, so only the
        # assignments below are covered by the IndexError guard.
        windows = [
            HTSeq.GenomicInterval(chrom, edge - bin, edge + bin, '.')
            for edge in (feature.iv.start, feature.iv.end)
        ]
        try:
            for window in windows:
                cvg[window] = 1
        except IndexError:
            continue
    return cvg
예제 #9
0
def bg2GModel(bg):
    """
    Load a bedGraph file (gzip-compressed when the name ends with ".gz")
    into an unstranded HTSeq.GenomicArray.

    Each line must provide chrom, start, end and value as its first four
    tab-separated columns; lines with fewer columns are skipped. Progress is
    reported through commandFlush every 10000 lines.

    Returns the filled HTSeq.GenomicArray.
    """
    opener = gzip.open if bg.endswith(".gz") else open
    # Single-argument print calls give the same output as the former print
    # statements and are also valid on Python 3.
    print("%s Start building model for %s" % (datetime.now(), bg))
    model = HTSeq.GenomicArray("auto", stranded=False)
    # Text mode so that gzip lines are str; the handle is closed on exit.
    with opener(bg, "rt") as f:
        for i, line in enumerate(f):
            if i % 10000 == 0:
                report = "%s lines genome signal read." % i
                commandFlush(report)
            line = line.split("\n")[0].split("\t")
            # Four columns are needed because line[3] is read below.
            if len(line) < 4:
                continue
            chrom = line[0]
            s = int(line[1])
            e = int(line[2])
            iv = HTSeq.GenomicInterval(chrom, s, e)
            model[iv] = float(line[3])
    print("")
    print("%s Model built for %s" % (datetime.now(), bg))
    return model
예제 #10
0
def cluster_genes(genes, chrom_list):
    """Merge partially or completely overlapping genes into single clusters.

    ``genes`` is walked step by step. Consecutive non-empty steps form one
    cluster; the cluster interval spans all of them and its value is the
    union of their gene sets. A cluster is written to the result when an
    empty step follows, when the chromosome changes, or when the steps are
    exhausted.

    Parameters
    ----------
    genes : object with ``steps()`` yielding ``(interval, set)`` pairs
    chrom_list : passed as the chromosome argument of the new
        HTSeq.GenomicArrayOfSets

    Returns
    -------
    HTSeq.GenomicArrayOfSets (unstranded) holding one entry per cluster.
    """
    genes2 = HTSeq.GenomicArrayOfSets(chrom_list, stranded=False)
    cluster_iv = None      # interval of the cluster currently being built
    cluster_ids = set()    # gene ids collected for that cluster

    for iv, gene in genes.steps():
        is_gap = len(gene) == 0

        # Flush only when a cluster is actually pending. Without this guard
        # a second empty step (e.g. the leading gap of the next chromosome)
        # would overwrite the cluster just written with an empty set.
        if cluster_iv is not None and (is_gap or iv.chrom != cluster_iv.chrom):
            genes2[cluster_iv] = cluster_ids
            cluster_iv = None
            cluster_ids = set()

        if is_gap:
            continue

        if cluster_iv is None:
            cluster_iv = iv
        else:
            cluster_iv.extend_to_include(iv)
        cluster_ids = cluster_ids | gene

    # A cluster that reaches the very last step has no gap after it.
    if cluster_iv is not None:
        genes2[cluster_iv] = cluster_ids

    return genes2
예제 #11
0
def bedToolsInterval2GenomicInterval(bedtool):
    """
    Given a pybedtools.BedTool object, returns dictionary of HTSeq.GenomicInterval objects.

    The result is an OrderedDict keyed by the ``name`` of each interval, in
    iteration order; a later interval with the same name replaces the
    earlier one. Strand "+", 0 or "0" becomes "+"; strand "-", 1 or "1"
    becomes "-"; anything else yields an interval built without a strand
    argument.
    """
    intervals = OrderedDict()
    for iv in bedtool:
        if iv.strand in ("+", 0, "0"):
            strand = "+"
        # The minus branch must test for 1, not 0: 0 is already consumed by
        # the plus branch above.
        elif iv.strand in ("-", 1, "1"):
            strand = "-"
        else:
            strand = None

        if strand is None:
            intervals[iv.name] = HTSeq.GenomicInterval(iv.chrom, iv.start,
                                                       iv.end)
        else:
            intervals[iv.name] = HTSeq.GenomicInterval(iv.chrom, iv.start,
                                                       iv.end, strand)
    return intervals
예제 #12
0
def get_window_counts_and_normalize(window_dict, tags_dict, genome_data, scaling_factor, total_reads, window_size):
    """Count reads per window and store a normalized score for each window.

    For every chromosome in ``genome_data`` and every window start in
    ``window_dict[chrom]`` the read count comes from
    ``get_read_count_in_window``. The normalized value is
    ``read_count * scaling_factor / total_reads``.

    Returns ``(window_counts_dict, normalized_window_array)``:
    a dict mapping each chromosome to a list of
    ``[window_start, read_count, 0]`` entries, and an unstranded
    HTSeq.GenomicArray (typecode 'd') holding the normalized value on
    ``[window_start, window_start + window_size)``.
    """
    # Normalized scores, used to generate the bedgraph file.
    normalized_window_array = HTSeq.GenomicArray(genome_data, stranded=False, typecode='d')

    # Every chromosome gets an entry, even when it ends up without windows.
    window_counts_dict = {chrom: [] for chrom in genome_data}

    for chrom in genome_data:
        chrom_counts = window_counts_dict[chrom]
        for window_start in window_dict[chrom]:
            read_count = get_read_count_in_window(chrom, window_start, window_size, tags_dict)
            chrom_counts.append([window_start, read_count, 0])

            normalized_count = float(read_count) * float(scaling_factor) / float(total_reads)
            window = HTSeq.GenomicInterval(chrom, window_start, window_start + window_size)
            normalized_window_array[window] = normalized_count

    return window_counts_dict, normalized_window_array
예제 #13
0
def reads_profile(regions, bam_file, size):
    """
    Parses reads from BAM file and adds number of forward and reverse
    5' coverage counts per position of each region.
    This function depends on the HTSeq package for parsing of read
    information from BAM files.

    Parameters
    ----------
    regions : list of dict
        Each dict provides "chr", "center", "ext_start", "ext_end" and
        "strand". The keys "up_counts" and "down_counts" are added in place.
    bam_file : str
        Path handed to HTSeq.BAM_Reader.
    size : int
        Number of positions in each count vector.

    For a read whose ``start_d`` lies at distance ``dist`` from the region
    center (sign flipped for regions on '-'), position ``dist + size // 2``
    is incremented in ``up_counts`` when the read is on the region's strand
    and in ``down_counts`` when it is on the opposite strand. Regions whose
    chromosome is missing from the BAM header, or whose strand is neither
    '+' nor '-', keep all-zero vectors.

    Returns the same ``regions`` list.
    """

    print("INFO: Begin to parse reads from BAM file for n={0} regions.".format(
        len(regions)))

    # Open BAM file:
    bamHandle = HTSeq.BAM_Reader(bam_file)
    # get the set of available chromosomes
    chromosomes = set(sq['SN'] for sq in bamHandle.get_header_dict()['SQ'])

    # Floor division keeps the offset an integer so it can index a list.
    half = size // 2

    for reg in regions:

        center = reg["center"]
        strand = reg["strand"]

        # initialize read-counts for all positions of this region
        up_counts = size * [0]
        down_counts = size * [0]

        # check if chr of region is available in BAM file:
        if reg["chr"] in chromosomes:

            # extend by +-1 to include reads on the negative strand inside
            # the interval
            iv = HTSeq.GenomicInterval(reg["chr"], max(0,
                                                       reg["ext_start"] - 1),
                                       reg["ext_end"] + 1, strand)

            # +1 for regions on '+', -1 for regions on '-', None otherwise
            sign = {'+': 1, '-': -1}.get(strand)

            if sign is not None:
                # iterate over all reads mapping to that region (interval)
                for aln in bamHandle[iv]:
                    pos = sign * (aln.iv.start_d - center) + half
                    if pos < 0 or pos >= size:
                        continue
                    if aln.iv.strand == strand:
                        up_counts[pos] += 1
                    elif aln.iv.strand in ('+', '-'):
                        down_counts[pos] += 1

        # add counts to region dictionary:
        reg["up_counts"] = up_counts
        reg["down_counts"] = down_counts

    print("INFO: Finished parsing of BAM file.")

    return regions
예제 #14
0
def getPeakProfiles(sites, bamfile,halfwinwidth=3000):
    """Build a read-coverage profile around every site.

    Parameters
    ----------
    sites : numpy structured array (indexed with a boolean mask) providing
        the fields "chrom", "site" and "strand"
    bamfile : path handed to ht.BAM_Reader
    halfwinwidth : half of the window width; each profile has
        2 * halfwinwidth positions

    Sites are first restricted to chromosomes present in the BAM header
    (purely numeric names are compared with a "chr" prefix). For each
    remaining site every alignment overlapping the window adds 1 to the
    covered profile positions; for sites on "-" the profile is mirrored.
    Alignments for sites with any other strand value are ignored.

    Returns a numpy array with one row per site that has non-zero coverage.
    """
    bam = ht.BAM_Reader(bamfile)
    # make sure the sites and bam files have same naming convention:
    # restrict sites to those that have an entry in the bam file
    # (probably has a bug dealing with X and Y)
    bamChroms = [x["SN"] for x in bam.get_header_dict()["SQ"]]
    bamChroms = ["chr"+c if c.isdigit() else c for c in bamChroms]
    sites = sites[np.in1d(sites["chrom"], bamChroms)]

    peakProfs = []
    for i, pos in enumerate(sites):
        # progress report, throttled to every 1000th site
        if i % 1000 == 0:
            print("%d of %d" %(i, len(sites)))
        peakProfile = np.zeros(2*halfwinwidth)
        site = pos["site"]
        # don't change the site notation here; the window start is clamped
        # so that sites near the chromosome start stay valid
        window = ht.GenomicInterval( str(pos["chrom"]), max(0, site - halfwinwidth), site + halfwinwidth, str(pos["strand"]) )
        # An empty fetch simply runs the loop zero times, so no separate
        # probe of bam[window] is needed.
        for almnt in bam[window]:
            if pos["strand"] == "+":
                a = almnt.iv.start - site + halfwinwidth
                b = almnt.iv.end - site + halfwinwidth
            elif pos["strand"] == "-":
                a = site + halfwinwidth - almnt.iv.end
                b = site + halfwinwidth - almnt.iv.start
            else:
                continue
            # Clamp at 0: a negative bound would be read as "counted from
            # the end" and drop reads that cross the window's left edge.
            peakProfile[max(a, 0):max(b, 0)] += 1
        if (np.sum(peakProfile) > 0):
            peakProfs.append(peakProfile)
    return np.array(peakProfs)
예제 #15
0
def bedTools_interval_to_genomic_interval(bedtool):
    """
    Given a pybedtools.BedTool object, returns dictionary of HTSeq.GenomicInterval objects.

    The result is an OrderedDict keyed by "chrom:start-end" of each
    interval, in iteration order; a later interval with the same
    coordinates replaces the earlier one. Strand "+", 0 or "0" becomes "+";
    strand "-", 1 or "1" becomes "-"; anything else yields an interval
    built without a strand argument.
    """
    intervals = OrderedDict()
    for iv in bedtool:
        name = "{}:{}-{}".format(iv.chrom, iv.start, iv.end)
        if iv.strand in ("+", 0, "0"):
            strand = "+"
        # The minus branch must test for 1, not 0: 0 is already consumed by
        # the plus branch above.
        elif iv.strand in ("-", 1, "1"):
            strand = "-"
        else:
            strand = None

        if strand is None:
            intervals[name] = HTSeq.GenomicInterval(iv.chrom, iv.start, iv.end)
        else:
            intervals[name] = HTSeq.GenomicInterval(iv.chrom, iv.start, iv.end,
                                                    strand)
    return intervals
예제 #16
0
    def HTSeq(self, bamlist):
        """Write normalized per-base coverage of one fixed locus for all samples.

        Every aligned read of every BAM file in ``bamlist`` is reduced to an
        interval of length 1 and added to a stranded coverage array. After
        each file, the coverage on the fixed window (chr1:135253574-135258472,
        "-", labelled Elf3) is divided by ``p.H3K27ac_bam[sample]``; the
        sample name is the part of the file's base name before the first "_".
        The five expected sample columns are then written as a tab-separated
        table to a fixed output path.
        """
        # Elf3 window; the Axin2 alternative would be
        # chr11:108914532-108954079 on "+".
        window = HTSeq.GenomicInterval("chr1", 135253574, 135258472, "-")
        coverage = HTSeq.GenomicArray("auto", stranded=True, typecode="i")
        profiles = []
        samplelist = []
        for bam_path in bamlist:
            sample = os.path.basename(bam_path).split("_")[0]
            samplelist.append(sample)
            for almnt in HTSeq.BAM_Reader(bam_path):
                if not almnt.aligned:
                    continue
                almnt.iv.length = 1
                coverage[almnt.iv] += 1

            # NOTE(review): `coverage` is created once, outside this loop,
            # so each profile also contains the reads of the files processed
            # before it - confirm this accumulation is intended.
            profiles.append(
                np.fromiter(coverage[window], dtype=float) /
                p.H3K27ac_bam[sample])

        df = pd.DataFrame(np.array(profiles).T)
        df.columns = samplelist
        ordered_samples = [
            "ctrl-H3K27ac", "2weeks-H3K27ac", "4weeks-H3K27ac",
            "7weeks-H3K27ac", "10weeks-H3K27ac"
        ]
        data = df[ordered_samples]
        data.to_csv(
            "/data3/zhaochen/project/colon_cancer/colon_chip/peakUCSCplot/H3K27ac_Elf3.txt",
            sep="\t",
            index=False)
예제 #17
0
def loops2degreesSharp(fin, fout):
    """Write, for every stretch covered by loop anchors, which anchors cover it.

    ``fin`` is a tab-separated file whose first line is a header. Columns
    0-2 and 3-5 are the two anchors (chrom, start, end) of a loop and
    column 6 is the loop id; lines with fewer than seven columns are
    skipped. Each anchor is registered as "<id>-left" / "<id>-right".

    ``fout`` receives one tab-separated line per non-empty step:
    chrom, start, end, number of anchors, comma-joined anchor names
    (sorted, so the output is reproducible between runs).
    """
    model = HTSeq.GenomicArrayOfSets("auto", stranded=0)
    with open(fin) as fi:
        for i, line in enumerate(fi):
            if i == 0:
                continue
            line = line.split("\n")[0].split("\t")
            # Blank or truncated lines cannot describe a loop.
            if len(line) < 7:
                continue
            iva = HTSeq.GenomicInterval(line[0], int(line[1]), int(line[2]))
            ivb = HTSeq.GenomicInterval(line[3], int(line[4]), int(line[5]))
            model[iva] += line[6] + "-left"
            model[ivb] += line[6] + "-right"
    with open(fout, "w") as fo:
        for iv, value in model.steps():
            if not value:
                continue
            line = [iv.chrom, iv.start, iv.end, len(value),
                    ",".join(sorted(value))]
            fo.write("\t".join(map(str, line)) + "\n")
예제 #18
0
    def __init__(self, sample_name, genome, cvg, mis,
                 feature_type, start, end, strand, locus, name, extend_utr):
        '''Constructs Feature object.

        ``start`` is decremented by one and ``end`` is used as given
        (presumably converting 1-based inclusive coordinates to the
        half-open form - confirm against the caller). Features of type
        'five_prime_UTR' are extended by ``extend_utr`` nucleotides at
        their end on '+' and at their start on any other strand, to include
        the first codons.

        ``coord``, ``seq``, ``cvg`` and ``mis`` cover the feature position
        by position; on the '-' strand all four are reversed and ``seq`` is
        reverse-complemented. ``dms`` is a list of None of the same length
        as ``cvg``.
        '''

        self.sample_name = sample_name
        self.genome_id = genome.id
        self.type = feature_type
        self.start = int(start) - 1
        self.end = int(end)
        self.strand = strand
        if self.type == 'five_prime_UTR': #add nucleotides to the 5'UTR to include the first codons
            if self.strand == '+':
                self.end += extend_utr
            else:
                self.start -= extend_utr
        self.locus = locus
        self.name = name
        self.iv = HTSeq.GenomicInterval(self.genome_id, self.start, self.end, self.strand)
        # A real list is required: a Python 3 range object has no reverse().
        self.coord = list(range(self.start, self.end))
        self.seq = genome.seq[self.start : self.end]
        self.cvg = list(cvg[self.iv])
        self.mis = list(mis[self.iv])
        self.dms = [None] * len(self.cvg)

        if strand == '-':
            self.coord.reverse()
            self.seq = self.seq.reverse_complement()
            self.cvg.reverse()
            self.mis.reverse()
예제 #19
0
파일: main.py 프로젝트: soh-i/polyN_finder
def find_sgRNA_in_polyc_regoin(fasta, db):
    '''
    Search polyC regions that can be targeted by spCas9 (PAM is NGG).

    Every sequence of ``fasta`` is scanned for six C followed by 14 bases,
    one more base and GG. Each match is scored with calc_doench_score on
    the match extended by 4 bases upstream and 3 bases downstream. Matches
    whose seed (match without the first 6 and the last 3 bases) passes
    filter_homopolymer are yielded as PolycGuideRnaResult tuples
    (chr, start, end, guide, PAM, score, is_exon), where is_exon is the
    result of find_exon for the match interval on '+' against ``db``.
    '''

    pattern = re.compile(r'C{6}[ATGC]{14}[ATGC][G]{2}')
    GuideHit = collections.namedtuple(
        'PolycGuideRnaResult',
        ['chr', 'start', 'end', 'guide', 'PAM', 'score', 'is_exon'])
    with pysam.FastxFile(fasta) as records:
        for record in records:
            sequence = record.sequence
            for hit in pattern.finditer(sequence):
                start, end = hit.start(), hit.end()
                # NOTE(review): for start < 4 the lower bound is negative
                # and is read as "counted from the end" - confirm that
                # matches at the very start of a sequence are scored as
                # intended.
                score = calc_doench_score(sequence[start - 4:end + 3])
                seed_seq = sequence[start + 6:end - 3]
                sgRNA = sequence[start:end]
                pam = sgRNA[-3:]
                if not filter_homopolymer(seed_seq):
                    continue
                query_iv = HTSeq.GenomicInterval(record.name, start, end, '+')
                yield GuideHit(record.name, start, end, sgRNA, pam, score,
                               find_exon(query_iv, db))
예제 #20
0
def create_peak_gtf(path, exp_design_name, technique, bed_name):
    """
    Convert a tab-separated peak table into a GTF file with one 'exon'
    feature per peak.

    The peak table needs the columns 'chromo_peak', 'begin_peak',
    'end_peak' and 'WindowId'. Each row becomes an unstranded feature named
    after its WindowId, and ``; gene_id "<WindowId>"`` is appended to its
    GFF line.

    :param path: project root; files live below ``<path>/PeakDetection``
    :param exp_design_name: first component of both file names
    :param technique: '' or 'All' selects the ``Peaks`` sub-directory and
        leaves the technique out of the file names; any other value selects
        ``<technique>/`` and is inserted into the file names
    :param bed_name: last component of both file names
    :return: None
    """
    if technique == '' or technique == 'All':
        PATH_PEAKS = path + '/PeakDetection/Peaks'
        peak_filename = PATH_PEAKS + '/' + exp_design_name + '_' + bed_name + '_Peaks.txt'
        gtf_filename = PATH_PEAKS + '/' + exp_design_name + '_' + bed_name + '.gtf'
    else:
        PATH_PEAKS = path + '/PeakDetection/' + technique + '/'
        peak_filename = PATH_PEAKS + '/' + exp_design_name + '_' + technique + '_' + bed_name + '_Peaks.txt'
        gtf_filename = PATH_PEAKS + '/' + exp_design_name + '_' + technique + '_' + bed_name + '.gtf'

    # The input is opened first so that a missing peak table does not leave
    # behind an empty (or truncated) GTF file. Plain 'r' replaces 'rU',
    # which Python 3.11 no longer accepts.
    with open(peak_filename, 'r') as peak_file, \
            open(gtf_filename, 'w') as gtf_file:
        csv_peaks = csv.DictReader(peak_file, delimiter='\t')
        for row in csv_peaks:
            peak = HTSeq.GenomicInterval(row['chromo_peak'],
                                         int(row['begin_peak']),
                                         int(row['end_peak']), ".")
            peak_id = row['WindowId']
            feature = HTSeq.GenomicFeature(peak_id, 'exon', peak)
            gtf_file.write(feature.get_gff_line().strip() + '; gene_id "' +
                           peak_id + '"' + '\n')
예제 #21
0
 def intersectcirc(self, circ_file, modified_gtf_file, strand=True):
     """Map circle start/end intervals to the custom exon ids they overlap.

     ``circ_file`` (the output of print_start_end_file) is intersected with
     ``modified_gtf_file`` through pybedtools, equivalent to
     ``intersectBed -a circ -b gtf -wa -wb -loj`` with ``-s`` added when
     ``strand`` is true. Rows whose column 14 is "." (no overlap) are
     skipped.

     Returns a dict keyed by HTSeq.GenomicInterval (columns 0, 1, 2 and 5 of
     the row) whose values are sets of 'custom_exon_id' attributes parsed
     from column 14.
     """
     circ = pybedtools.BedTool(circ_file)
     gtf = pybedtools.BedTool(modified_gtf_file)

     options = dict(wa=True, wb=True, loj=True)
     if strand:
         options['s'] = True
     options['nonamecheck'] = True
     intersectfile = circ.intersect(gtf, **options)

     # circle start or end interval -> set of custom_exon_id
     circExons = {}
     for lin in intersectfile:
         fields = str(lin).split('\t')
         if fields[14].strip('\n') == '.':
             continue
         circle_iv = HTSeq.GenomicInterval(fields[0], int(fields[1]),
                                           int(fields[2]), fields[5])
         exon_ids = circExons.setdefault(circle_iv, set())
         exon_ids.add(
             HTSeq.parse_GFF_attribute_string(fields[14])['custom_exon_id'])
     return circExons
예제 #22
0
	def __iter__(self):
		"""Yield, chromosome by chromosome, every distinct value stored in self.gas."""
		for chromosome_name, chromosome_obj in self.gas.chrom_vectors.items():
			# Interval from 0 to the end of the chromosome's '.' vector.
			whole_chromosome = HTSeq.GenomicInterval(chromosome_name, 0, chromosome_obj['.'].iv.end)
			step_values = [step[1] for step in self.gas[whole_chromosome].steps()]
			# Union of the value sets of all steps.
			merged = reduce(lambda s1, s2: s1 | s2, step_values)
			for gene in list(merged):
				yield gene
	
	#def show_me(self):
	#	print self.__str__()
예제 #23
0
def quantify(readF, peakF, fnOut):
    """Count reads in peaks and write count, RPKM, TPM and length per peak.

    A coverage model and a total ``t`` are obtained from
    ``buildCovModel(readF)``. ``peakF`` is a tab-separated file with chrom,
    start and end in its first three columns; lines with fewer columns are
    skipped. For each peak the distinct step values of the model inside the
    peak are counted. The table, indexed by "chrom|start|end", is written
    tab-separated to ``fnOut`` with the index labelled "peakId"; TPM is
    rescaled so that the column sums to 10**6.
    """
    print("builidng coverage model for counting")
    covModel, t = buildCovModel(readF)
    ds = {}
    print("counting reads in peaks")
    # Read all lines up front (tqdm then knows the total) and close the file.
    with open(peakF) as fin:
        lines = fin.readlines()
    for line in tqdm(lines):
        line = line.split("\n")[0].split("\t")
        if len(line) < 3:
            continue
        iv = HTSeq.GenomicInterval(line[0], int(line[1]), int(line[2]))
        c = set()
        for ivb, vb in covModel[iv].steps():
            # NOTE(review): if the model stores sets, vb is unhashable and
            # every step is skipped here - confirm whether c.update(vb) was
            # intended.
            try:
                c.add(vb)
            except TypeError:
                continue
        n = len(c)
        ds["|".join(line[:3])] = {
            "count": n,
            "RPKM": n / 1.0 / iv.length / t * 10**9,
            "TPM": n / 1.0 / iv.length * 10**3,
            "length": iv.length
        }
    ds = pd.DataFrame(ds).T
    ds["TPM"] = ds["TPM"] / ds["TPM"].sum() * 10**6
    ds.to_csv(fnOut, sep="\t", index_label="peakId")
예제 #24
0
def binData(tmodel, cmodel, tr, cr, chroms, binsize):
    """
    Split every chromosome into consecutive bins of ``binsize`` and collect
    the treatment and control counts of each bin via getCount.

    tr is read length for treatment
    cr is read length for control
    ``chroms`` maps a chromosome name to a dict with the keys "s" and "e".

    Bins in which both counts sum to zero are left out. Returns two numpy
    arrays (treatment counts, control counts) of equal length.
    """
    treatment_counts = []
    control_counts = []
    for chrom in chroms.keys():
        bounds = chroms[chrom]
        s, e = bounds["s"], bounds["e"]
        bins = (e - s) / binsize
        # NOTE(review): i runs from 1 to bins - 1, so the last full bin is
        # never visited - confirm this is intended.
        for i in xrange(1, bins):
            left = s + binsize * (i - 1)
            right = s + binsize * i
            iv = HTSeq.GenomicInterval(chrom, left, right)
            countT = getCount(tmodel, iv, tr)
            countC = getCount(cmodel, iv, cr)
            if countT + countC != 0:
                treatment_counts.append(countT)
                control_counts.append(countC)
    return np.array(treatment_counts), np.array(control_counts)
예제 #25
0
def countFeatures(f, featuref, paired=True):
    """
    Count the reads of ``f`` that fall into the regions listed in
    ``featuref``.

    The coverage model and the total ``t`` come from
    ``getCov(f, paired=paired)``. ``featuref`` is a tab-separated file with
    chrom, start and end in its first three columns; lines with fewer
    columns are skipped. The first value returned by getCount is summed
    over all regions.

    Returns None when getCov reports 0 reads, otherwise the tuple
    ``(sample name, t, reads in regions, reads in regions / t)`` where the
    sample name is the base name of ``f`` up to ".bedpe".
    """
    t, model = getCov(f, paired=paired)
    if t == 0:
        return None
    logger.info("%s reads from %s" % (t, f))
    logger.info("Caculating enriched reads at target regions of %s" % featuref)
    # Read the region file inside a context manager so the handle is closed.
    with open(featuref) as fin:
        lines = fin.read().split("\n")
    r = 0
    for line in tqdm(lines):
        line = line.split("\t")
        if len(line) < 3:
            continue
        iv = HTSeq.GenomicInterval(line[0], int(line[1]), int(line[2]))
        c, _rpkm = getCount(t, model, iv)
        r += c
    n = f.split("/")[-1].split(".bedpe")[0]
    logger.info("FLAG!\t %s:: total:%s,inPeaks:%s,inPeaksRatio:%s" %
                (n, t, r, r / 1.0 / t))
    return n, t, r, r / 1.0 / t
예제 #26
0
def load_bedfile_to_ga(bed_file):
    """Load per-position scores from a BED file into a stranded HTSeq array.

    Columns used (tab separated): 0 = chromosome, 1 = position, 4 = score,
    5 = strand. The score is stored on the one-base interval
    [position - 1, position). Rows with too few columns, and rows for which
    the assignment raises ValueError, are printed and skipped.

    Returns the filled HTSeq.GenomicArrayOfSets.
    """
    ga = HTSeq.GenomicArrayOfSets("auto", stranded=True)

    with open(bed_file, "r") as fh:
        for line in fh:
            row = line.strip().split("\t")

            # Field layout follows the BED files from DBTSS.
            try:
                chrom, pos, strand, score = (row[0], int(row[1]), row[5],
                                             float(row[4]))
            except IndexError as e:
                print(e)
                print(row)
                continue

            # NOTE(review): the original comment states that BED is 0-based
            # and needs no adjusting, yet the interval starts at pos - 1 -
            # confirm which is intended.
            try:
                ga[HTSeq.GenomicInterval(chrom, pos - 1, pos, strand)] = score
            except ValueError:
                print("Error loading GA:")
                print(row)
                continue

    return ga
예제 #27
0
 def add_raw_reads_to_utr(self, ga, chr_len):
     """Fill ``self.utr_arr`` / ``self.utr_ga`` with the values of ``ga``
     between the CDS right end and the transcript right end.

     When ``txpt_right - cds_right`` is below 2 nothing is read from ``ga``:
     ``utr_arr`` becomes [0], or stays empty if the difference is negative.
     For '-' strand transcripts the right ends are first recomputed as
     ``chr_len[chrom] - left end``. Otherwise every step of ``ga`` inside
     the interval is copied into ``self.utr_ga`` and its value is appended
     to ``self.utr_arr`` once per covered position.
     """
     self.utr_arr = []

     raw_span = self.txpt_right - self.cds_right
     if raw_span < 2:
         # A span of 0 or 1 gives a single zero; a negative span gives
         # nothing.
         if raw_span >= 0:
             self.utr_arr = [0]
         return

     if self.strand == '-':
         # Right ends measured from the other end of the chromosome.
         chrom_len = chr_len[self.chrom]
         txpt_right = chrom_len - self.txpt_left
         cds_right = chrom_len - self.cds_left
     else:
         txpt_right = self.txpt_right
         cds_right = self.cds_right

     iv = HTSeq.GenomicInterval(self.chrom, cds_right, txpt_right,
                                self.strand)
     self.utr_ga = HTSeq.GenomicArray(chroms='auto', stranded=True)

     if txpt_right - cds_right < 2:
         self.utr_arr = [0]
         return

     for _iv, score in ga[iv].steps():
         self.utr_ga[_iv] = score
         # One entry per position covered by this step.
         self.utr_arr.extend([score] * (_iv.end - _iv.start))
def get_gene_features(gtfFile, id_type, feature_type):
    '''
    Get exon features and gene interval features from a GFF/GTF file.

    Only lines whose type equals ``feature_type`` are used; their id is the
    attribute ``id_type``.

    Returns ``(features, geneFeatures)``, two stranded
    HTSeq.GenomicArrayOfSets: ``features`` holds each id on the interval of
    every matching line, ``geneFeatures`` holds each id on a single interval
    from the smallest start to the largest end seen for that id (chromosome
    and strand are taken from the first line of that id). A progress line is
    written to stderr every 100000 GFF lines.
    '''
    features = HTSeq.GenomicArrayOfSets("auto", stranded="yes")
    geneFeatures = HTSeq.GenomicArrayOfSets("auto", stranded="yes")
    geneRange = {}
    gtf = HTSeq.GFF_Reader(gtfFile)
    for i, line in enumerate(gtf, 1):
        if line.type == feature_type:
            feature_id = line.attr[id_type]
            features[line.iv] += feature_id
            span = geneRange.get(feature_id)
            if span is None:
                # Seed the range with the first feature itself; 0 is not
                # used as an "unset" marker because 0 is a valid start.
                geneRange[feature_id] = [line.iv.chrom, line.iv.start,
                                         line.iv.end, line.iv.strand]
            else:
                span[1] = min(span[1], line.iv.start)
                span[2] = max(span[2], line.iv.end)
        if i % 100000 == 0:
            print("%d GFF lines processed.\n" % i, file=sys.stderr)
    for g, v in geneRange.items():
        chrom, start, end, strand = v
        tmp_iv = HTSeq.GenomicInterval(chrom, start, end, strand)
        geneFeatures[tmp_iv] += g
    return features, geneFeatures
예제 #29
0
    def _transform_pe_observed_data(self, aln_stat: AlignStat) -> np.ndarray:
        """Transform a paired-end alignment pair into observed data.

        The fragment spans from the smaller start to the larger end of the
        two mates, on the chromosome of ``self.segment``. For every isoform
        number (integer values in ``self.region``) the overlapped length is
        summed as its inferred insert size. Each isoform's weight is the
        normal density of that insert size (mean and standard deviation
        from ``self.bam_param``) divided by ``isoform length - insert size
        + 1``; isoforms without overlap, or with a non-positive divisor,
        get weight 0. The weights are tiled ``self.ploidy`` times and
        multiplied by the compatible factor of ``aln_stat``.
        """
        cf = self._calc_compatible_factor(aln_stat)

        mate1, mate2 = aln_stat.aln, aln_stat.aln2
        fragment_iv = HTSeq.GenomicInterval(
            self.segment.iv.chrom,
            min(mate1.iv.start, mate2.iv.start),
            max(mate1.iv.end, mate2.iv.end),
            '.')

        # Inferred insert sizes for each isoform.
        inferred_insert_sizes = np.repeat([0], self.isoforms_count)
        for iv, value in self.region[fragment_iv].steps():
            step_length = iv.end - iv.start
            for v in value:
                if isinstance(v, int):
                    inferred_insert_sizes[v] += step_length

        iis = inferred_insert_sizes.astype(float)
        # No overlap at all: -inf makes the density below evaluate to 0.
        iis[iis == 0] = -np.inf

        # Mappable fragments count.
        c = self.isoform_lens.astype(float) - iis + 1
        c[c <= 0] = np.inf
        p = scipy.stats.norm.pdf(iis, self.bam_param.insert_size_mean, self.bam_param.insert_size_std)
        c = 1 / c * p

        return cf * np.tile(c, self.ploidy)
예제 #30
0
    def fetch(self, interval, strand=None):
        """
        Retrieve all reads of the track that intersect a given window.

        Parameters
        ----------
        interval : list, tuple or str
            If interval is a list or tuple, it should contain chromosome
            (str), start (int), end (int). If it is a string, it should be
            of the format chrom:start-end

        strand : str, optional
            Either '+' or '-'; the intersection is then strand-specific.
            By default all reads are returned.

        Yields
        ------
        GenomicInterval
            One HTSeq GenomicInterval per read, built from the chromosome
            of the query and the read's start, end and strand.
        """
        feature = self._interval_bedtool(interval, strand=strand)
        chrom, _start, _end = self._get_interval(interval)

        # Strand-specific intersection only for an explicit '+' or '-'.
        same_strand = strand in ["+", "-"]
        reads = self.track.intersect(feature,
                                     u=True,
                                     stream=True,
                                     s=same_strand)
        for read in reads:
            yield HTSeq.GenomicInterval(chrom, read.start, read.end,
                                        str(read.strand))