コード例 #1
0
def get_gene_features(gtfFile, id_type, feature_type):
    """Collect per-record features and per-id spans from a GTF/GFF file.

    Args:
        gtfFile: Path handed to ``HTSeq.GFF_Reader``.
        id_type: Attribute key (looked up in ``line.attr``) whose value
            identifies the group a record belongs to.
        feature_type: Only records whose ``type`` equals this value are used.

    Returns:
        ``(features, geneFeatures)``, both stranded
        ``HTSeq.GenomicArrayOfSets``.  ``features`` tags every matching
        record's interval with its id.  ``geneFeatures`` holds one interval
        per id running from the smallest start to the largest end seen for
        that id; chromosome and strand come from the first record of the id.

    Raises:
        KeyError: If a matching record lacks the ``id_type`` attribute.
    """
    # "yes" was only ever interpreted as a truthy flag; spell it as a bool.
    features = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    geneFeatures = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    geneRange = {}
    gtf = HTSeq.GFF_Reader(gtfFile)
    for i, line in enumerate(gtf, start=1):
        if line.type == feature_type:
            feature_id = line.attr[id_type]
            features[line.iv] += feature_id
            span = geneRange.get(feature_id)
            if span is None:
                # Seed the span from the first record instead of using 0 as
                # an "unset" marker: 0 is a legal start coordinate and would
                # otherwise be overwritten by a later, larger start.
                geneRange[feature_id] = [line.iv.chrom, line.iv.start,
                                         line.iv.end, line.iv.strand]
            else:
                span[1] = min(span[1], line.iv.start)
                span[2] = max(span[2], line.iv.end)
        if i % 100000 == 0:
            print("%d GFF lines processed.\n" % i, file=sys.stderr)
    for g, (chrom, start, end, strand) in geneRange.items():
        geneFeatures[HTSeq.GenomicInterval(chrom, start, end, strand)] += g
    return features, geneFeatures
コード例 #2
0
 def init_GenomicArrayOfSets_and_Counter_for_quant_IRC(self):
         """Build the interval arrays and zeroed counters for IRC quantification.

         Walks ``self.gtffile`` once.  ``gene_region`` records are indexed for
         every gene; constitutive exonic/intronic regions and constitutive
         junctions are only considered for genes in ``self.valid_genes``.

         Returns:
             ``(genes, gene_region, CER_region, gene_counts, CIR_counts,
             CJ_counts)``.  ``gene_counts`` is returned empty; the CIR/CJ
             counters are pre-seeded with zero for every region/junction
             registered here.
         """
         def counters_by_number():
                 return collections.defaultdict(collections.Counter)

         restricted_types = ("constitutive_exonic_region",
                             "constitutive_intronic_region",
                             "constitutive_junction")
         cir_keys = ("CIR_5'retained_reads",
                     "CIR_3'retained_reads",
                     "CIR_spliced_reads")
         cj_keys = ("CJ_retained_reads", "CJ_spliced_reads")

         genes = HTSeq.GenomicArrayOfSets("auto", stranded=self.stranded)
         gene_region = HTSeq.GenomicArrayOfSets("auto", stranded=self.stranded)
         CER_region = HTSeq.GenomicArrayOfSets("auto", stranded=self.stranded)

         gene_counts = collections.defaultdict(collections.Counter)
         CIR_counts = collections.defaultdict(counters_by_number)
         CJ_counts = collections.defaultdict(counters_by_number)

         for feature in self.gtffile:
                 gene_id = feature.attr["gene_id"]
                 kind = feature.type
                 if kind == "gene_region":
                         # Gene regions are indexed regardless of validity.
                         gene_region[feature.iv] += gene_id
                         continue
                 if kind not in restricted_types or gene_id not in self.valid_genes:
                         continue

                 if kind == "constitutive_exonic_region":
                         CER_number = feature.attr["constitutive_exonic_region_number"]
                         genes[feature.iv] += (gene_id, kind, CER_number)
                         CER_region[feature.iv] += "constitutive_exonic_region"
                 elif kind == "constitutive_intronic_region":
                         CIR_number = feature.attr["constitutive_intronic_region_number"]
                         genes[feature.iv] += (gene_id, kind, CIR_number)
                         # Counters are only created for introns flanked by CERs.
                         if self.CIR_has_both_upstream_and_downstream_CERs(feature):
                                 intron_counter = CIR_counts[gene_id][CIR_number]
                                 for key in cir_keys:
                                         intron_counter[key] = 0
                 else:
                         # Remaining restricted type: constitutive_junction.
                         CJ_number = feature.attr["constitutive_junction_number"]
                         junction_counter = CJ_counts[gene_id][CJ_number]
                         for key in cj_keys:
                                 junction_counter[key] = 0

         return genes, gene_region, CER_region, gene_counts, CIR_counts, CJ_counts
コード例 #3
0
def loops2degreesBroad(fin, fout):
    """Summarise, per covered genomic step, which loop anchors overlap it.

    ``fin`` is a tab-separated file whose columns 0-2 and 3-5 give the
    chrom/start/end of the two anchors of a loop and whose column 6 is used
    as the loop label.  For every step covered by at least one anchor, one
    line is written to ``fout``: chrom, start, end, number of distinct
    anchor labels, and the comma-joined labels (``<label>-left`` /
    ``<label>-right``), sorted so the output is reproducible.

    Does nothing when ``fout`` already exists.
    """
    if os.path.isfile(fout):
        return
    model = HTSeq.GenomicArrayOfSets("auto", stranded=0)
    model2 = HTSeq.GenomicArrayOfSets("auto", stranded=0)
    # Context manager so the input handle is closed even on error.
    with open(fin) as fh:
        for raw in fh:
            line = raw.split("\n")[0].split("\t")
            iva = HTSeq.GenomicInterval(line[0], int(line[1]), int(line[2]))
            ivb = HTSeq.GenomicInterval(line[3], int(line[4]), int(line[5]))
            try:
                model[iva] += 1
                model[ivb] += 1
                model2[iva] += line[6] + "-left"
                model2[ivb] += line[6] + "-right"
            except Exception:
                # Best-effort: report the offending record and carry on, but
                # no longer swallow KeyboardInterrupt/SystemExit.
                print("%s %s" % (fin, line))
    with open(fout, "w") as fo:
        for iv, value in model.steps():
            if not value:
                continue
            ds = set()
            for _, valueb in model2[iv].steps():
                ds.update(valueb)
            ds = sorted(ds)
            line = [iv.chrom, iv.start, iv.end, len(ds), ",".join(ds)]
            fo.write("\t".join(map(str, line)) + "\n")
コード例 #4
0
ファイル: MPEseq.py プロジェクト: mgildea87/4tu_MPEseq
def _load_named_intervals(path):
    """Load ``name, chrom, start, end`` tab-separated rows into an unstranded array."""
    intervals = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    with open(path) as handle:
        for line in handle:
            fields = line.rstrip().split('\t')
            iv = HTSeq.GenomicInterval(fields[1], int(fields[2]), int(fields[3]))
            intervals[iv] += fields[0]
    return intervals


def readTargetFeatures(interval, Branch_windows, Branch_to3ss):
    """Read target introns, their splice sites and branch windows.

    Args:
        interval: Tab-separated file; column 0 is the chromosome, columns 1/2
            the start/end, column 3 the intron name and column 5 the strand.
        Branch_windows: Tab-separated file of ``name, chrom, start, end``.
        Branch_to3ss: Tab-separated file of ``name, chrom, start, end``.

    Returns:
        ``(targets, intron_set, fiveSS, threeSS, Branches, Branchto3ss)``.
        ``targets`` is a stranded ``GenomicArrayOfSets`` of trimmed intron
        intervals, ``intron_set`` the set of intron names, ``fiveSS`` /
        ``threeSS`` dicts keyed by ``(chrom, position, strand)`` whose values
        are the intron name split on ``';'``, and the last two are unstranded
        arrays built from the branch files.
    """
    intron_set = set()
    fiveSS = {}
    threeSS = {}
    targets = HTSeq.GenomicArrayOfSets('auto', stranded=True)
    # Files are opened with a context manager so the handles are closed.
    with open(interval) as handle:
        for line in handle:
            fields = line.rstrip().split('\t')
            chrom, name = fields[0], fields[3]
            start, end = int(fields[1]), int(fields[2])
            # Strands are switched here: MPE-seq reads are on the opposite
            # strand.  Anything other than '+' becomes '+'.
            strand = '-' if fields[5] == '+' else '+'

            # 2 is subtracted and 1 is added such that a read must go at
            # least 3bp into the intron to be called unspliced.
            if strand == '-':
                iv = HTSeq.GenomicInterval(chrom, start - 1, end - 2, strand)
            else:
                iv = HTSeq.GenomicInterval(chrom, start + 1, end, strand)
            targets[iv] += name
            intron_set.add(name)

            # 5'SS and 3'SS are switched as well, because the orientation
            # was switched above.
            name_parts = tuple(name.split(';'))
            if strand == '+':
                five_pos, three_pos = end, start
            else:
                five_pos, three_pos = start, end
            fiveSS[(chrom, five_pos, strand)] = name_parts
            threeSS[(chrom, three_pos, strand)] = name_parts

    Branches = _load_named_intervals(Branch_windows)
    Branchto3ss = _load_named_intervals(Branch_to3ss)

    return targets, intron_set, fiveSS, threeSS, Branches, Branchto3ss
コード例 #5
0
ファイル: m6a_utils.py プロジェクト: becavin-lab/MeRIPSeq
def get_protein_coding_gtf():
    """Copy the protein-coding records of the transcript GTF into a new GTF.

    Reads ``gencodeVM13/gencode.vM13.annotation.transcript.gtf`` under
    ``PATH_ANNOT`` and writes every record whose ``gene_type`` attribute is
    ``"protein_coding"`` to ``gencode.vM9.annotation.protein_coding.gtf``.

    Raises:
        KeyError: If a record has no ``gene_type`` attribute.
    """
    # NOTE(review): the input is the vM13 release but the output file name
    # says vM9 -- confirm which release downstream code expects.
    gtf_file = HTSeq.GFF_Reader(PATH_ANNOT + "gencodeVM13/gencode.vM13.annotation.transcript.gtf", end_included=True)
    # The GenomicArrayOfSets that used to be built here was never used.
    with open(PATH_ANNOT + "gencode.vM9.annotation.protein_coding.gtf", "w") as slidingGTF:
        slidingGTF.writelines(
            feature.get_gff_line()
            for feature in gtf_file
            if feature.attr['gene_type'] == "protein_coding"
        )
コード例 #6
0
def cluster_genes(genes, chrom_list):
    """Merge partially or completely overlapping genes into clusters.

    Every maximal run of consecutive non-empty steps of ``genes`` (on the
    same chromosome and strand) becomes one interval in the result, whose
    value is the union of the gene sets seen along the run.

    Args:
        genes: ``HTSeq.GenomicArrayOfSets`` holding the gene intervals.
        chrom_list: Chromosome specification passed to the new array.

    Returns:
        An unstranded ``HTSeq.GenomicArrayOfSets`` with one entry per cluster.
    """
    genes2 = HTSeq.GenomicArrayOfSets(chrom_list, stranded=False)
    cluster_iv = None  # interval of the cluster being grown, if any
    members = set()

    for iv, gene in genes.steps():
        contiguous = (cluster_iv is not None
                      and iv.chrom == cluster_iv.chrom
                      and iv.strand == cluster_iv.strand)
        if gene and contiguous:
            members |= gene
            cluster_iv.extend_to_include(iv)
            continue

        # A gap, or a switch of chromosome/strand, closes the open cluster.
        # The cluster is written exactly once; previously the "seen a gene"
        # flag was never cleared, so the next empty step (e.g. the leading
        # gap of the following chromosome) overwrote it with an empty set.
        if cluster_iv is not None:
            genes2[cluster_iv] = members
            cluster_iv = None

        if gene:
            # Work on our own interval object rather than the one yielded
            # by steps(), since it is extended in place.
            cluster_iv = HTSeq.GenomicInterval(iv.chrom, iv.start, iv.end,
                                               iv.strand)
            members = set(gene)

    # Flush a cluster that runs up to the very last step.
    if cluster_iv is not None:
        genes2[cluster_iv] = members

    return genes2
コード例 #7
0
def get_overlapping_gene_names(reference, is_stranded):
    '''Split gene names into those that overlap another gene and those that do not.

    reference: object whose ``genes`` mapping goes from gene name to a gene
        with a ``transcripts`` iterable; each transcript has an ``iv``.
    is_stranded: If True, only genes on the same strand count as
        overlapping; if False, genes on opposite strands overlap too.

    Returns: ``(overlapping_genes, non_overlapping_genes)``, two sets of
    gene names.

    Side effect: '-' strand transcripts of ZEB2-AS1 are switched to '+'
    in place (see comment below).
    '''
    all_gene_names = set()
    gene_gas = HTSeq.GenomicArrayOfSets(chroms='auto', stranded=is_stranded)
    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    for gene_name, gene in reference.genes.items():
        all_gene_names.add(gene_name)

        for transcript in gene.transcripts:
            # ZEB2 AS transcript strand was misannotated in iGenomes UCSC hg19.
            if gene_name == 'ZEB2-AS1' and transcript.iv.strand == '-':
                transcript.iv.strand = '+'
            gene_gas[transcript.iv] += gene_name

    # Every non-empty step of gene_gas lies inside some transcript, so one
    # pass over the array finds the same multi-gene steps as re-querying
    # each transcript interval would.
    overlapping_genes = set()
    for _, genes_in_region in gene_gas.steps():
        if len(genes_in_region) > 1:
            overlapping_genes.update(genes_in_region)

    non_overlapping_genes = all_gene_names - overlapping_genes

    return overlapping_genes, non_overlapping_genes
コード例 #8
0
def get_features(gff_filename, stranded, feature_type, id_attribute, quiet):
    """Index the features of one type from a GFF file for read counting.

    Args:
        gff_filename: Path handed to ``HTSeq.GFF_Reader``.
        stranded: Strandedness setting; any value other than ``"no"``
            builds a stranded array and requires strand information.
        feature_type: Only records with this ``type`` are indexed.
        id_attribute: Attribute whose value is used as the feature id.
        quiet: If true, suppress progress messages on stderr.

    Returns:
        ``(features, counts)``: a ``HTSeq.GenomicArrayOfSets`` mapping
        intervals to feature ids, and a dict with a zero count per id.

    Raises:
        ValueError: If a matching record lacks ``id_attribute``, or has no
            strand while a stranded mode was requested.
        Exception: If no record of ``feature_type`` was found.
    """
    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    counts = {}

    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    # Call syntax is valid on Python 2 and 3 alike
                    # ("raise X, (...)" is a syntax error on Python 3).
                    raise ValueError(
                        "Feature %s does not contain a '%s' attribute" %
                        (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'." %
                        (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        # Deliberately broad: only adds the file position, then re-raises.
        sys.stderr.write("Error occured when processing GFF file (%s):\n" % gff.get_line_number_string())
        raise

    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)

    if len(counts) == 0:
        raise Exception("No features of type '%s' found.\n" % feature_type)

    return (features, counts)
コード例 #9
0
ファイル: coverage.py プロジェクト: wwick/coverageCalc
def htseq():
    """Count reads per gene for every time point/replicate and save a matrix.

    Indexes the ``gene`` records of the module-level ``gff`` file by their
    ``ID`` attribute, starts one ``htseq_count`` process per
    (replicate, time point) combination, and writes the counts of the genes
    present in every sample to ``htseq_out`` as CSV (one row per gene, one
    column per sample).

    Relies on the module-level names ``gff``, ``num_tp``, ``num_reps``,
    ``htseq_count`` and ``htseq_out``.  Each worker is expected to put a
    sequence on the queue whose item 0 is a ``{gene: count}`` mapping and
    whose item 1 is the sample's column index.
    """
    gff_file = HTSeq.GFF_Reader(gff, end_included=True)
    genes = HTSeq.GenomicArrayOfSets("auto", stranded=True)

    for gene in gff_file:
        if gene.type == "gene":
            genes[gene.iv] += gene.attr["ID"]

    tasks = []
    out_q = Queue()
    for tp in range(1, num_tp + 1):
        for rep in range(1, num_reps + 1):
            task = Process(target=htseq_count, args=(genes, rep, tp, out_q))
            tasks.append(task)
            task.start()
    # Drain the queue before joining so workers never block on a full pipe.
    results = [out_q.get() for _ in tasks]
    for task in tasks:
        task.join()

    # A real list is needed for item assignment; range() is immutable on
    # Python 3.
    counts = [None] * len(results)
    for result in results:
        counts[result[1]] = result[0]

    # Keep only genes reported by every sample; sort so the row order is
    # reproducible.
    # NOTE(review): the gene ids themselves are not written to the output.
    keys = sorted(set(counts[0]).intersection(*counts))
    matrix = np.zeros(shape=(len(keys), len(counts)), dtype=np.int_)
    for row, key in enumerate(keys):
        for col, sample in enumerate(counts):
            matrix[row, col] = sample[key]
    np.savetxt(htseq_out, matrix, delimiter=",")
コード例 #10
0
def get_gtf(gtf_file=None):
    """Return a stranded interval index of the exons in ``gtf_file``.

    Each ``exon`` record's interval is tagged with its ``gene_id``.
    """
    reader = HTSeq.GFF_Reader(gtf_file)
    exon_index = HTSeq.GenomicArrayOfSets('auto', stranded=True)
    for record in reader:
        if record.type != 'exon':
            continue
        exon_index[record.iv] += record.attr['gene_id']
    return exon_index
コード例 #11
0
def calculateFPKM(bam, gtf, readsNum=1, geneType="tRNA"):
    """Compute a length- and depth-normalised read count per gene.

    Args:
        bam: Path of the alignment file (read with ``HTSeq.BAM_Reader``).
        gtf: Path of the annotation (read with ``HTSeq.GFF_Reader``).
        readsNum: Depth normaliser used as a plain divisor.
            NOTE(review): presumably total reads in millions -- confirm.
        geneType: Only ``gene`` records whose ``gene_biotype`` equals this
            value are quantified.

    Returns:
        Dict mapping gene name to
        ``count * 1000 / (gene_length * readsNum)`` as a float.  A read is
        counted for a gene only when exactly one gene covers every step the
        read overlaps.
    """
    import HTSeq
    annotation = HTSeq.GFF_Reader(gtf, end_included=True)
    alignments = HTSeq.BAM_Reader(bam)
    genes = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    counts = {}
    lengthOfReads = {}  # despite the name: gene length per gene
    for feature in annotation:
        if (feature.type == "gene"
                and "gene_biotype" in feature.attr
                and feature.attr["gene_biotype"] == geneType):
            genes[feature.iv] += feature.name
            counts[feature.name] = 0
            lengthOfReads[feature.name] = feature.iv.end - feature.iv.start
    for aln in alignments:
        if not aln.aligned:
            continue
        iset = None
        for _, step_set in genes[aln.iv].steps():
            if iset is None:
                iset = step_set.copy()
            else:
                iset.intersection_update(step_set)
        # iset stays None if the alignment yields no steps; skip instead of
        # crashing on len(None).
        if iset is not None and len(iset) == 1:
            counts[next(iter(iset))] += 1
    rpkm = {}
    for key, count in counts.items():
        # rpkm: reads per million per kilobase.  The float literal forces
        # true division on Python 2 as well.
        rpkm[key] = count * 1000.0 / (lengthOfReads[key] * readsNum)
    return (rpkm)
コード例 #12
0
def getCov(f, paired=True):
    """Build a coverage model from a gzipped, tab-separated read file.

    Args:
        f: Path of the gzip file.  Columns 0-2 are chrom/start/end of the
            first end; in paired mode columns 3-5 describe the second end.
        paired: If True, pairs on different chromosomes are skipped and the
            fragment spans both ends; otherwise only columns 0-2 are used.

    Returns:
        ``(n_unique, model)`` where ``model`` is an unstranded
        ``HTSeq.GenomicArrayOfSets`` tagging each distinct
        ``(chrom, start, end)`` with the line index of its first occurrence,
        or ``(0, None)`` when the file has no lines.
    """
    logger.info("Building coverage model for %s, paired=%s" % (f, paired))
    model = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    i = None
    uniqs = set()
    # Text mode so lines are str (binary mode yields bytes on Python 3);
    # the context manager closes the handle.
    with gzip.open(f, "rt") as fh:
        for i, line in enumerate(fh):
            if i % 10000 == 0:
                cFlush("%s read from %s" % (i, f))
            line = line.split("\n")[0].split("\t")
            if paired and line[0] != line[3]:
                continue
            if paired:
                s = min(int(line[1]), int(line[4]))
                e = max(int(line[2]), int(line[5]))
            else:
                s = int(line[1])
                e = int(line[2])
            r = (line[0], s, e)
            if r not in uniqs:
                iv = HTSeq.GenomicInterval(line[0], s, e)
                model[iv] += str(i)
                uniqs.add(r)
    if i is None:
        logger.error("ERROR! No read in %s." % f)
        return 0, None
    logger.info("%s read from %s, unique %s" % (i, f, len(uniqs)))
    return len(uniqs), model
コード例 #13
0
def load_bedfile_to_ga(bed_file):
    """Load per-position scores from a BED file into a stranded genomic array.

    For each row, the score (column 4) is stored on the single-base interval
    ``[pos - 1, pos)`` where ``pos`` is column 1, on the strand in column 5.
    Rows with too few columns, or whose interval is rejected with a
    ``ValueError``, are printed and skipped.
    """
    # NOTE(review): the original comment says the DBTSS BED files are
    # 0-based and need no adjusting, yet the interval starts at pos - 1 --
    # confirm the intended coordinate convention.
    score_array = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    with open(bed_file, "r") as handle:
        for raw_line in handle:
            cols = raw_line.strip().split("\t")

            # Column layout follows the BED files from DBTSS.
            try:
                chrom, pos, strand = cols[0], int(cols[1]), cols[5]
                score = float(cols[4])
            except IndexError as err:
                print(err)
                print(cols)
                continue

            try:
                site = HTSeq.GenomicInterval(chrom, pos - 1, pos, strand)
                score_array[site] = score
            except ValueError:
                print("Error loading GA:")
                print(cols)
                continue
    return score_array
コード例 #14
0
ファイル: GeneAnnotation.py プロジェクト: yhoogstrate/fuma
	def __init__(self,name):
		"""Create an empty annotation labelled ``name``.

		``gas`` starts as an empty, unstranded interval index and ``n`` as
		zero.
		"""
		# A gffutils-backed index was tried here previously and left disabled.
		self.name = name
		self.gas = HTSeq.GenomicArrayOfSets("auto", stranded=False)
		self.n = 0
コード例 #15
0
    def parse(self):
        """Index the ``gene`` records of ``self.gtf_file`` by gene name.

        The name is taken from the first attribute present among
        ``gene_name``, ``Name`` and ``gene``, falling back to the record's
        own name.  A leading ``chr`` is removed from chromosome names (the
        record's interval is modified in place).

        Returns:
            An unstranded ``HTSeq.GenomicArrayOfSets``; empty when no GTF
            file is configured.
        """
        idx = HTSeq.GenomicArrayOfSets("auto", stranded=False)
        if not self.gtf_file:  # None (or empty) when no gtf file is provided
            return idx

        log.info("Loading " + self.gtf_file)
        loaded = 0
        for feature in HTSeq.GFF_Reader(self.gtf_file, end_included=True):
            if feature.type != "gene":
                continue

            for key in ('gene_name', 'Name', 'gene'):
                if key in feature.attr:
                    name = feature.attr[key]
                    break
            else:
                name = feature.name

            if feature.iv.chrom.startswith('chr'):
                feature.iv.chrom = feature.iv.chrom[3:]

            idx[feature.iv] += name
            loaded += 1

        log.info("Loaded " + str(loaded) + " features")
        return idx
コード例 #16
0
def readGTF(gtfFile):
    """Collect CDS intervals and outermost start/stop codon sites per gene.

    Args:
        gtfFile: Path handed to ``HTSeq.GFF_Reader``; every record must
            carry a ``gene_id`` attribute.

    Returns:
        ``(start_codon_sites, stop_codon_sites, CDS_features)``.  The two
        dicts map gene id to one coordinate: for ``+`` strand genes the
        smallest start-codon start and the largest stop-codon end; for other
        strands the largest start-codon ``start_d`` and the smallest
        stop-codon ``end_d``.  ``CDS_features`` is an unstranded
        ``HTSeq.GenomicArrayOfSets`` tagging CDS intervals with gene ids.
    """
    gtf = HTSeq.GFF_Reader(gtfFile)
    start_codon_sites = {}
    stop_codon_sites = {}
    # The flag is a boolean: the string "no" is truthy and would build a
    # stranded array, the opposite of what it says.
    CDS_features = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    for i, f in enumerate(gtf, start=1):
        if i % 10000 == 0:
            sys.stderr.write("%d GFF lines processed.\r" % i)
        gname = f.attr['gene_id']
        if f.type == "CDS":
            CDS_features[f.iv] += gname
        elif f.type == "start_codon":
            if gname not in start_codon_sites:
                start_codon_sites[gname] = f.iv.start_d
            elif f.iv.strand == "+":
                start_codon_sites[gname] = min(f.iv.start, start_codon_sites[gname])
            else:
                start_codon_sites[gname] = max(f.iv.start_d, start_codon_sites[gname])
        elif f.type == "stop_codon":
            if gname not in stop_codon_sites:
                stop_codon_sites[gname] = f.iv.end_d
            elif f.iv.strand == "+":
                stop_codon_sites[gname] = max(f.iv.end, stop_codon_sites[gname])
            else:
                stop_codon_sites[gname] = min(f.iv.end_d, stop_codon_sites[gname])
    return start_codon_sites, stop_codon_sites, CDS_features
コード例 #17
0
def main(arglist):
    """Drop probes whose k-mers hit an rRNA target region on the plus strand.

    Inputs and outputs come from the ``snakemake`` object: for each pair of
    k-mer / target homology CSVs, the target hits define rRNA regions, and
    ``overlapper`` (``mode='discard'``) returns the k-mer hits whose
    ``qseqid`` values are treated as bad.  Probes from ``probe_csv`` whose
    index is not among them are written to ``filtered_probe_csv``.

    Args:
        arglist: Unused; kept for the caller's signature.
    """
    probe_csv = snakemake.input['probe_csv']
    kmer_homology_files = snakemake.input['kmer_homology_files']
    target_homology_files = snakemake.input['target_homology_files']
    filtered_probe_csv = snakemake.output['filtered_probe_csv']

    # Accumulate into one set so that an empty file list simply means
    # "no bad k-mers" (set.union(*[]) raises TypeError).
    bad_kmers = set()
    for i, kmer_file in enumerate(kmer_homology_files):
        target_blast_df = pd.read_csv(target_homology_files[i])
        kmer_blast_df = pd.read_csv(kmer_file)
        regions = [
            create_genomic_interval(x, y, z) for x, y, z in zip(
                target_blast_df['sstart'], target_blast_df['send'],
                target_blast_df['sseqid'])
        ]
        rRNA_genes = HTSeq.GenomicArrayOfSets("auto", stranded=True)
        for r in regions:
            # only count if on the same strand as the rRNA:
            if r.strand == '+':
                rRNA_genes[r] += 'rRNA'

        filt_df = overlapper(kmer_blast_df,
                             rRNA_genes,
                             discard_minus_strand=True,
                             mode='discard')
        bad_kmers.update(filt_df['qseqid'].tolist())

    df = pd.read_csv(probe_csv, index_col='unique_id')
    df['passed_homology_screen'] = ~df.index.isin(bad_kmers)
    df[df['passed_homology_screen']].to_csv(filtered_probe_csv)
コード例 #18
0
def cook_anno_model(gff_fpath,
                    feature_atrr='gene_id',
                    feature_type='exon',
                    gene_types=(),
                    stranded=True,
                    dumpto=None,
                    verbose=False):
    '''
    Prepare a feature model.

    Args:
        gff_fpath: Path handed to HTSeq.GFF_Reader.
        feature_atrr: Attribute whose (stripped) value identifies a feature.
        feature_type: Only records of this type are used.
        gene_types: Biotypes to export; only applied when feature_atrr
            starts with 'gene'. Empty means export everything.
        stranded: Passed to HTSeq.GenomicArrayOfSets.
        dumpto: If given, the result is pickled to this path.
        verbose: If true, log progress every 100,000 lines.

    Output: (features, exported_genes) where:
        - features: HTSeq.GenomicArrayOfSets()
        - exported_genes: a sorted tuple (empty when nothing was exported)

    For example, feature_atrr = 'gene_name', feature_type = 'exon',
    gene_types = ('protein_coding', 'lincRNA'):
        - features: all exons ~ all gnames mapping and ready for counting
        - exported_genes: only protein_coding and lincRNA gnames are visible
    Quantification used the full genes but only the selected genes are reported.
    '''
    features = HTSeq.GenomicArrayOfSets("auto", stranded=stranded)
    fh_gff = HTSeq.GFF_Reader(gff_fpath)
    exported_genes = set()
    # Neither condition depends on the record, so decide once up front.
    export_everything = (not feature_atrr.startswith('gene')
                         or not gene_types)
    i = 0
    for gff in fh_gff:
        if verbose and i % 100000 == 0:
            print_logger('Processing {:,} lines of GFF...'.format(i))
        i += 1

        if gff.type != feature_type:
            continue

        feature_id = gff.attr[feature_atrr].strip()
        features[gff.iv] += feature_id

        if export_everything or gff.attr.get('gene_biotype', None) in gene_types:
            exported_genes.add(feature_id)

    print_logger('Processed {:,} lines of GFF...'.format(i))

    # Always hand back the same type: previously an empty result stayed a
    # set while a non-empty one became a tuple.
    exported_genes = tuple(sorted(exported_genes))
    if dumpto:
        with open(dumpto, 'wb') as fh:
            pickle.dump((features, exported_genes), fh)
    return ((features, exported_genes))
コード例 #19
0
def Get_label_information(label, annot, bam_reader):
    """Count reads per annotated feature of one label and record coverage.

    Args:
        label: Key into ``annot``.
        annot: Mapping from label to rows of ``(feature, rank, chrom, start,
            end, strand, length, exon_rank_left, exon_rank_right)``.
        bam_reader: Reader offering ``fetch()``; reads are fetched from the
            label's region padded by 500 on both sides.

    Returns:
        ``(gas, ga, gene_count)``: the unstranded feature index, an integer
        coverage array (+1 per aligned segment), and a dict mapping
        ``(feature, rank)`` to its read count.  A read overlapping any
        ``'intron'`` feature is counted for those introns only; otherwise it
        is counted for every feature it overlaps.

    Relies on the module-level names ``minaqual``, ``cigar_char`` and
    ``invert_strand``.  Silences warnings process-wide via
    ``warnings.simplefilter("ignore")``.
    """
    warnings.simplefilter("ignore")
    gas = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    ga = HTSeq.GenomicArray("auto", stranded=False, typecode="i")
    gene_count = {}
    for feature, rank, chrom, start, end, strand, length, exon_rank_left, exon_rank_right in annot[
            label]:
        iv = HTSeq.GenomicInterval(chrom, start, end, strand)
        gas[iv] += (feature, rank)
        gene_count[(feature, rank)] = 0
    boundary_left = min([i[3] for i in annot[label]])
    boundary_right = max([i[4] for i in annot[label]])
    region_fetch = annot[label][0][2] + ":" + str(
        int(boundary_left) - 500) + "-" + str(int(boundary_right) + 500)
    read_seq = bam_reader.fetch(region=region_fetch)
    # Peek at the first read of the file to learn whether it is paired-end.
    # A file without reads has nothing to count, so return the zeroed
    # results instead of letting StopIteration escape.
    one_read = next(iter(bam_reader.fetch()), None)
    if one_read is None:
        return gas, ga, gene_count
    pe_mode = one_read.paired_end
    if pe_mode:
        read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
    for a in read_seq:
        if not pe_mode:
            # NOTE(review): unlike the paired branch, single-end reads are
            # not filtered on minaqual and only "M" operations are used --
            # confirm this asymmetry is intended.
            if not a.aligned:
                continue
            if a.optional_field('NH') > 1:
                continue
            iv_seq = (cigop.ref_iv for cigop in a.cigar
                      if cigop.type == "M" and cigop.size > 0)
        else:
            if ((a[0] and a[0].aQual < minaqual)
                    or (a[1] and a[1].aQual < minaqual)):
                continue
            if ((a[0] and a[0].optional_field('NH') > 1)
                    or (a[1] and a[1].optional_field('NH') > 1)):
                continue
            if a[0] is not None and a[0].aligned:
                iv_seq = (cigop.ref_iv for cigop in a[0].cigar
                          if cigop.type in cigar_char and cigop.size > 0)
            else:
                iv_seq = tuple()
            if a[1] is not None and a[1].aligned:
                iv_seq = itertools.chain(
                    iv_seq, (invert_strand(cigop.ref_iv)
                             for cigop in a[1].cigar
                             if cigop.type in cigar_char and cigop.size > 0))
        feature_aligned = set()
        for iv in iv_seq:
            for iv2, val2 in gas[iv].steps():
                feature_aligned |= val2
            # Coverage: one increment per aligned segment.  This used to sit
            # inside the loop above and added once per annotation step the
            # segment crossed, inflating coverage at feature boundaries.
            ga[iv] += 1
        if len(feature_aligned) == 0:
            continue
        for f in [item for item in feature_aligned if item[0] == 'intron']:
            gene_count[f] += 1
        if 'intron' not in [x for x, y in feature_aligned]:
            for f in feature_aligned:
                gene_count[f] += 1
    return gas, ga, gene_count
コード例 #20
0
def _get_features_file(GTF_filename, stranded_info=False):
    """Map every exon interval of a GTF file to its ``gene_id``.

    ``stranded_info`` is forwarded as the ``stranded`` flag of the returned
    ``HTSeq.GenomicArrayOfSets``.
    """
    exons_by_gene = HTSeq.GenomicArrayOfSets('auto', stranded=stranded_info)
    exon_records = (record for record in HTSeq.GFF_Reader(GTF_filename)
                    if record.type == 'exon')
    for record in exon_records:
        exons_by_gene[record.iv] += record.attr['gene_id']
    return exons_by_gene
コード例 #21
0
    def readGenomeAnnotation(self, args):
        """Load the gene records of ``args.gff`` into ``self.features``.

        Stores the GFF reader on ``self.genomeAnnotation`` and a stranded
        interval index on ``self.features``; each ``gene`` record is tagged
        with the id returned by ``self.getFeatureID``.
        """
        reader = HTSeq.GFF_Reader(args.gff, end_included=True)
        self.genomeAnnotation = reader

        self.features = HTSeq.GenomicArrayOfSets("auto", stranded=True)

        gene_records = (record for record in reader if record.type == "gene")
        for record in gene_records:
            self.features[record.iv] += self.getFeatureID(record)
コード例 #22
0
ファイル: htseq_util.py プロジェクト: shouldsee/htseq_ext
def read_gff(
    gff_filename,
    feature_type=['CDS'],
    id_attribute='Parent',
    additional_attributes=[],
    quiet=0,
    head=-1,
    stranded='yes',
):
    '''Adapter from HTSeq-count.

    Args:
        gff_filename: Path handed to HTSeq.GFF_Reader.
        feature_type: A record type, or a collection of types, to index.
        id_attribute: Attribute whose value is used as the feature id.
        additional_attributes: Accepted for signature compatibility; the
            values collected from it were never returned, so it is unused.
        quiet: If true, suppress progress messages on stderr.
        head: If >= 0, stop once this many distinct ids have been seen.
        stranded: Any value other than "no" builds a stranded array and
            requires strand information on every indexed record.

    Returns:
        (features, ids): a HTSeq.GenomicArrayOfSets mapping intervals to
        feature ids, and an OrderedDict mapping each id to the list of its
        intervals in file order.

    Raises:
        ValueError: If a matching record lacks id_attribute, or has no
            strand while a stranded mode was requested.
    '''
    # The default lists are only read here, never mutated.
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3
    if isinstance(feature_type, string_types):
        feature_type = [feature_type]

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    ids = collections.OrderedDict()
    try:
        for f in gff:
            if f.type in feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError(
                        "Feature %s does not contain a '%s' attribute" %
                        (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'."
                        % (f.name, f.iv))
                features[f.iv] += feature_id
                ids.setdefault(feature_id, []).append(f.iv)
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
                sys.stderr.flush()
            if head >= 0 and len(ids) == head:
                break
    except:
        # Deliberately broad: only adds the file position, then re-raises.
        sys.stderr.write("Error occured when processing GFF file (%s):\n" %
                         gff.get_line_number_string())
        raise
    return features, ids
コード例 #23
0
def getGeneModel():
    """Build an unstranded interval index of the genes from ``getGenes()``.

    Returns:
        ``(model, genes)``: the ``HTSeq.GenomicArrayOfSets`` tagging each
        gene's interval with its key, and the gene mapping itself.
    """
    genes = getGenes()
    model = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    for gene_key, info in genes.items():
        gene_iv = HTSeq.GenomicInterval(info["chr"],
                                        info["start"],
                                        info["end"],
                                        strand=info["strand"])
        model[gene_iv] += gene_key
    return model, genes
コード例 #24
0
 def __init__(self, gtf_file, attribute_name):
     """Index the exons of ``gtf_file`` by the value of ``attribute_name``.

     ``self.features`` maps exon intervals (unstranded) to that value, and
     ``self.loci`` maps each value to its exon records in file order.
     """
     lg.debug('Using HTSeq for annotation.')
     self.loci = OrderedDict()
     self.features = HTSeq.GenomicArrayOfSets("auto", stranded=False)
     exon_records = (record
                     for record in HTSeq.GFF_Reader(gtf_file, end_included=True)
                     if record.type == 'exon')
     for record in exon_records:
         locus = record.attr[attribute_name]
         self.features[record.iv] += locus
         self.loci.setdefault(locus, list()).append(record)
コード例 #25
0
def buildCovModel(readF):
    """Build a coverage model from a gzipped BED file.

    Args:
        readF: Path of a ``bed.gz`` file; columns 0-2 are chrom/start/end.

    Returns:
        ``(model, i)`` where ``model`` is an unstranded
        ``HTSeq.GenomicArrayOfSets`` and ``i`` is the 0-based index of the
        last line read (``-1`` for an empty file).
    """
    model = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    # -1 keeps "index of the last line" meaningful for an empty file, where
    # the loop variable would otherwise be unbound at the return.
    i = -1
    with gzip.open(readF, 'rt') as fh:
        for i, line in tqdm(enumerate(fh)):
            line = line.split("\n")[0].split("\t")
            iv = HTSeq.GenomicInterval(line[0], int(line[1]), int(line[2]))
            # NOTE(review): plain assignment stores the line index and
            # overwrites whatever overlapping reads stored earlier; confirm
            # that "+=" (adding to the set) was not intended.
            model[iv] = i
    return model, i
コード例 #26
0
ファイル: annotation_cmd.py プロジェクト: zhouhaozeng/IRTools
def find_gene_region_length(gene_region, transcript_region):
    """Sum, per gene, the part of its gene region covered by a transcript.

    Args:
        gene_region: Mapping from gene id to the gene's interval.
        transcript_region: Mapping from gene id to a mapping of transcript
            id to transcript interval.

    Returns:
        ``collections.Counter`` mapping gene id to the total length of the
        steps inside its gene region that at least one transcript covers.
    """
    gene_region_length = collections.Counter()
    for gene_id, by_transcript in transcript_region.items():
        covered = HTSeq.GenomicArrayOfSets("auto", stranded=True)
        for transcript_iv in by_transcript.values():
            covered[transcript_iv] += gene_id
        for step_iv, owners in covered[gene_region[gene_id]].steps():
            if owners:
                gene_region_length[gene_id] += step_iv.length
    return gene_region_length
コード例 #27
0
def extract_GTF_features(file_path,
                         feature_type='exon',
                         attribute_label='gene_id'):
    """Index the records of one type from a GTF file by an attribute value.

    Returns a stranded ``HTSeq.GenomicArrayOfSets`` in which every record of
    ``feature_type`` tags its interval with its ``attribute_label`` value.
    """
    reader = HTSeq.GFF_Reader(file_path)
    index = HTSeq.GenomicArrayOfSets("auto", stranded=True)

    for record in reader:
        if record.type != feature_type:
            continue
        index[record.iv] += record.attr[attribute_label]
    return index
コード例 #28
0
def get_gene_model(gtf=None, shift=[2000, 2000]):
    """Build an unstranded index of the shifted TSS/TES regions of ``gtf``.

    The regions come from ``get_TSS_TES`` followed by ``shift_TSS_TES``;
    ``shift`` is passed through to the latter untouched.  Intervals whose
    start exceeds their end are flipped in place before being indexed.
    """
    bgmodel = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    regions = shift_TSS_TES(get_TSS_TES(gtf), shift=shift)
    for key, iv in regions.items():
        if iv.start > iv.end:
            # The model is unstranded, so only the coordinate order matters.
            low, high = iv.end, iv.start
            iv.start = low
            iv.end = high
        bgmodel[iv] += key
    return bgmodel
コード例 #29
0
ファイル: call_peak.py プロジェクト: noahpieta/clipper
def read_array(reads, start, stop):
    """Index the matched segments of the reads lying inside a window.

    Args:
        reads: Iterable of alignments (with ``aligned``, ``iv``, ``cigar``).
        start: Lower bound; a read is kept when ``iv.start_d > start``.
        stop: Upper bound; a read is kept when ``iv.end_d < stop``.

    Returns:
        A stranded ``HTSeq.GenomicArrayOfSets`` in which every ``"M"`` CIGAR
        segment of a kept read is tagged with the read itself.
    """
    set_of_reads = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    for read in reads:
        # Check alignment first: the window test dereferences read.iv, which
        # is only meaningful for aligned reads.
        if not read.aligned:
            continue
        if not (read.iv.start_d > start and read.iv.end_d < stop):
            continue
        for cigop in read.cigar:
            if cigop.type == "M":
                set_of_reads[cigop.ref_iv] += read
    return set_of_reads
コード例 #30
0
def load_exons_from_gtf(gtf_fn, stranded=True):
    """Index the exons of a GTF file under ``chr``-prefixed chromosome names.

    Only exons on chromosomes listed in ``CHROMS`` are kept.  ``MT`` is
    renamed to ``chrM``; every other chromosome gets a ``chr`` prefix (the
    record's interval is modified in place).  Intervals are tagged with the
    record's name.
    """
    reader = HTSeq.GFF_Reader(gtf_fn, end_included=True)
    exons = HTSeq.GenomicArrayOfSets("auto", stranded=stranded)
    for feat in reader:
        if feat.type != 'exon' or feat.iv.chrom not in CHROMS:
            continue
        chrom = feat.iv.chrom
        feat.iv.chrom = 'chrM' if chrom == 'MT' else 'chr' + chrom
        exons[feat.iv] += feat.name
    return exons
    return exons