def get_gene_features(gtfFile, id_type, feature_type):
    """Index features of one type and the overall span of each feature id.

    Args:
        gtfFile: path handed to ``HTSeq.GFF_Reader``.
        id_type: attribute name whose value identifies a feature
            (read from ``line.attr``).
        feature_type: only records whose ``type`` equals this are indexed.

    Returns:
        ``(features, geneFeatures)``: two ``HTSeq.GenomicArrayOfSets``.
        ``features`` maps every matching record interval to its id;
        ``geneFeatures`` maps, for every id, the interval running from the
        smallest start to the largest end seen for that id.
    """
    # NOTE(review): "yes" is a non-empty string, hence truthy; kept verbatim.
    features = HTSeq.GenomicArrayOfSets("auto", stranded="yes")
    geneFeatures = HTSeq.GenomicArrayOfSets("auto", stranded="yes")
    # feature id -> [chrom, min start, max end, strand]
    geneRange = {}
    gtf = HTSeq.GFF_Reader(gtfFile)
    i = 0
    for line in gtf:
        if line.type == feature_type:
            feature_id = line.attr[id_type]
            features[line.iv] += feature_id
            span = geneRange.get(feature_id)
            if span is None:
                # Seed the span with the first interval itself. The previous
                # code used 0 as an "unset" marker for the start, so a feature
                # that really started at 0 was later replaced by a larger
                # start.
                geneRange[feature_id] = [line.iv.chrom, line.iv.start,
                                         line.iv.end, line.iv.strand]
            else:
                span[1] = min(span[1], line.iv.start)
                span[2] = max(span[2], line.iv.end)
        # Progress counts every record read, not only the matching ones.
        i += 1
        if i % 100000 == 0:
            print("%d GFF lines processed.\n" % i, file=sys.stderr)
    for g, v in geneRange.items():
        chrom, start, end, strand = v
        tmp_iv = HTSeq.GenomicInterval(chrom, start, end, strand)
        geneFeatures[tmp_iv] += g
    return features, geneFeatures
def init_GenomicArrayOfSets_and_Counter_for_quant_IRC(self):
    """Create the interval indexes and zeroed counters used for quantification.

    Iterates ``self.gtffile`` once. ``gene_region`` records are indexed for
    every gene; the constitutive exonic / intronic / junction records are only
    considered for genes found in ``self.valid_genes``.

    Returns:
        ``(genes, gene_region, CER_region, gene_counts, CIR_counts,
        CJ_counts)`` -- three ``HTSeq.GenomicArrayOfSets`` followed by three
        (nested) default dictionaries of ``collections.Counter``.
    """
    def _new_array():
        return HTSeq.GenomicArrayOfSets("auto", stranded=self.stranded)

    def _counter_by_number():
        return collections.defaultdict(collections.Counter)

    genes = _new_array()
    gene_region = _new_array()
    CER_region = _new_array()

    gene_counts = collections.defaultdict(collections.Counter)
    CIR_counts = collections.defaultdict(_counter_by_number)
    CJ_counts = collections.defaultdict(_counter_by_number)

    for feature in self.gtffile:
        gene_id = feature.attr["gene_id"]
        kind = feature.type

        if kind == "gene_region":
            gene_region[feature.iv] += gene_id
            continue

        # Every remaining record type is restricted to the valid genes.
        if gene_id not in self.valid_genes:
            continue

        if kind == "constitutive_exonic_region":
            number = feature.attr["constitutive_exonic_region_number"]
            genes[feature.iv] += (gene_id, kind, number)
            CER_region[feature.iv] += "constitutive_exonic_region"
        elif kind == "constitutive_intronic_region":
            number = feature.attr["constitutive_intronic_region_number"]
            genes[feature.iv] += (gene_id, kind, number)
            # Counters are only pre-seeded for introns flanked on both sides.
            if self.CIR_has_both_upstream_and_downstream_CERs(feature):
                per_intron = CIR_counts[gene_id][number]
                per_intron["CIR_5'retained_reads"] = 0
                per_intron["CIR_3'retained_reads"] = 0
                per_intron["CIR_spliced_reads"] = 0
        elif kind == "constitutive_junction":
            number = feature.attr["constitutive_junction_number"]
            per_junction = CJ_counts[gene_id][number]
            per_junction["CJ_retained_reads"] = 0
            per_junction["CJ_spliced_reads"] = 0

    return genes, gene_region, CER_region, gene_counts, CIR_counts, CJ_counts
def loops2degreesBroad(fin, fout):
    """Write, for every covered genomic step, the loop anchors overlapping it.

    ``fin`` is read as tab-separated text with at least seven columns:
    chrom/start/end of the left anchor (columns 0-2), chrom/start/end of the
    right anchor (columns 3-5) and a loop identifier (column 6).

    ``fout`` receives one tab-separated line per non-empty step:
    chrom, start, end, number of distinct anchor labels, comma-joined labels.
    Nothing is done when ``fout`` already exists.
    """
    if os.path.isfile(fout):
        return
    model = HTSeq.GenomicArrayOfSets("auto", stranded=0)
    model2 = HTSeq.GenomicArrayOfSets("auto", stranded=0)
    # The input used to be opened without ever being closed.
    with open(fin) as fi:
        for raw in fi:
            line = raw.split("\n")[0].split("\t")
            iva = HTSeq.GenomicInterval(line[0], int(line[1]), int(line[2]))
            ivb = HTSeq.GenomicInterval(line[3], int(line[4]), int(line[5]))
            try:
                model[iva] += 1
                model[ivb] += 1
                model2[iva] += line[6] + "-left"
                model2[ivb] += line[6] + "-right"
            except Exception:
                # Best effort, as before: report the offending record and go
                # on. `except Exception` no longer swallows KeyboardInterrupt,
                # and the report is a print() call valid on Python 2 and 3.
                print("%s %s" % (fin, line))
    with open(fout, "w") as fo:
        for iv, value in model.steps():
            if not value:
                continue
            ds = set()
            for _, valueb in model2[iv].steps():
                ds.update(valueb)
            ds = list(ds)
            record = [iv.chrom, iv.start, iv.end, len(ds), ",".join(ds)]
            fo.write("\t".join(map(str, record)) + "\n")
def _read_unstranded_windows(path):
    """Load ``name<TAB>chrom<TAB>start<TAB>end`` rows into an unstranded array."""
    windows = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    with open(path) as handle:
        for line in handle:
            fields = line.rstrip().split('\t')
            iv = HTSeq.GenomicInterval(fields[1], int(fields[2]), int(fields[3]))
            windows[iv] += fields[0]
    return windows


def readTargetFeatures(interval, Branch_windows, Branch_to3ss):
    """Load target introns, their splice sites and two branch-window tables.

    Args:
        interval: tab-separated file; columns 0-3 are chrom, start, end and
            intron name, column 5 is the strand.
        Branch_windows, Branch_to3ss: tab-separated files with name, chrom,
            start, end in columns 0-3.

    Returns:
        ``(targets, intron_set, fiveSS, threeSS, Branches, Branchto3ss)``:
        a stranded ``GenomicArrayOfSets`` of intron names, the set of intron
        names, two dicts keyed by ``(chrom, position, strand)`` whose values
        are the intron name split on ``';'``, and two unstranded
        ``GenomicArrayOfSets`` built from the branch files.
    """
    intron_set = set()
    fiveSS = {}
    threeSS = {}
    targets = HTSeq.GenomicArrayOfSets('auto', stranded=True)
    # The three inputs were previously opened without being closed.
    with open(interval) as handle:
        for line in handle:
            fields = line.rstrip().split('\t')
            chrom, name = fields[0], fields[3]
            start, end = int(fields[1]), int(fields[2])
            # strands are switched here. MPE-seq reads are on the opposite strand
            strand = '-' if fields[5] == '+' else '+'
            # 2 is subtracted and 1 is added such that a read must go at least
            # 3bp into the intron to be called unspliced
            if strand == '-':
                iv = HTSeq.GenomicInterval(chrom, start - 1, end - 2, strand)
            else:
                iv = HTSeq.GenomicInterval(chrom, start + 1, end, strand)
            targets[iv] += name
            intron_set.add(name)
            # 5'SS and 3'SS are switched here because MPE-seq reads are on the
            # opposite strand and the orientation was switched above
            name_parts = tuple(name.split(';'))
            if strand == '+':
                fiveSS[(chrom, end, strand)] = name_parts
                threeSS[(chrom, start, strand)] = name_parts
            else:
                fiveSS[(chrom, start, strand)] = name_parts
                threeSS[(chrom, end, strand)] = name_parts
    Branches = _read_unstranded_windows(Branch_windows)
    Branchto3ss = _read_unstranded_windows(Branch_to3ss)
    return targets, intron_set, fiveSS, threeSS, Branches, Branchto3ss
def get_protein_coding_gtf():
    """Copy the protein-coding records of the transcript GTF into a new GTF.

    Reads ``gencodeVM13/gencode.vM13.annotation.transcript.gtf`` below
    ``PATH_ANNOT`` and writes every record whose ``gene_type`` attribute is
    ``"protein_coding"`` to ``gencode.vM9.annotation.protein_coding.gtf``.

    Raises:
        KeyError: if a record has no ``gene_type`` attribute.
    """
    gtf_file = HTSeq.GFF_Reader(PATH_ANNOT + "gencodeVM13/gencode.vM13.annotation.transcript.gtf", end_included=True)
    # An unused GenomicArrayOfSets ("windows") used to be allocated here.
    # NOTE(review): the input is vM13 but the output is named vM9 -- confirm
    # the intended release; the path is left untouched because callers may
    # depend on it.
    with open(PATH_ANNOT + "gencode.vM9.annotation.protein_coding.gtf", "w") as slidingGTF:
        for feature in gtf_file:
            if feature.attr['gene_type'] == "protein_coding":
                slidingGTF.write(feature.get_gff_line())
def cluster_genes(genes, chrom_list):
    """Merge every run of contiguous non-empty steps into one cluster.

    All partially or completely overlapping genes are clustered into a single
    region whose value is the union of the gene sets of its steps.

    Args:
        genes: object whose ``steps()`` yields ``(interval, set_of_genes)``.
        chrom_list: forwarded to ``HTSeq.GenomicArrayOfSets`` as ``chroms``.

    Returns:
        An unstranded ``HTSeq.GenomicArrayOfSets`` holding one entry per
        cluster.
    """
    genes2 = HTSeq.GenomicArrayOfSets(chrom_list, stranded=False)
    cluster = set()      # genes collected for the run being built
    cluster_iv = None    # interval spanned by that run
    for iv, gene in genes.steps():
        # Close the open run on a gap or a change of chromosome. The previous
        # code set a flag at the first gene and never reset it, so two empty
        # steps in a row (e.g. end of one chromosome, start of the next)
        # overwrote the cluster just stored with an empty set.
        if cluster and (not gene or iv.chrom != cluster_iv.chrom):
            genes2[cluster_iv] = cluster
            cluster = set()
            cluster_iv = None
        if gene:
            cluster |= gene
            if cluster_iv is None:
                cluster_iv = iv
            else:
                cluster_iv.extend_to_include(iv)
    # A run reaching the very last step was previously dropped.
    if cluster:
        genes2[cluster_iv] = cluster
    return genes2
def get_overlapping_gene_names(reference, is_stranded):
    """Split gene names into those that overlap another gene and those that do not.

    Args:
        reference: object whose ``genes`` mapping goes from gene name to a
            gene exposing ``transcripts``; each transcript has an ``iv``.
        is_stranded: whether a transcript on the opposite strand should be
            considered overlapping. If True, only genes on the same strand
            count as overlapping.

    Returns:
        ``(overlapping_genes, non_overlapping_genes)`` -- two sets of gene
        names (the previous docstring said lists).

    Side effects:
        '-' strand transcripts of ``ZEB2-AS1`` are switched to '+' in place.
    """
    all_gene_names = set()
    gene_gas = HTSeq.GenomicArrayOfSets(chroms='auto', stranded=is_stranded)
    # .items()/.values() instead of the Python-2-only .iteritems().
    for gene_name, gene in reference.genes.items():
        all_gene_names.add(gene_name)
        for transcript in gene.transcripts:
            if gene_name == 'ZEB2-AS1' and transcript.iv.strand == '-':
                # ZEB2 AS transcript strand was misannotated in iGenomes UCSC hg19.
                transcript.iv.strand = '+'
            gene_gas[transcript.iv] += gene_name
    overlapping_genes = set()
    for gene in reference.genes.values():
        for transcript in gene.transcripts:
            for _, genes_in_region in gene_gas[transcript.iv].steps():
                if len(genes_in_region) > 1:
                    overlapping_genes.update(genes_in_region)
    non_overlapping_genes = all_gene_names - overlapping_genes
    return overlapping_genes, non_overlapping_genes
def get_features(gff_filename, stranded, feature_type, id_attribute, quiet):
    """Index the features of one type from a GFF file and zero their counters.

    Args:
        gff_filename: path handed to ``HTSeq.GFF_Reader``.
        stranded: the array is stranded unless this equals ``"no"``.
        feature_type: only records with this ``type`` are used.
        id_attribute: attribute supplying the feature id.
        quiet: suppress the progress messages on stderr when true.

    Returns:
        ``(features, counts)``: a ``HTSeq.GenomicArrayOfSets`` mapping record
        intervals to feature ids, and a dict mapping each id to 0.

    Raises:
        ValueError: a matching record lacks ``id_attribute``, or has strand
            ``"."`` while running in stranded mode.
        Exception: no record of ``feature_type`` was found.
    """
    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    counts = {}
    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    # Call syntax replaces the Python-2-only
                    # `raise ValueError, (...)` form.
                    raise ValueError(
                        "Feature %s does not contain a '%s' attribute" %
                        (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'." %
                        (f.name, f.iv))
                features[f.iv] += feature_id
                counts[feature_id] = 0
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        # Deliberately broad: report where the reader was, then re-raise.
        sys.stderr.write("Error occured when processing GFF file (%s):\n" %
                         gff.get_line_number_string())
        raise
    if not quiet:
        sys.stderr.write("%d GFF lines processed.\n" % i)
    if len(counts) == 0:
        raise Exception("No features of type '%s' found.\n" % feature_type)
    return (features, counts)
def htseq():
    """Count reads per gene for every (time point, replicate) and save a matrix.

    Gene intervals are taken from the module-level ``gff`` file (records of
    type ``"gene"``, identified by their ``ID`` attribute). One worker process
    running ``htseq_count`` is started per time point / replicate pair; each
    worker is expected to put a ``(counts_dict, column_index)`` pair on the
    queue. Genes present in every sample are written to ``htseq_out`` as a
    comma-separated integer matrix (rows: genes, columns: samples).
    """
    gff_file = HTSeq.GFF_Reader(gff, end_included=True)
    genes = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    for gene in gff_file:
        if gene.type == "gene":
            genes[gene.iv] += gene.attr["ID"]

    tasks = []
    out_q = Queue()
    for tp in range(1, num_tp + 1):
        for rep in range(1, num_reps + 1):
            task = Process(target=htseq_count, args=(genes, rep, tp, out_q))
            tasks.append(task)
            task.start()
    # Drain the queue before joining, as before.
    results = [out_q.get() for _ in tasks]
    for task in tasks:
        task.join()

    # A real list is required here: on Python 3 `range(...)` does not support
    # item assignment, so the previous `counts = range(len(results))` failed.
    counts = [None] * len(results)
    for result in results:
        counts[result[1]] = result[0]

    # Keep only the genes reported by every sample.
    keys_a = set(counts[0].keys())
    for sample in counts:
        keys_a &= set(sample.keys())
    keys = list(keys_a)

    matrix = np.zeros(shape=(len(keys), len(counts)), dtype=np.int_)
    for key_index, key in enumerate(keys):
        for index, sample in enumerate(counts):
            matrix[key_index, index] = sample[key]
    np.savetxt(htseq_out, matrix, delimiter=",")
def get_gtf(gtf_file=None):
    """Return a stranded index of exon intervals keyed by ``gene_id``.

    Every record of type ``'exon'`` read from ``gtf_file`` contributes its
    ``gene_id`` attribute to the interval it covers.
    """
    exon_index = HTSeq.GenomicArrayOfSets('auto', stranded=True)
    for record in HTSeq.GFF_Reader(gtf_file):
        if record.type != 'exon':
            continue
        exon_index[record.iv] += record.attr['gene_id']
    return exon_index
def calculateFPKM(bam, gtf, readsNum=1, geneType="tRNA"):
    """Count reads falling entirely within one gene and normalise by length.

    Args:
        bam: path to the alignment file (read with ``HTSeq.BAM_Reader``).
        gtf: path to the annotation (read with ``HTSeq.GFF_Reader``).
        readsNum: divisor applied together with the gene length.
            NOTE(review): presumably the library size in millions -- confirm.
        geneType: only ``gene`` records whose ``gene_biotype`` equals this
            value are quantified.

    Returns:
        dict mapping gene name to
        ``count * 1000 / (gene_length * readsNum)`` as a float.
    """
    import HTSeq
    gtf = HTSeq.GFF_Reader(gtf, end_included=True)
    bam = HTSeq.BAM_Reader(bam)
    genes = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    counts = {}
    lengthOfReads = {}  # gene name -> gene length (end - start)
    for feature in gtf:
        if ("gene_biotype" in feature.attr and feature.type == "gene"
                and feature.attr["gene_biotype"] == geneType):
            genes[feature.iv] += feature.name
            counts[feature.name] = 0
            lengthOfReads[feature.name] = feature.iv.end - feature.iv.start
    for aln in bam:
        if not aln.aligned:
            continue
        # Intersection of the gene sets over every step the read covers.
        iset = None
        for _, step_set in genes[aln.iv].steps():
            if iset is None:
                iset = step_set.copy()
            else:
                iset.intersection_update(step_set)
        # `iset` stays None when no step was produced; len(None) used to crash.
        if iset is not None and len(iset) == 1:
            counts[next(iter(iset))] += 1
    rpkm = {}
    for key in counts:
        # rpkm: reads per million per kilobase.
        # 1000.0 forces true division; on Python 2 the integer form truncated.
        rpkm[key] = counts[key] * 1000.0 / (lengthOfReads[key] * readsNum)
    return rpkm
def getCov(f, paired=True):
    """Build a coverage model from a gzip-compressed, tab-separated read file.

    Args:
        f: path to the gzip file. Columns 0-2 are chrom/start/end; in paired
            mode columns 3-5 describe the mate.
        paired: when true, pairs on different chromosomes are skipped and the
            fragment spans both mates.

    Returns:
        ``(number_of_unique_fragments, model)`` where ``model`` is an
        unstranded ``HTSeq.GenomicArrayOfSets`` holding the line index (as a
        string) of every unique fragment, or ``(0, None)`` when the file is
        empty.
    """
    logger.info("Building coverage model for %s, paired=%s" % (f, paired))
    model = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    i = None
    uniqs = set()
    # Text mode ('rt') so that lines are str and can be split on "\n"/"\t"
    # under Python 3; the context manager closes the handle, which used to
    # leak.
    with gzip.open(f, "rt") as fh:
        for i, line in enumerate(fh):
            if i % 10000 == 0:
                cFlush("%s read from %s" % (i, f))
            line = line.split("\n")[0].split("\t")
            if paired and line[0] != line[3]:
                continue
            if paired:
                s = min(int(line[1]), int(line[4]))
                e = max(int(line[2]), int(line[5]))
            else:
                s = int(line[1])
                e = int(line[2])
            r = (line[0], s, e)
            if r not in uniqs:
                iv = HTSeq.GenomicInterval(line[0], s, e)
                model[iv] += str(i)
                uniqs.add(r)
    if i is None:
        logger.error("ERROR! No read in %s." % f)
        return 0, None
    logger.info("%s read from %s, unique %s" % (i, f, len(uniqs)))
    return len(uniqs), model
def load_bedfile_to_ga(bed_file):
    """Load single-base scores from a tab-separated BED-like file.

    Column 0 is the chromosome, column 1 the position, column 4 the score and
    column 5 the strand. Rows with too few columns, and rows HTSeq rejects
    with ``ValueError``, are printed and skipped.

    Returns:
        A stranded ``HTSeq.GenomicArrayOfSets`` whose value at
        ``[pos - 1, pos)`` is the row's score.
    """
    ga = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    with open(bed_file, "r") as fh:
        for raw in fh:
            row = raw.strip().split("\t")
            # Field layout follows the BED files from DBTSS.
            # NOTE(review): the original comment says BED is 0-based and needs
            # no adjustment, yet the interval below is [pos - 1, pos) -- confirm
            # which is intended.
            try:
                chrom, pos, strand, score = (row[0], int(row[1]), row[5],
                                             float(row[4]))
            except IndexError as e:
                print(e)
                print(row)
                continue
            try:
                target = HTSeq.GenomicInterval(chrom, pos - 1, pos, strand)
                ga[target] = score
            except ValueError:
                print("Error loading GA:")
                print(row)
                continue
    return ga
def __init__(self, name):
    """Store ``name``, reset the counter ``n`` and create an empty interval index."""
    self.name = name
    self.n = 0
    # Unstranded, chromosomes added on demand ("auto").
    self.gas = HTSeq.GenomicArrayOfSets("auto", stranded=False)
def parse(self):
    """Index the ``gene`` records of ``self.gtf_file`` by gene name.

    The name is the first available of the ``gene_name``, ``Name`` and
    ``gene`` attributes, falling back to the record's own name. A leading
    ``chr`` is removed from chromosome names.

    Returns:
        An unstranded ``HTSeq.GenomicArrayOfSets``; empty when no GTF file
        is configured.
    """
    idx = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    if not self.gtf_file:
        # No annotation was provided: hand back the empty index.
        return idx

    log.info("Loading " + self.gtf_file)
    n = 0
    for feature in HTSeq.GFF_Reader(self.gtf_file, end_included=True):
        if feature.type != "gene":
            continue
        for key in ('gene_name', 'Name', 'gene'):
            if key in feature.attr:
                name = feature.attr[key]
                break
        else:
            name = feature.name
        if feature.iv.chrom.startswith('chr'):
            feature.iv.chrom = feature.iv.chrom[3:]
        idx[feature.iv] += name
        n += 1
    log.info("Loaded " + str(n) + " features")
    return idx
def readGTF(gtfFile):
    """Collect CDS intervals and outermost start/stop codon positions per gene.

    Args:
        gtfFile: path handed to ``HTSeq.GFF_Reader``.

    Returns:
        ``(start_codon_sites, stop_codon_sites, CDS_features)``: two dicts
        keyed by ``gene_id`` holding one coordinate each, and an unstranded
        ``HTSeq.GenomicArrayOfSets`` mapping CDS intervals to gene ids.

    Raises:
        KeyError: if a record has no ``gene_id`` attribute.
    """
    gtf = HTSeq.GFF_Reader(gtfFile)
    start_codon_sites = {}
    stop_codon_sites = {}
    # `stranded` is interpreted by truthiness, so the former stranded="no"
    # (a non-empty string) produced a *stranded* array. False gives the
    # unstranded array the literal asked for.
    # NOTE(review): lookups now match CDS on either strand -- confirm that
    # downstream code expects this.
    CDS_features = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    i = 0
    for f in gtf:
        i += 1
        if i % 10000 == 0:
            sys.stderr.write("%d GFF lines processed.\r" % i)
        gname = f.attr['gene_id']
        if f.type == "CDS":
            CDS_features[f.iv] += gname
        elif f.type == "start_codon":
            # Keep the most upstream start: smallest on '+', largest otherwise.
            if gname not in start_codon_sites:
                start_codon_sites[gname] = f.iv.start_d
            elif f.iv.strand == "+":
                start_codon_sites[gname] = min(f.iv.start, start_codon_sites[gname])
            else:
                start_codon_sites[gname] = max(f.iv.start_d, start_codon_sites[gname])
        elif f.type == "stop_codon":
            # Keep the most downstream stop: largest on '+', smallest otherwise.
            if gname not in stop_codon_sites:
                stop_codon_sites[gname] = f.iv.end_d
            elif f.iv.strand == "+":
                stop_codon_sites[gname] = max(f.iv.end, stop_codon_sites[gname])
            else:
                stop_codon_sites[gname] = min(f.iv.end_d, stop_codon_sites[gname])
    return start_codon_sites, stop_codon_sites, CDS_features
def main(arglist):
    """Drop probes whose k-mers hit '+' strand target alignments.

    Inputs and outputs come from the ``snakemake`` object (``probe_csv``,
    ``kmer_homology_files``, ``target_homology_files`` ->
    ``filtered_probe_csv``); ``arglist`` is not used. For each pair of
    homology tables, the '+' strand target alignments are indexed and the
    k-mer alignments are passed through ``overlapper``; the ``qseqid`` values
    it returns are treated as bad k-mers. Probes whose ``unique_id`` is not a
    bad k-mer are written to the output CSV.
    """
    probe_csv = snakemake.input['probe_csv']
    kmer_homology_files = snakemake.input['kmer_homology_files']
    target_homology_files = snakemake.input['target_homology_files']
    filtered_probe_csv = snakemake.output['filtered_probe_csv']

    alnmts = []
    for i, kmer_file in enumerate(kmer_homology_files):
        target_blast_df = pd.read_csv(target_homology_files[i])
        kmer_blast_df = pd.read_csv(kmer_file)
        regions = [
            create_genomic_interval(x, y, z)
            for x, y, z in zip(target_blast_df['sstart'],
                               target_blast_df['send'],
                               target_blast_df['sseqid'])
        ]
        rRNA_genes = HTSeq.GenomicArrayOfSets("auto", stranded=True)
        for r in regions:
            # only count if on the same strand as the rRNA:
            if r.strand == '+':
                rRNA_genes[r] += 'rRNA'
        filt_df = overlapper(kmer_blast_df, rRNA_genes,
                             discard_minus_strand=True, mode='discard')
        alnmts.append(set(filt_df['qseqid'].tolist()))

    # set().union(*[]) is an empty set, whereas set.union(*[]) raised
    # TypeError when no homology files were supplied.
    bad_kmers = set().union(*alnmts)
    df = pd.read_csv(probe_csv, index_col='unique_id')
    df['passed_homology_screen'] = ~df.index.isin(bad_kmers)
    df[df['passed_homology_screen']].to_csv(filtered_probe_csv)
def cook_anno_model(gff_fpath, feature_atrr='gene_id', feature_type='exon',
                    gene_types=(), stranded=True, dumpto=None, verbose=False):
    """Prepare a feature model for counting.

    Every record of ``feature_type`` is indexed under its stripped
    ``feature_atrr`` value. A value is also *exported* when any of these
    holds: ``feature_atrr`` does not start with ``'gene'``, ``gene_types`` is
    empty, or the record's ``gene_biotype`` is in ``gene_types``. So
    quantification sees all features while only selected genes are reported.

    Returns:
        ``(features, exported_genes)``: the ``HTSeq.GenomicArrayOfSets`` and
        a sorted tuple of exported values (an empty ``set`` when nothing was
        exported). The pair is also pickled to ``dumpto`` when given.
    """
    features = HTSeq.GenomicArrayOfSets("auto", stranded=stranded)
    exported_genes = set()
    n_lines = 0
    for record in HTSeq.GFF_Reader(gff_fpath):
        if verbose and n_lines % 100000 == 0:
            print_logger('Processing {:,} lines of GFF...'.format(n_lines))
        n_lines += 1
        if record.type != feature_type:
            continue
        label = record.attr[feature_atrr].strip()
        features[record.iv] += label
        is_exported = (
            not feature_atrr.startswith('gene')
            or not gene_types
            or record.attr.get('gene_biotype', None) in gene_types
        )
        if is_exported:
            exported_genes.add(label)
    print_logger('Processed {:,} lines of GFF...'.format(n_lines))

    if exported_genes:
        exported_genes = tuple(sorted(exported_genes))
    if dumpto:
        with open(dumpto, 'wb') as fh:
            pickle.dump((features, exported_genes), fh)
    return (features, exported_genes)
def Get_label_information(label, annot, bam_reader):
    """Count reads per annotated feature of one label and record coverage.

    Args:
        label: key into ``annot``.
        annot: mapping from label to a sequence of 9-item records
            ``(feature, rank, chrom, start, end, strand, length,
            exon_rank_left, exon_rank_right)``.
        bam_reader: object exposing ``fetch()`` and ``fetch(region=...)``.

    Returns:
        ``(gas, ga, gene_count)``: an unstranded ``GenomicArrayOfSets`` of
        ``(feature, rank)`` pairs, an unstranded integer ``GenomicArray``
        incremented for aligned read blocks, and a dict mapping each
        ``(feature, rank)`` pair to its read count.

    Relies on ``minaqual``, ``cigar_char`` and ``invert_strand``, which are
    not defined in this block.
    """
    # Silences all warnings for the rest of the process.
    warnings.simplefilter("ignore")
    gas = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    ga = HTSeq.GenomicArray("auto", stranded=False, typecode="i")
    gene_count = {}
    for feature, rank, chrom, start, end, strand, length, exon_rank_left, exon_rank_right in annot[label]:
        iv = HTSeq.GenomicInterval(chrom, start, end, strand)
        gas[iv] += (feature, rank)
        gene_count[(feature, rank)] = 0
    # Smallest start (item 3) and largest end (item 4) over all records.
    boundary_left, boundary_right = min([i[3] for i in annot[label]]), max([i[4] for i in annot[label]])
    # Region string "chrom:left-right", padded by 500 on both sides; the
    # chromosome is taken from the first record.
    region_fetch = annot[label][0][2] + ":" + str(int(boundary_left) - 500) + "-" + str(int(boundary_right) + 500)
    read_seq = bam_reader.fetch(region=region_fetch)
    # Peek at the first read of an unrestricted fetch to detect paired-end data.
    read_seq_iter = iter(bam_reader.fetch())
    one_read = next(read_seq_iter)
    pe_mode = one_read.paired_end
    if pe_mode:
        read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
    for a in read_seq:
        if not pe_mode:
            # Single-end: skip unaligned reads and reads with NH > 1.
            if not a.aligned:
                continue
            if a.optional_field('NH') > 1:
                continue
            iv_seq = (cigop.ref_iv for cigop in a.cigar if cigop.type == "M" and cigop.size > 0)
        else:
            # Paired-end: `a` is a pair; either mate may be None.
            if ((a[0] and a[0].aQual < minaqual) or (a[1] and a[1].aQual < minaqual)):
                continue
            if ((a[0] and a[0].optional_field('NH') > 1) or (a[1] and a[1].optional_field('NH') > 1)):
                continue
            if a[0] is not None and a[0].aligned:
                iv_seq = (cigop.ref_iv for cigop in a[0].cigar if cigop.type in cigar_char and cigop.size > 0)
            else:
                iv_seq = tuple()
            # Intervals of the second mate are passed through invert_strand.
            if a[1] is not None and a[1].aligned:
                iv_seq = itertools.chain(
                    iv_seq,
                    (invert_strand(cigop.ref_iv) for cigop in a[1].cigar if cigop.type in cigar_char and cigop.size > 0))
        # Union of all (feature, rank) pairs touched by any block of the read.
        feature_aligned = set()
        for iv in iv_seq:
            for iv2, val2 in gas[iv].steps():
                feature_aligned |= val2
            # NOTE(review): the source was collapsed onto one line, so it is
            # unclear whether this increment ran once per block (as written
            # here) or once per step of the inner loop -- confirm.
            ga[iv] += 1  # for calculating coverage
        if len(feature_aligned) == 0:
            continue
        # Intron features touched by the read are always counted ...
        for f in [item for item in feature_aligned if item[0] == 'intron']:
            gene_count[f] += 1
        # ... the other features only when the read touches no intron.
        if 'intron' not in [x for x, y in feature_aligned]:
            for f in feature_aligned:
                gene_count[f] += 1
    return gas, ga, gene_count
def _get_features_file(GTF_filename, stranded_info=False):
    """Index the exon records of a GTF file by their ``gene_id`` attribute.

    ``stranded_info`` is forwarded as the ``stranded`` flag of the returned
    ``HTSeq.GenomicArrayOfSets``.
    """
    exon_annotation_features = HTSeq.GenomicArrayOfSets('auto', stranded=stranded_info)
    exon_records = (
        record for record in HTSeq.GFF_Reader(GTF_filename)
        if record.type == 'exon'
    )
    for record in exon_records:
        exon_annotation_features[record.iv] += record.attr['gene_id']
    return exon_annotation_features
def readGenomeAnnotation(self, args):
    """Open ``args.gff`` and index its ``gene`` records on this instance.

    Sets ``self.genomeAnnotation`` (the GFF reader) and ``self.features``
    (a stranded ``HTSeq.GenomicArrayOfSets``). Each gene interval is tagged
    with the value returned by ``self.getFeatureID``.
    """
    self.genomeAnnotation = HTSeq.GFF_Reader(args.gff, end_included=True)
    self.features = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    for record in self.genomeAnnotation:
        if record.type != "gene":
            continue
        self.features[record.iv] += self.getFeatureID(record)
def read_gff(
        gff_filename,
        feature_type=['CDS'],
        id_attribute='Parent',
        additional_attributes=[],
        quiet=0,
        head=-1,
        stranded='yes',
):
    """Index selected GFF records and list the intervals of each feature id.

    Adapted from htseq-count.

    Args:
        gff_filename: path handed to ``HTSeq.GFF_Reader``.
        feature_type: record type, or list of types, to keep.
        id_attribute: attribute supplying the feature id.
        additional_attributes: accepted for compatibility; nothing derived
            from it is returned.
        quiet: suppress progress messages on stderr when true.
        head: when >= 0, stop reading once this many distinct ids were seen.
        stranded: the array is stranded unless this equals ``"no"``.

    Returns:
        ``(features, ids)``: a ``HTSeq.GenomicArrayOfSets`` mapping intervals
        to ids, and an ``OrderedDict`` mapping each id to its intervals in
        file order.

    Raises:
        ValueError: a kept record lacks ``id_attribute``, or has strand
            ``"."`` while running in stranded mode.
    """
    # `basestring` exists only on Python 2; fall back to `str` on Python 3,
    # where the bare name used to raise NameError.
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str
    if isinstance(feature_type, string_types):
        feature_type = [feature_type]

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    ids = collections.OrderedDict()
    # The per-id `counts` and `attributes` dicts built here before were never
    # returned or read, so they have been dropped.
    try:
        for f in gff:
            if f.type in feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    raise ValueError(
                        "Feature %s does not contain a '%s' attribute" %
                        (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    raise ValueError(
                        "Feature %s at %s does not have strand information but you are "
                        "running htseq-count in stranded mode. Use '--stranded=no'." %
                        (f.name, f.iv))
                features[f.iv] += feature_id
                ids.setdefault(feature_id, []).append(f.iv)
            i += 1
            if i % 100000 == 0 and not quiet:
                sys.stderr.write("%d GFF lines processed.\n" % i)
                sys.stderr.flush()
            if head >= 0 and len(ids) == head:
                break
    except:
        # Deliberately broad: report where the reader was, then re-raise.
        sys.stderr.write("Error occured when processing GFF file (%s):\n" %
                         gff.get_line_number_string())
        raise
    return features, ids
def getGeneModel():
    """Build an unstranded interval index over the genes from ``getGenes()``.

    Each gene record supplies ``"chr"``, ``"start"``, ``"end"`` and
    ``"strand"``; its key is stored on the resulting interval.

    Returns:
        ``(model, genes)`` -- the ``HTSeq.GenomicArrayOfSets`` and the gene
        mapping it was built from.
    """
    genes = getGenes()
    model = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    for gene_key, record in genes.items():
        span = HTSeq.GenomicInterval(
            record["chr"],
            record["start"],
            record["end"],
            strand=record["strand"],
        )
        model[span] += gene_key
    return model, genes
def __init__(self, gtf_file, attribute_name):
    """Load the exon records of ``gtf_file``, grouped by ``attribute_name``.

    ``self.features`` maps exon intervals to the attribute value;
    ``self.loci`` maps each attribute value to the list of its exon records,
    in file order.
    """
    lg.debug('Using HTSeq for annotation.')
    self.loci = OrderedDict()
    self.features = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    for record in HTSeq.GFF_Reader(gtf_file, end_included=True):
        if record.type != 'exon':
            continue
        locus = record.attr[attribute_name]
        self.features[record.iv] += locus
        self.loci.setdefault(locus, list()).append(record)
def buildCovModel(readF):
    """Build an interval model from a gzip-compressed BED file.

    Args:
        readF: path to a ``bed.gz`` file; columns 0-2 are chrom, start, end.

    Returns:
        ``(model, i)`` where ``model`` is an unstranded
        ``HTSeq.GenomicArrayOfSets`` and ``i`` is the 0-based index of the
        last line read (``-1`` for an empty file, so ``i + 1`` is always the
        number of lines read).
    """
    model = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    # Defined up front: with an empty file the loop never binds `i` and the
    # return statement used to fail with an unbound-variable error.
    i = -1
    # The context manager closes the gzip handle, which used to leak.
    with gzip.open(readF, 'rt') as fh:
        for i, line in tqdm(enumerate(fh)):
            line = line.split("\n")[0].split("\t")
            iv = HTSeq.GenomicInterval(line[0], int(line[1]), int(line[2]))
            # NOTE(review): plain assignment replaces whatever the interval
            # held with the line index; getCov uses `+=` -- confirm intent.
            model[iv] = i
    return model, i
def find_gene_region_length(gene_region, transcript_region):
    """Sum, per gene, the length of its region covered by any transcript.

    Args:
        gene_region: mapping from gene id to the gene's interval.
        transcript_region: mapping from gene id to a mapping of transcript
            id to transcript interval.

    Returns:
        ``collections.Counter`` mapping gene id to the covered length; genes
        without any covered step get no entry.
    """
    gene_region_length = collections.Counter()
    for gene_id, by_transcript in transcript_region.items():
        covered = HTSeq.GenomicArrayOfSets("auto", stranded=True)
        for transcript_iv in by_transcript.values():
            covered[transcript_iv] += gene_id
        for step_iv, members in covered[gene_region[gene_id]].steps():
            if not members:
                continue
            gene_region_length[gene_id] += step_iv.length
    return gene_region_length
def extract_GTF_features(file_path, feature_type='exon', attribute_label='gene_id'):
    """Return a stranded index of the chosen record type of a GTF file.

    Records whose ``type`` equals ``feature_type`` are stored under the value
    of their ``attribute_label`` attribute.
    """
    index = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    for record in HTSeq.GFF_Reader(file_path):
        if record.type != feature_type:
            continue
        index[record.iv] += record.attr[attribute_label]
    return index
def get_gene_model(gtf=None, shift=None):
    """Build an unstranded index of the shifted TSS-TES region of every gene.

    Args:
        gtf: forwarded to ``get_TSS_TES``.
        shift: forwarded to ``shift_TSS_TES``; defaults to ``[2000, 2000]``.

    Returns:
        ``HTSeq.GenomicArrayOfSets`` mapping each region to its key.
    """
    # A fresh list per call: the former mutable default `[2000, 2000]` was
    # shared between calls and handed to `shift_TSS_TES`.
    if shift is None:
        shift = [2000, 2000]
    bgmodel = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    regions = get_TSS_TES(gtf)
    regions = shift_TSS_TES(regions, shift=shift)
    for key, iv in regions.items():
        if iv.start > iv.end:
            # here the model is nonstranded, so put the coordinates in
            # ascending order
            iv.start, iv.end = iv.end, iv.start
        bgmodel[iv] += key
    return bgmodel
def read_array(reads, start, stop):
    """Index the matched blocks of the aligned reads lying inside a window.

    Args:
        reads: iterable of alignments exposing ``aligned``, ``iv`` and
            ``cigar``.
        start, stop: a read is kept when ``iv.start_d > start`` and
            ``iv.end_d < stop``.

    Returns:
        A stranded ``HTSeq.GenomicArrayOfSets`` in which every ``"M"`` CIGAR
        block of a kept read maps to that read.
    """
    set_of_reads = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    for read in reads:
        # Test `aligned` first: the window filter used to read `read.iv`
        # before this check, which fails for reads without an interval.
        if not read.aligned:
            continue
        # `and` replaces the bitwise `&` previously applied to the booleans.
        if not (read.iv.start_d > start and read.iv.end_d < stop):
            continue
        for cigop in read.cigar:
            if cigop.type == "M":
                set_of_reads[cigop.ref_iv] += read
    return set_of_reads
def load_exons_from_gtf(gtf_fn, stranded=True):
    """Index the exon records on known chromosomes, renaming them to ``chr*``.

    Only exons whose chromosome is listed in ``CHROMS`` are kept. ``MT``
    becomes ``chrM``; every other name gets a ``chr`` prefix. Each exon
    interval is tagged with the record's name.
    """
    exons = HTSeq.GenomicArrayOfSets("auto", stranded=stranded)
    for feat in HTSeq.GFF_Reader(gtf_fn, end_included=True):
        if feat.type != 'exon' or feat.iv.chrom not in CHROMS:
            continue
        feat.iv.chrom = 'chrM' if feat.iv.chrom == 'MT' else 'chr' + feat.iv.chrom
        exons[feat.iv] += feat.name
    return exons