def main(argv):
    """Load wig peaks into an interval tree and report them against a BED file.

    ``argv[1]`` is the BED file and ``argv[2]`` is the wig file. Every item
    yielded by ``parse_wig`` is inserted into a fresh ``IntervalTree``, which
    is then handed to ``report`` together with the BED file.
    """
    bed_path, wig_path = argv[1], argv[2]

    peak_tree = IntervalTree()
    for wig_peak in parse_wig(wig_path):
        peak_tree.insert_interval(wig_peak)

    report(peak_tree, bed_path)
def setUp(self):
    """Create the fixture tree: four intervals for every step of the range.

    For each start ``s`` in ``range(1, 1000, 80)`` the tree receives
    ``[s, s+10]``, ``[s+20, s+30]``, ``[s+40, s+50]`` and ``[s+60, s+70]``,
    exercising ``insert``, ``add``, ``insert_interval`` and ``add_interval``
    in turn. The tree is stored as ``self.intervals`` and ``self.iv``; the
    number of inserted intervals is stored as ``self.nintervals``.
    """
    tree = IntervalTree()
    starts = range(1, 1000, 80)

    for start in starts:
        square = start * start

        # Coordinates plus a payload, via insert and its synonym add.
        tree.insert(start, start + 10, {"value": square})
        tree.add(start + 20, start + 30, {"astr": str(square)})

        # Ready-made Interval objects, via insert_interval / add_interval.
        third = Interval(start + 40, start + 50, value={"astr": str(square)})
        tree.insert_interval(third)
        fourth = Interval(start + 60, start + 70, value={"astr": str(square)})
        tree.add_interval(fourth)

    self.intervals = tree
    self.iv = tree
    # Four intervals were added per start position.
    self.nintervals = 4 * len(starts)
def setUp(self):
    """Populate an IntervalTree fixture and record how many intervals it holds.

    Every ``base`` in ``range(1, 1000, 80)`` contributes four intervals, one
    through each of ``insert``, ``add``, ``insert_interval`` and
    ``add_interval``. The result is exposed as ``self.intervals`` /
    ``self.iv`` with the running count in ``self.nintervals``.
    """
    fixture = IntervalTree()
    total = 0

    for base in range(1, 1000, 80):
        squared = base * base
        label = str(squared)

        fixture.insert(base, base + 10, dict(value=squared))
        # add is a synonym for insert.
        fixture.add(base + 20, base + 30, dict(astr=label))
        # Interval objects carrying start/end attributes can be added too.
        fixture.insert_interval(
            Interval(base + 40, base + 50, value=dict(astr=label)))
        fixture.add_interval(
            Interval(base + 60, base + 70, value=dict(astr=label)))

        total += 4

    self.intervals = fixture
    self.iv = fixture
    self.nintervals = total
def parse_blast(blast_str, qfeat):
    """Return the BLAST hit intervals that overlap the query feature.

    Takes tab-separated BLAST output and a query feature and checks whether
    the query intersects any of the hits found.

    :param blast_str: BLAST output as one string, one tab-separated hit per
        line. Blank lines and lines containing "WARNING" or "ERROR" are
        ignored.
    :param qfeat: mapping with 'start' and 'end' entries convertible to int;
        they may be given in either order.
    :return: whatever ``IntervalTree.find`` returns for the query span, i.e.
        the stored hit intervals overlapping it.
    :raises ValueError: if a hit line has non-numeric values in fields 6
        onwards or fewer than two coordinate fields.
    """
    hit_tree = IntervalTree()

    for line in blast_str.split("\n"):
        # Blank lines (e.g. the empty string left after a trailing newline)
        # and BLAST diagnostics carry no coordinates.
        if not line.strip() or "WARNING" in line or "ERROR" in line:
            continue

        fields = line.split("\t")
        # list() is needed on Python 3, where map() returns an iterator that
        # has no extend(). All numeric fields are still converted so that a
        # malformed line fails loudly, as before.
        locus = list(map(int, fields[6:10]))
        locus.extend(map(float, fields[10:]))

        # NOTE(review): fields 6 and 7 are used as the hit span; confirm this
        # matches the column layout of the BLAST output being parsed.
        first, second = locus[:2]
        # Order both ends from the untouched pair. Deriving the end from an
        # already-overwritten start collapsed reversed hits to zero length.
        s_start = min(first, second)
        s_end = max(first, second)
        hit_tree.insert_interval(Interval(s_start, s_end))

    q_first = int(qfeat['start'])
    q_second = int(qfeat['end'])
    q_start = min(q_first, q_second)
    q_end = max(q_first, q_second)

    return hit_tree.find(q_start, q_end)
def _process_piece(filename_vci, contig, reverse):
    """Read one contig of a tabix-indexed VCI file into an interval tree.

    Each fetched record that has six columns and a non-'.' third column is
    turned into an ``Interval`` spanning ``fragment`` positions in the "from"
    coordinate system, with an ``IntervalInfo`` payload describing the
    matching span in the "to" coordinate system.

    :param filename_vci: path passed to ``pysam.TabixFile``.
    :param contig: contig name passed to ``TabixFile.fetch``.
    :param reverse: when truthy, the deleted/inserted columns are swapped
        (column 4 is read as "deleted" and column 3 as "inserted").
    :return: ``{'contig': contig}``. The tree itself is not part of the
        return value.
    """
    # Column indices within a record.
    _chrom = 0
    _pos = 1
    _shared = 2
    _deleted = 3 if not reverse else 4
    _inserted = 4 if not reverse else 3
    _fragment = 5
    ret = {'contig':contig}#, 'tree':None}
    LOG.info("Chromosome: {}".format(contig))
    line_no = 0
    # NOTE(review): global_mapping_tree is declared global but never read or
    # assigned in this function.
    global global_mapping_tree
    tree = IntervalTree()
    try:
        # NOTE(review): this check sits before the loop, where line_no is
        # still 0, so it prints exactly once per call. It looks like it was
        # meant as a periodic progress message inside the loop - confirm.
        if line_no % 10000 == 0:
            print("CONTIG {} {}".format(contig, line_no))
        # Running offsets in the "from" and "to" coordinate systems.
        pos_from = 0
        pos_to = 0
        # NOTE(review): the TabixFile is never closed in this function.
        tabix_file = pysam.TabixFile(filename_vci)
        iterator = None
        try:
            iterator = tabix_file.fetch(contig, parser=pysam.asTuple())
        except:
            # Any failure to fetch (bare except) is only logged at debug
            # level; the function then returns early below.
            LOG.debug("Exception for {}".format(contig))
        if iterator is None:
            LOG.debug("No iterator")
            return ret
        for rec in iterator:
            if len(rec) != 6:
                # Raised inside the outer try, so this is handled by the
                # "except Exception" clause below (assuming G2GError derives
                # from Exception) rather than reaching the caller.
                raise exceptions.G2GError("Unexpected line in G2G file. Line #{0:,}: {1}".format(line_no, rec))
            # Records whose third column is '.' are skipped entirely; they do
            # not advance pos_from, pos_to or line_no.
            if rec[2] == '.':
                continue
            # Example records, kept as a bare string literal (no effect at
            # runtime).
            """ 1 3000019 G . A 3000019 1 3003170 A . T 3151 1 3003197 A . G 27 1 3003640 CG GGGG . 444 1 3006790 G AA . 3145 1 3006834 G A . 42 1 3007272 GC C . 438 1 3008489 T . ATC 1215 """
            #LOG.debug(rec)
            fragment = int(rec[_fragment])
            # A '.' in the deleted/inserted column counts as zero bases;
            # otherwise the length of the column's string is used.
            deleted_bases = 0 if rec[_deleted] == '.' else len(rec[_deleted])
            inserted_bases = 0 if rec[_inserted] == '.' else len(rec[_inserted])
            #LOG.debug("pos_from={}, pos_to={}".format(pos_from, pos_to))
            #LOG.debug("Inserting interval {} - {}".format(pos_from, pos_from + fragment))
            # The interval covers the fragment in "from" coordinates; the
            # payload records the same-length span in "to" coordinates.
            interval = Interval(pos_from, pos_from + fragment, IntervalInfo(contig, pos_to, pos_to + fragment, rec[_shared], rec[_deleted], rec[_inserted], rec[_pos]))
            #LOG.debug(interval)
            tree.insert_interval(interval)
            # Deleted bases advance only the "from" side, inserted bases
            # only the "to" side.
            pos_from += (fragment + deleted_bases)
            pos_to += (fragment + inserted_bases)
            line_no += 1
    except KeyboardInterrupt:
        raise exceptions.KeyboardInterruptError()
    except Exception as e:
        # Any other error is reported and swallowed; execution continues
        # with whatever part of the tree was built so far.
        g2g_utils._show_error()
        LOG.error("OOOOOOOPS")
    #ret['tree'] = tree
    #print 'traversing tree...'
    #tree.traverse(d)
    # NOTE(review): a brand-new Manager and Namespace are created on every
    # call, and `tree` is read from that namespace before anything in this
    # function assigns it. This presumably fails on a fresh namespace and
    # cannot share data with other calls - confirm the intended design.
    Global = multiprocessing.Manager().Namespace()
    tt = Global.tree
    tt[contig] = tree
    Global.tree = tt
    LOG.debug("PROCESSED {0:,} lines for {1}".format(line_no, contig))
    return ret
def annotate_igrs(genome, igr_df):
    """
    Annotate the inter-genic regions listed in a dataframe with any available
    annotations from Rfam

    Parameters
    ----------
    genome: src.data.rfam_db.Genome
        The genome object for the organism whose IGRs are being analyzed
    igr_df: pandas.Dataframe
        The dataframe with the columns 'accession', 'start', 'end', 'length',
        'gc'. It is also merged on "igr_index", so that key must be available
        on the dataframe as well.

    Returns
    -------
    annotated_igr_df: pandas.Dataframe
        One row per IGR with the comma-joined 'rfam_acc', 'rfam_id' and
        'type' values, the '<br>'-joined 'description' values of every
        overlapping Rfam annotation, plus the 'category' and 'log_length'
        columns.
    """
    def _join_unique(separator):
        # Build an aggregator that joins the distinct values of a group.
        return lambda values: separator.join(set(values))

    # Initialize connection to Rfam database
    session = rfam_session()
    try:
        # Get the list of "rfamseq_acc" numbers for a given organism
        rfamseq_acc_list = session.query(t_genseq.c.rfamseq_acc).filter(
            t_genseq.c.upid == genome.upid).distinct().all()

        # Map each accession (version suffix stripped) to its interval tree
        annotation_tree_dict = {}
        for rfamseq_acc_row in rfamseq_acc_list:
            # Each query result is a one-element row; pull the accession out
            rfamseq_acc = rfamseq_acc_row[0]

            rna_list = session.query(t_full_region).filter(
                t_full_region.c.rfamseq_acc == rfamseq_acc).all()

            # Make an interval tree for all of the RNA annotations to allow
            # for rapid overlap search
            annotation_tree = IntervalTree()
            for rna in rna_list:
                # seq_start/seq_end may come in either order
                start = min(rna.seq_start, rna.seq_end)
                end = max(rna.seq_start, rna.seq_end)
                annotation_tree.insert_interval(
                    Interval(start=start, end=end, chrom=rna.rfamseq_acc,
                             value=rna))

            rfamseq_acc_stripped = rfamseq_acc.partition('.')[0]
            annotation_tree_dict[rfamseq_acc_stripped] = annotation_tree

        # Collect one entry per (IGR, overlapping annotation) pair
        annotated_igr_list = []
        for accession, accession_igr_df in igr_df.groupby('accession'):
            # Lookup the RNA annotation tree for the given accession
            try:
                annotation_tree = annotation_tree_dict[accession]
            except KeyError:
                print("IGR dataframe key: {} not found. Available keys are: {}".
                      format(accession, annotation_tree_dict.keys()))
                # Without a tree for this accession there is nothing to
                # search. Falling through would reuse the tree left over from
                # another accession and attach unrelated annotations.
                continue

            # For each IGR find all of the overlaps with annotated RNAs
            for igr in accession_igr_df.itertuples():
                for overlap in annotation_tree.find(igr.start, igr.end):
                    annotated_igr_list.append({
                        'igr_index': igr[0],
                        'rfam_acc': overlap.value.rfam_acc
                    })

        # Convert annotated_igr_list into dataframe and merge on the igr_index
        annotated_igr_df = pd.merge(
            igr_df,
            pd.DataFrame(annotated_igr_list,
                         columns=["igr_index", "rfam_acc"]),
            on="igr_index", how='left')

        # Look up the information for all of the RNA families represented in
        # this genome
        rna_family_query = session.query(Family)\
            .with_entities(Family.rfam_acc, Family.rfam_id,
                           Family.description, Family.type)\
            .filter(Family.rfam_acc.in_(
                annotated_igr_df["rfam_acc"].dropna().unique()))
        rna_families_df = pd.read_sql(rna_family_query.statement,
                                      rna_family_query.session.bind)

        merged_igr_df = pd.merge(annotated_igr_df, rna_families_df,
                                 on="rfam_acc", how="left")

        # Collapse multiple annotations of one IGR into joined strings
        combined_descriptions = merged_igr_df.dropna().groupby("igr_index")\
            .agg(dict(rfam_acc=_join_unique(','),
                      rfam_id=_join_unique(','),
                      type=_join_unique(','),
                      description=_join_unique('<br>')))

        # Keep a single row per IGR, then overwrite its annotation columns
        # with the combined values
        merged_igr_df.drop_duplicates(["igr_index"], inplace=True)
        merged_igr_df.reset_index(inplace=True, drop=True)
        merged_igr_df.update(combined_descriptions)

        merged_igr_df["category"] = merged_igr_df.apply(categorize_igr,
                                                        axis=1)
        merged_igr_df["log_length"] = np.log(merged_igr_df["length"])
    finally:
        # Release the database session even when a query or merge fails
        session.close()

    return merged_igr_df
def process_piece(args):
    """Build an interval tree for one contig of a tabix-indexed VCI file.

    Each fetched record that has six columns and a non-'.' third column
    becomes an ``Interval`` covering ``fragment`` positions in the "from"
    coordinate system, carrying an ``IntervalInfo`` for the matching span in
    the "to" coordinate system.

    :param args: mapping with the keys 'filename' (path given to
        ``pysam.TabixFile``), 'contig' (name given to ``fetch``) and
        'reverse' (when truthy, the deleted and inserted columns are
        swapped).
    :return: ``{'tree': tree, 'contig': contig}``. Errors while reading are
        logged rather than raised, so the tree may be empty or only
        partially filled.
    """
    filename = args['filename']
    contig = args['contig']
    reverse = args['reverse']

    # Column indices within a record; deleted/inserted swap when reversing.
    pos_col = 1
    shared_col = 2
    deleted_col, inserted_col = (4, 3) if reverse else (3, 4)
    fragment_col = 5

    # Created before the try block so the return value is always well-formed,
    # even if opening the file fails.
    tree = IntervalTree()

    try:
        tabix_file = pysam.TabixFile(filename)
        try:
            iterator = tabix_file.fetch(contig, parser=pysam.asTuple())

            LOG.info("Parsing VCI, contig: {}".format(contig))

            # Running offsets in the "from" and "to" coordinate systems.
            pos_from = 0
            pos_to = 0

            for rec in iterator:
                if len(rec) != 6:
                    raise exceptions.G2GError(
                        "Unexpected line in VCI file: {0}".format(rec))

                # Records with '.' in the third column are skipped and do
                # not advance either offset.
                if rec[2] == '.':
                    continue

                # Example records:
                #   1 3000019 G  .    A   3000019
                #   1 3003640 CG GGGG .   444
                #   1 3008489 T  .    ATC 1215
                fragment = int(rec[fragment_col])
                deleted = rec[deleted_col]
                inserted = rec[inserted_col]
                # '.' means no bases in that column.
                deleted_bases = 0 if deleted == '.' else len(deleted)
                inserted_bases = 0 if inserted == '.' else len(inserted)

                info = IntervalInfo(contig, pos_to, pos_to + fragment,
                                    rec[shared_col], deleted, inserted,
                                    rec[pos_col])
                tree.insert_interval(
                    Interval(pos_from, pos_from + fragment, info))

                # Deleted bases advance only the "from" side, inserted bases
                # only the "to" side.
                pos_from += fragment + deleted_bases
                pos_to += fragment + inserted_bases
        finally:
            # The handle was previously left open; release it on every path.
            tabix_file.close()
    except Exception as e:
        LOG.error(e)

    return {'tree': tree, 'contig': contig}