def is_before(gid_coords, fkey, fstrand):
    """
    Tell whether the interval ``fkey`` should be kept as a neighbour of
    the gene located at ``gid_coords``.

    The answer is True whenever ``utilities.overlap(gid_coords, fkey)`` is
    non-negative. Otherwise it depends on the strand: for the "-" strand the
    gene must end before ``fkey`` starts, for any other strand the gene must
    start after ``fkey`` ends.

    :param gid_coords: (start, end) of the gene of interest
    :param fkey: (start, end) of the candidate neighbour
    :param fstrand: strand of the gene of interest
    :return: the outcome of the check described above
    """
    touching = utilities.overlap(gid_coords, fkey) >= 0
    if fstrand == "-":
        return touching or gid_coords[1] < fkey[0]
    return touching or gid_coords[0] > fkey[1]
def __check_collisions(transcript, nspan, spans):
    """
    Verify that a new span does not collide with any previously defined one.

    Every comparison is reported on the transcript logger at DEBUG level.

    :param transcript: object providing the ``id`` and ``logger`` used in
        the log and error messages
    :param nspan: the span of the new transcript
    :param spans: the spans defined so far
    :return: None
    :raises InvalidTranscript: if ``overlap(span, nspan)`` is positive for
        any of the previous spans
    """
    if len(spans) == 0:
        # Nothing has been defined yet, hence nothing can collide
        return

    for previous in spans:
        shared = overlap(previous, nspan)
        transcript.logger.debug(
            "Comparing start-ends for split of %s. SpanA: %s SpanB: %s Overlap: %d",
            transcript.id, previous, nspan, shared)
        if shared > 0:
            err_message = "Invalid overlap for {0}! T1: {1}. T2: {2}".format(
                transcript.id, previous, nspan)
            transcript.logger.error(err_message)
            raise InvalidTranscript(err_message)
def test_overlap(self):
    """
    Check Abstractlocus.overlap on two identical intervals, and verify that
    it agrees with the stand-alone overlap function.

    :return:
    """
    interval = (100, 200)

    self.assertEqual(Abstractlocus.overlap(interval, interval), 100)
    self.assertEqual(Abstractlocus.overlap(interval, interval),
                     overlap(interval, interval))
def test_noCDSOverlap(self):
    """
    Two coding transcripts whose CDS spans do not overlap must be reported
    as intersecting by MonosublocusHolder.is_intersecting in the default
    mode, and as NOT intersecting when ``cds_only=True``.
    """
    # Replace the CDS of the fixture transcript
    first = self.t1
    first.strip_cds()
    self.assertEqual(first.combined_cds_introns, set())
    first.finalized = False
    first_cds_exons = [(401, 500), (601, 700), (1001, 1100)]
    first.add_exons(first_cds_exons, "CDS")
    first.finalize()

    # Second transcript: its CDS is declared on the last two exons only
    second = Transcript()
    second.logger = self.logger
    second.chrom = "Chr1"
    second.strand = "+"
    second.score = 1
    second.id = "G2.1"
    second.parent = "G2"
    second.start = 101
    second.end = 1470
    second_exons = [(101, 510), (601, 700), (960, 1350), (1421, 1470)]
    second_cds_exons = [(1201, 1350), (1421, 1450)]
    second.add_exons(second_exons)
    second.add_exons(second_cds_exons, "CDS")
    second.finalize()

    self.assertTrue(first.is_coding)
    self.assertTrue(second.is_coding)

    first_cds = (first.combined_cds_start, first.combined_cds_end)
    second_cds = (second.combined_cds_start, second.combined_cds_end)
    # The two CDS spans are shown in the failure message
    self.assertGreaterEqual(0,
                            overlap(first_cds, second_cds),
                            [first_cds, second_cds])

    self.assertTrue(
        MonosublocusHolder.is_intersecting(first, second, logger=self.logger))
    self.assertFalse(
        MonosublocusHolder.is_intersecting(first, second,
                                           cds_only=True,
                                           logger=self.logger))
def __load_blast_hits(new_transcript, boundary, transcript):
    """
    Load into the new split transcript the BLAST hits of the original one.

    A hit is considered only when its query span has a positive overlap
    with ``boundary``; it is then recalculated with ``__recalculate_hit``
    and appended to ``new_transcript.blast_hits`` unless the recalculation
    returns None. All decisions are logged at DEBUG level on the logger of
    the original transcript.

    :param new_transcript: the split transcript
    :type new_transcript: Mikado.loci_objects.Transcript
    :param boundary: tuple(start, end) of the boundary of the new transcript
    :type boundary: tuple(int, int)
    :param transcript: the original transcript
    :type transcript: Mikado.loci_objects.Transcript
    :return:
    """
    for hit in transcript.blast_hits:
        query_span = (hit["query_start"], hit["query_end"])
        if not overlap(query_span, boundary) > 0:
            transcript.logger.debug("Ignoring hit %s as it is not intersecting", hit)
            continue

        blast_params = transcript.json_conf["pick"]["chimera_split"]["blast_params"]
        minimal_overlap = blast_params["minimal_hsp_overlap"]
        new_hit = __recalculate_hit(hit, boundary, minimal_overlap)
        if new_hit is None:
            transcript.logger.debug("Hit %s did not pass overlap checks for %s",
                                    hit["target"], new_transcript.id)
            continue

        transcript.logger.debug(
            """Hit %s, previous id/query_al_length/t_al_length %f/%f/%f, novel %f/%f/%f""",
            new_hit["target"],
            hit["global_identity"],
            hit["query_aligned_length"],
            hit["target_aligned_length"],
            new_hit["global_identity"],
            new_hit["query_aligned_length"],
            new_hit["target_aligned_length"])
        new_transcript.blast_hits.append(new_hit)
def __load_blast_hits(new_transcript, boundary, transcript):
    """
    Transfer the relevant BLAST hits from a transcript to one of its splits.

    Hits whose query span does not positively overlap ``boundary`` are
    ignored. The others are passed to ``__recalculate_hit`` and, when that
    call returns a hit, stored in ``new_transcript.blast_hits``.

    :param new_transcript: the split transcript
    :type new_transcript: Mikado.loci_objects.Transcript
    :param boundary: tuple(start, end) of the boundary of the new transcript
    :type boundary: tuple(int, int)
    :param transcript: the original transcript
    :type transcript: Mikado.loci_objects.Transcript
    :return:
    """
    for original_hit in transcript.blast_hits:
        intersecting = overlap(
            (original_hit["query_start"], original_hit["query_end"]),
            boundary) > 0
        if not intersecting:
            transcript.logger.debug(
                "Ignoring hit %s as it is not intersecting", original_hit)
            continue

        # The threshold is read from the configuration of the original transcript
        threshold = transcript.json_conf["pick"]["chimera_split"][
            "blast_params"]["minimal_hsp_overlap"]
        recalculated = __recalculate_hit(original_hit, boundary, threshold)

        if recalculated is not None:
            transcript.logger.debug(
                """Hit %s, previous id/query_al_length/t_al_length %f/%f/%f, novel %f/%f/%f""",
                recalculated["target"],
                original_hit["global_identity"],
                original_hit["query_aligned_length"],
                original_hit["target_aligned_length"],
                recalculated["global_identity"],
                recalculated["query_aligned_length"],
                recalculated["target_aligned_length"])
            new_transcript.blast_hits.append(recalculated)
        else:
            transcript.logger.debug(
                "Hit %s did not pass overlap checks for %s",
                original_hit["target"], new_transcript.id)
def __recalculate_hit(hit, boundary, minimal_overlap):
    """
    Recalculate coverage/identity of a BLAST hit restricted to a boundary.

    Only the HSPs whose query span overlaps ``boundary`` by at least
    ``minimal_overlap * (boundary[1] + 1 - boundary[0])`` are retained.
    Query/target aligned lengths, start/end positions, global identity and
    positives, bits and evalue are then recomputed from the retained HSPs.

    :param hit: the original hit; every key except "hsps" is copied over
    :param boundary: tuple(start, end) of the region of interest
    :param minimal_overlap: minimum overlap, as a fraction of the boundary length
    :return: a new hit dictionary, or None if no HSP passes the overlap filter
    """
    # Characters of the HSP "match" string that mark an identical position
    valid_matches = ({chr(x) for x in range(65, 91)}
                     | {chr(x) for x in range(97, 123)}
                     | {"|"})

    hit_dict = {key: hit[key] for key in hit.keys() if key != "hsps"}

    hsp_dict_list = []
    q_intervals = []
    t_intervals = []
    identical_positions, positives = set(), set()

    # Loop-invariant: minimum overlap (in positions) required to keep an HSP
    required = minimal_overlap * (boundary[1] + 1 - boundary[0])

    for hsp in hit["hsps"]:
        query_span = (hsp["query_hsp_start"], hsp["query_hsp_end"])
        if overlap(query_span, boundary) < required:
            continue

        hsp_dict_list.append(hsp)
        q_intervals.append(query_span)
        t_intervals.append((hsp["target_hsp_start"], hsp["target_hsp_end"]))

        # Walk the match string, tracking the current position on the query
        query_pos = hsp["query_hsp_start"] - 1
        for amino in hsp["match"]:
            if amino in valid_matches or amino == "+":
                query_pos += 1
                positives.add(query_pos)
                if amino != "+":
                    identical_positions.add(query_pos)
            elif amino == "_":  # Gap in the target sequence
                query_pos += 1

    if not hsp_dict_list:
        return None

    q_merged_intervals = sorted(merge(q_intervals), key=operator.itemgetter(0, 1))
    q_aligned = sum(tup[1] - tup[0] + 1 for tup in q_merged_intervals)
    hit_dict["query_aligned_length"] = q_aligned
    hit_dict["query_start"] = q_merged_intervals[0][0]
    hit_dict["query_end"] = q_merged_intervals[-1][1]

    t_merged_intervals = sorted(merge(t_intervals), key=operator.itemgetter(0, 1))
    t_aligned = sum(tup[1] - tup[0] + 1 for tup in t_merged_intervals)
    hit_dict["target_aligned_length"] = t_aligned
    hit_dict["target_start"] = t_merged_intervals[0][0]
    hit_dict["target_end"] = t_merged_intervals[-1][1]

    # Both percentages are relative to the merged aligned length on the query
    hit_dict["global_identity"] = len(identical_positions) * 100 / q_aligned
    hit_dict["global_positives"] = len(positives) * 100 / q_aligned
    hit_dict["hsps"] = hsp_dict_list
    hit_dict["bits"] = max(x["hsp_bits"] for x in hsp_dict_list)
    hit_dict["evalue"] = min(x["hsp_evalue"] for x in hsp_dict_list)

    return hit_dict
def check_split_by_blast(transcript, cds_boundaries):
    """
    This method verifies if a transcript with multiple ORFs has support by
    BLAST to NOT split it into its different components.

    The minimal overlap between ORF and HSP is defined inside the JSON at
    the key ["chimera_split"]["blast_params"]["minimal_hsp_overlap"];
    basically, we consider a HSP a hit only if the overlap is over a certain
    threshold and the HSP evalue under a certain threshold.

    The split by CDS can be executed in three different ways - PERMISSIVE,
    LENIENT, STRINGENT:

    - PERMISSIVE: split if two CDSs do not have hits in common, even when
      one or both do not have a hit at all.
    - STRINGENT: split only if two CDSs have hits and none of those is in
      common between them.
    - LENIENT: split if *both* lack hits, OR *both* have hits and none of
      those is in common.

    :param transcript: the transcript instance
    :type transcript: Mikado.loci_objects.transcript.Transcript
    :param cds_boundaries: mapping of (start, end) CDS runs to their content
    :return: cds_boundaries
    :rtype: dict
    """
    # Establish the minimum overlap between an ORF and a BLAST hit to
    # consider it to establish belongingness
    blast_params = transcript.json_conf["pick"]["chimera_split"]["blast_params"]
    minimal_overlap = blast_params["minimal_hsp_overlap"]

    cds_hit_dict = SortedDict().fromkeys(cds_boundaries.keys())
    for key in cds_hit_dict:
        cds_hit_dict[key] = collections.defaultdict(list)

    # BUG, this is a hacky fix
    if not hasattr(transcript, "blast_hits"):
        transcript.logger.warning(
            "BLAST hits store lost for %s! Creating a mock one to avoid a crash",
            transcript.id)
        transcript.blast_hits = []

    transcript.logger.debug("%s has %d possible hits",
                            transcript.id, len(transcript.blast_hits))

    # Determine for each CDS which are the hits available
    min_eval = transcript.json_conf["pick"]['chimera_split']['blast_params']['hsp_evalue']
    for hit in transcript.blast_hits:
        for hsp in (_hsp for _hsp in hit["hsps"] if _hsp["hsp_evalue"] <= min_eval):
            for cds_run in cds_boundaries:
                # If I have a valid hit b/w the CDS region and the hit,
                # add the name to the set
                overlap_threshold = minimal_overlap * (cds_run[1] + 1 - cds_run[0])
                overl = overlap(cds_run, (hsp['query_hsp_start'], hsp['query_hsp_end']))

                # Log the computed overlap value ("overl"), not the "overlap" function
                if overl >= overlap_threshold:
                    cds_hit_dict[cds_run][(hit["target"], hit["target_length"])].append(hsp)
                    transcript.logger.debug(
                        "Overlap %s passed for %s between %s CDS and %s HSP (threshold %s)",
                        overl,
                        transcript.id,
                        cds_run,
                        (hsp['query_hsp_start'], hsp['query_hsp_end']),
                        overlap_threshold)
                else:
                    transcript.logger.debug(
                        "Overlap %s rejected for %s between %s CDS and %s HSP (threshold %s)",
                        overl,
                        transcript.id,
                        cds_run,
                        (hsp['query_hsp_start'], hsp['query_hsp_end']),
                        overlap_threshold)

    transcript.logger.debug("Final cds_hit_dict for %s: %s", transcript.id, cds_hit_dict)

    final_boundaries = SortedDict()
    for boundary in __get_boundaries_from_blast(transcript, cds_boundaries, cds_hit_dict):
        if len(boundary) == 1:
            # A single CDS run: keep it as it is
            assert len(boundary[0]) == 2
            boundary = boundary[0]
            final_boundaries[boundary] = cds_boundaries[boundary]
        else:
            # Several CDS runs kept together: merge them under one spanning key
            nboun = (boundary[0][0], boundary[-1][1])
            final_boundaries[nboun] = []
            for boun in boundary:
                final_boundaries[nboun].extend(cds_boundaries[boun])

    transcript.logger.debug("Final boundaries for %s: %s", transcript.id, final_boundaries)

    return final_boundaries.copy()
def create_transcript(tid: str, parent: str, lines: List[GtfLine], args: argparse.Namespace):
    """
    Generator yielding the transcript(s) described by a group of GTF lines.

    - If the lines belong to more than one chromosome, the function recurses
      on each chromosome, appending ".<chrom>" to both ``tid`` and ``parent``.
    - Otherwise the exon lines are sorted and scanned in order. Consecutive
      exons are merged when they overlap, or when their distance is at most
      ``args.min_intron`` and ``args.split`` is False. The transcript is split
      when the distance is at most ``args.min_intron`` and ``args.split`` is
      True, or when it exceeds ``args.max_intron``. Split products receive a
      ".A", ".B", ... suffix on both ID and parent.

    :param tid: the ID of the transcript
    :param parent: the ID of its parent
    :param lines: the GTF lines of the transcript
    :param args: namespace providing ``min_intron``, ``max_intron``, ``split``
    :return: yields finalized Transcript objects
    :raises IndexError: if the lines are on one chromosome (or ``lines`` is
        empty) and none of them is an exon
    """
    chroms = defaultdict(list)
    for line in lines:
        chroms[line.chrom].append(line)

    if len(chroms) > 1:
        # Recursively process each chromosome on its own
        for chrom in chroms:
            newtid = tid + "." + chrom
            newparent = parent + "." + chrom
            for transcript in create_transcript(newtid, newparent, chroms[chrom], args):
                # NOTE(review): these checks only hold when the recursive call
                # does not split the transcript further (split products get an
                # extra letter suffix) - confirm this is intended.
                assert transcript.id == newtid, (newtid, transcript.id)
                assert transcript.parent[0] == newparent
                yield transcript
        return

    # Now we are sure that we only have one chromosome
    exons = sorted([line for line in lines if line.is_exon],
                   key=operator.attrgetter("chrom", "start", "end"))

    if len(exons) == 1:
        transcript = Transcript(exons[0])
        transcript.id = tid
        transcript.parent = parent
        transcript.finalize()
        yield transcript
        return

    new_exons = deque()
    identifier = ord("A") - 1  # incremented before use: first suffix is "A"
    current = exons[0]

    for exon in exons[1:]:
        gap = exon.start - current.end + 1
        overlapping = overlap((exon.start, exon.end), (current.start, current.end)) > 0

        if overlapping or (gap <= args.min_intron and args.split is False):
            # Merge the two exons. Exons are sorted by start, so the next one
            # may be contained in the current one: never move the end backwards.
            current.end = max(current.end, exon.end)
        elif (gap <= args.min_intron and args.split is True) or gap > args.max_intron:
            # Close the current transcript here and start a new one
            new_exons.append(current)
            transcript = Transcript(new_exons.popleft())
            transcript.add_exons(new_exons)
            transcript.finalize()
            identifier += 1
            transcript.parent = parent + "." + chr(identifier)
            transcript.id = tid + "." + chr(identifier)
            yield transcript
            current = exon
            new_exons = deque()
        else:
            new_exons.append(current)
            current = exon

    # Flush the exons still pending
    new_exons.append(current)
    transcript = Transcript(new_exons.popleft())
    transcript.add_exons(new_exons)
    if identifier == ord("A") - 1:
        # No split occurred: keep the original identifiers
        transcript.id = tid
        transcript.parent = parent
    else:
        identifier += 1
        transcript.id = tid + "." + chr(identifier)
        transcript.parent = parent + "." + chr(identifier)
    transcript.finalize()
    yield transcript
def _parse_arguments():
    """Build the command-line parser and return the parsed arguments."""
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("-o", "--out", type=str, default="promoters")
    parser.add_argument("-l", "--log", default=None)
    parser.add_argument("-lv", "--log-level", default="WARN",
                        choices=["DEBUG", "INFO", "WARN", "ERROR", "CRITICAL"],
                        dest="log_level")
    parser.add_argument("-d", "--distances", nargs="+", type=int,
                        default=[1000, 2000, 5000])
    parser.add_argument("-nn", "--no-neighbours", dest="no_neighbours",
                        action="store_true", default=False,
                        help="Ignore the presence of neighbours when extracting genes.")
    parser.add_argument("-eu", "--exclude-utr", dest="exclude_utr",
                        default=False, action="store_true")
    parser.add_argument("-z", "--gzip", default=False, action="store_true",
                        help="Output will be compressed in GZip format.")
    parser.add_argument("genome")
    parser.add_argument("gff3")
    parser.add_argument("gene_list")
    return parser.parse_args()


def _write_promoter(handle, sequence, gid, distance, description):
    """Write one promoter sequence to ``handle`` as a FASTA record."""
    record = SeqRecord(Seq(sequence),
                       id="{}-prom-{}".format(gid, distance),
                       description=description)
    print(record.format("fasta"), file=handle, end='')


def _extract_promoters(args, logger, genome, genes, positions, indexer,
                       max_distance, out_files, gids):
    """Write, for each requested gene and distance, its promoter sequence."""

    def is_before(gid_coords, fkey, fstrand):
        # Keep intervals that overlap/touch the gene or lie on its promoter side
        touching = utilities.overlap(gid_coords, fkey) >= 0
        if fstrand == "-":
            return touching or gid_coords[1] < fkey[0]
        return touching or gid_coords[0] > fkey[1]

    for gid in gids:
        if gid not in genes:
            logger.error("%s not found in the index!", gid)
            continue

        chrom, start, end, strand = (genes[gid].chrom, genes[gid].start,
                                     genes[gid].end, genes[gid].strand)
        if chrom not in genome:
            logger.error("Chromosome %s not found in the genome!", chrom)
            continue

        # If the gene is on the minus strand, the promoter is further down
        if strand == "-":
            key = (start, min(end + max_distance, len(genome[chrom])))
        else:
            # otherwise it is on the 5' side
            key = (max(0, start - max_distance), end)

        # Find all genes which are near
        if args.no_neighbours is False:
            # This is a list of the form [((start, end), distance), ...] where
            # "(start, end)" is a key for the "positions" dictionary
            found = Assigner.find_neighbours(
                indexer.get(chrom, IntervalTree()), key, distance=0)
            # Keep the genes in the neighbourhood, removing the obvious case
            # of the identity
            neighbours = [
                _[0] for _ in found
                if is_before((start, end), _[0], strand)
                and gid not in positions[chrom][_[0]]
            ]
        else:
            neighbours = []

        if not neighbours:
            # No neighbours found, we can grab everything
            for distance in args.distances:
                try:
                    if strand == "-":
                        chunk = (max(0, end),
                                 min(end + distance, len(genome[chrom])))
                        seq = genome[chrom][chunk[0]:chunk[1]].reverse.complement.seq
                    else:
                        chunk = (max(0, start - 1 - distance), start - 1)
                        seq = genome[chrom][chunk[0]:chunk[1]].seq
                    description = "{}{}:{}-{}".format(chrom, strand, chunk[0], chunk[1])
                    _write_promoter(out_files[distance], seq, gid, distance, description)
                except ValueError as err:
                    logger.error(
                        "Error extracting the promoter for %s, distance %d. Error:\n%s",
                        gid, distance, err)
                    continue
            continue

        # We have some neighbours, we have to select the maximum distance we can go to
        logger.warning("{} neighbours found for {}: {}".format(
            len(neighbours), gid, neighbours))
        if any(utilities.overlap((start, end), _) >= 0 for _ in neighbours):
            logger.warning("Overlapping genes found for {}. Skipping".format(gid))
            continue

        for distance in args.distances:
            try:
                if strand == "-":
                    max_point = min([_[0] for _ in neighbours])
                    if end + distance > max_point:
                        continue
                    chunk = (max(0, end),
                             min(max_point, min(end + distance, len(genome[chrom]))))
                    seq = genome[chrom][chunk[0]:chunk[1]].reverse.complement.seq
                    # NOTE(review): coordinates are reported end-first here but
                    # start-first in the no-neighbours case - confirm intended.
                    description = "{}{}:{}-{}".format(chrom, strand, chunk[1], chunk[0])
                else:
                    min_point = max([_[1] for _ in neighbours])
                    if start - distance < min_point:
                        continue
                    chunk = (max(0, start - 1 - distance), start - 1)
                    seq = genome[chrom][chunk[0]:chunk[1]].seq
                    description = "{}{}:{}-{}".format(chrom, strand, chunk[0], chunk[1])
                _write_promoter(out_files[distance], seq, gid, distance, description)
            except ValueError as err:
                logger.error(
                    "Error extracting the promoter for %s, distance %d. Error:\n%s",
                    gid, distance, err)
                continue


def main():
    """
    Command-line entry point: extract the promoter regions of a list of genes.

    The function parses the command line, configures logging, and opens one
    FASTA output (optionally gzipped) per positive distance. It then loads
    the genome and the GFF3 index, and writes the sequence upstream of each
    gene in the gene list for each distance. When neighbours are considered,
    genes with overlapping neighbours are skipped, and so are distances that
    would reach into a neighbouring gene.

    All output files are closed before returning, even when an error occurs.
    The process exits with status 1 if no positive distance was requested.
    """
    args = _parse_arguments()

    logging.basicConfig(
        filename=args.log,
        format="{asctime} - {name} - {filename}:{lineno} - {levelname} - "
               "{funcName} - {processName} - {message}",
        style="{",
        level=args.log_level)
    logger = logging.getLogger('extract_promoter_regions')

    max_distance = max(args.distances)
    args.distances = sorted([_ for _ in args.distances if _ > 0])
    if not args.distances:
        # We are not inside an exception handler, hence error() rather than exception()
        logger.error("I need at least one positive integer distance!")
        sys.exit(1)

    prefix = os.path.splitext(args.out)[0]
    out_files = dict()
    try:
        for distance in args.distances:
            if args.gzip is True:
                out_files[distance] = gzip.open(
                    "{}-{}bp.fasta.gz".format(prefix, distance), "wt")
            else:
                out_files[distance] = open(
                    "{}-{}bp.fasta".format(prefix, distance), "wt")

        logger.info("Starting to load the genome")
        genome = pyfaidx.Fasta(args.genome)
        logger.info("Loaded the genome")

        logger.info("Starting to load the GFF3 index")
        with open(args.gff3) as gff3:
            # Use an instance: setting attributes on the Namespace class itself
            # would leak them to every other Namespace object.
            namespace = argparse.Namespace(reference=gff3,
                                           exclude_utr=args.exclude_utr,
                                           protein_coding=False)
            # Use Mikado compare functions to load the index from the GFF3
            # "genes" is a dictionary of Gene objects, having as keys the gene names
            # "positions" is a dictionary of the form:
            # [chrom][(start, end)] = [GID1, GID2, ...]
            genes, positions = load_index(namespace, logger)
            # Create a dictionary of interval trees, one per chromosome
            indexer = {
                chrom: IntervalTree.from_tuples(positions[chrom].keys())
                for chrom in positions
            }
        logger.info("Loaded the index")

        with open(args.gene_list) as gene_list:
            gids = [_.rstrip() for _ in gene_list]

        logger.info("Starting to extract sequences for {} genes".format(len(gids)))
        _extract_promoters(args, logger, genome, genes, positions, indexer,
                           max_distance, out_files, gids)
        logger.info("Finished")
    finally:
        # Closing is required to flush the (possibly gzipped) output streams
        for handle in out_files.values():
            handle.close()
def __recalculate_hit(hit, boundary, minimal_overlap):
    """
    Static method to recalculate coverage/identity for new hits.

    The HSPs of ``hit`` are filtered, keeping those whose query span
    overlaps ``boundary`` by at least ``minimal_overlap`` times the length
    of the boundary (``boundary[1] + 1 - boundary[0]``). The summary fields
    of the hit (aligned lengths, start/end on query and target, global
    identity/positives, bits, evalue) are rebuilt from the surviving HSPs.

    :param hit: the hit to recalculate; all keys but "hsps" are copied
    :param boundary: tuple(start, end) delimiting the new transcript
    :param minimal_overlap: minimal fraction of the boundary an HSP must cover
    :return: the recalculated hit as a new dictionary, or None when no HSP
        survives the filter
    """
    # A-Z, a-z and "|" mark identical positions in the match string
    letters = [chr(code) for code in range(65, 91)]
    letters.extend(chr(code) for code in range(97, 123))
    identity_symbols = set(letters) | {"|"}

    new_hit = dict()
    for key in hit.keys():
        if key != "hsps":
            new_hit[key] = hit[key]

    kept_hsps = []
    query_intervals = []
    target_intervals = []
    identical_positions, positives = set(), set()

    # Does not depend on the HSP, so it is computed once
    min_shared = minimal_overlap * (boundary[1] + 1 - boundary[0])

    for hsp in hit["hsps"]:
        shared = overlap((hsp["query_hsp_start"], hsp["query_hsp_end"]), boundary)
        if shared < min_shared:
            continue

        kept_hsps.append(hsp)
        query_intervals.append((hsp["query_hsp_start"], hsp["query_hsp_end"]))
        target_intervals.append((hsp["target_hsp_start"], hsp["target_hsp_end"]))

        position = hsp["query_hsp_start"] - 1
        for symbol in hsp["match"]:
            if symbol in identity_symbols or symbol == "+":
                position += 1
                positives.add(position)
                if symbol != "+":
                    identical_positions.add(position)
            elif symbol == "_":
                # Gap in the target sequence: the query still advances
                position += 1

    if not kept_hsps:
        return None

    by_coordinates = operator.itemgetter(0, 1)

    query_merged = sorted(merge(query_intervals), key=by_coordinates)
    query_aligned = sum(tup[1] - tup[0] + 1 for tup in query_merged)
    new_hit["query_aligned_length"] = query_aligned
    new_hit["query_start"] = query_merged[0][0]
    new_hit["query_end"] = query_merged[-1][1]

    target_merged = sorted(merge(target_intervals), key=by_coordinates)
    target_aligned = sum(tup[1] - tup[0] + 1 for tup in target_merged)
    new_hit["target_aligned_length"] = target_aligned
    new_hit["target_start"] = target_merged[0][0]
    new_hit["target_end"] = target_merged[-1][1]

    # Percentages are computed over the merged aligned length of the query
    new_hit["global_identity"] = len(identical_positions) * 100 / query_aligned
    new_hit["global_positives"] = len(positives) * 100 / query_aligned
    new_hit["hsps"] = kept_hsps
    new_hit["bits"] = max(x["hsp_bits"] for x in kept_hsps)
    new_hit["evalue"] = min(x["hsp_evalue"] for x in kept_hsps)

    return new_hit
def check_split_by_blast(transcript, cds_boundaries):
    """
    This method verifies if a transcript with multiple ORFs has support by
    BLAST to NOT split it into its different components.

    The minimal overlap between ORF and HSP is defined inside the JSON at
    the key ["chimera_split"]["blast_params"]["minimal_hsp_overlap"];
    basically, we consider a HSP a hit only if the overlap is over a certain
    threshold and the HSP evalue under a certain threshold.

    The split by CDS can be executed in three different ways - PERMISSIVE,
    LENIENT, STRINGENT:

    - PERMISSIVE: split if two CDSs do not have hits in common, even when
      one or both do not have a hit at all.
    - STRINGENT: split only if two CDSs have hits and none of those is in
      common between them.
    - LENIENT: split if *both* lack hits, OR *both* have hits and none of
      those is in common.

    :param transcript: the transcript instance
    :type transcript: Mikado.loci_objects.transcript.Transcript
    :param cds_boundaries: mapping of (start, end) CDS runs to their content
    :return: cds_boundaries
    :rtype: dict
    """
    # Establish the minimum overlap between an ORF and a BLAST hit to
    # consider it to establish belongingness
    minimal_overlap = transcript.json_conf["pick"]["chimera_split"][
        "blast_params"]["minimal_hsp_overlap"]

    cds_hit_dict = SortedDict().fromkeys(cds_boundaries.keys())
    for key in cds_hit_dict:
        cds_hit_dict[key] = collections.defaultdict(list)

    # BUG, this is a hacky fix
    if not hasattr(transcript, "blast_hits"):
        transcript.logger.warning(
            "BLAST hits store lost for %s! Creating a mock one to avoid a crash",
            transcript.id)
        transcript.blast_hits = []

    transcript.logger.debug("%s has %d possible hits",
                            transcript.id, len(transcript.blast_hits))

    # Determine for each CDS which are the hits available
    min_eval = transcript.json_conf["pick"]['chimera_split']['blast_params'][
        'hsp_evalue']
    for hit in transcript.blast_hits:
        target_key = (hit["target"], hit["target_length"])
        for hsp in hit["hsps"]:
            if not hsp["hsp_evalue"] <= min_eval:
                # HSP evalue above the threshold: not considered
                continue
            hsp_span = (hsp['query_hsp_start'], hsp['query_hsp_end'])
            for cds_run in cds_boundaries:
                # If I have a valid hit b/w the CDS region and the hit,
                # add the name to the set
                overlap_threshold = minimal_overlap * (cds_run[1] + 1 - cds_run[0])
                overl = overlap(cds_run, hsp_span)

                # The value to report is "overl"; passing the "overlap"
                # function itself would only print its repr.
                if overl >= overlap_threshold:
                    cds_hit_dict[cds_run][target_key].append(hsp)
                    transcript.logger.debug(
                        "Overlap %s passed for %s between %s CDS and %s HSP (threshold %s)",
                        overl, transcript.id, cds_run, hsp_span, overlap_threshold)
                else:
                    transcript.logger.debug(
                        "Overlap %s rejected for %s between %s CDS and %s HSP (threshold %s)",
                        overl, transcript.id, cds_run, hsp_span, overlap_threshold)

    transcript.logger.debug("Final cds_hit_dict for %s: %s",
                            transcript.id, cds_hit_dict)

    final_boundaries = SortedDict()
    for boundary in __get_boundaries_from_blast(transcript,
                                                cds_boundaries,
                                                cds_hit_dict):
        if len(boundary) == 1:
            # Only one CDS run in this group: it is kept unchanged
            assert len(boundary[0]) == 2
            boundary = boundary[0]
            final_boundaries[boundary] = cds_boundaries[boundary]
        else:
            # Multiple CDS runs in this group: store them under the key
            # going from the start of the first to the end of the last
            nboun = (boundary[0][0], boundary[-1][1])
            final_boundaries[nboun] = []
            for boun in boundary:
                final_boundaries[nboun].extend(cds_boundaries[boun])

    transcript.logger.debug("Final boundaries for %s: %s",
                            transcript.id, final_boundaries)

    return final_boundaries.copy()