Example #1
0
 def is_before(gid_coords, fkey, fstrand):
     """
     Decide whether the interval ``fkey`` should be kept as a neighbour of
     the gene located at ``gid_coords``.

     The answer is positive when ``utilities.overlap`` reports a value of
     zero or more for the two intervals. Otherwise it depends on the strand:
     for "-" the gene must end before ``fkey`` starts, for any other strand
     the gene must start after ``fkey`` ends.

     :param gid_coords: (start, end) of the gene
     :param fkey: (start, end) of the candidate neighbour
     :param fstrand: strand of the gene
     :return: True if the candidate satisfies the condition above
     """
     if utilities.overlap(gid_coords, fkey) >= 0:
         return True
     if fstrand == "-":
         return gid_coords[1] < fkey[0]
     return gid_coords[0] > fkey[1]
Example #2
0
def __check_collisions(transcript, nspan, spans):

    """
    Verify that a newly defined span does not collide with any of the
    spans defined before it.

    :param transcript: object providing the ``id`` and ``logger`` used for reporting
    :param nspan: the new span to be checked
    :param spans: the previously defined spans
    :return: None
    :raises InvalidTranscript: if the overlap between ``nspan`` and any of
        the previous spans is greater than zero
    """

    if not len(spans):
        return

    for previous in spans:
        shared = overlap(previous, nspan)

        transcript.logger.debug(
            "Comparing start-ends for split of %s. SpanA: %s SpanB: %s Overlap: %d",
            transcript.id, previous, nspan, shared)

        if shared <= 0:
            # No collision with this span, move on to the next one
            continue

        err_message = "Invalid overlap for {0}! T1: {1}. T2: {2}".format(
            transcript.id, previous, nspan)
        transcript.logger.error(err_message)
        raise InvalidTranscript(err_message)
Example #3
0
    def test_overlap(self):
        """
        Check that Abstractlocus.overlap returns 100 for two identical
        (100, 200) intervals and agrees with the stand-alone overlap function.
        :return:
        """

        interval = (100, 200)

        self.assertEqual(Abstractlocus.overlap(interval, interval), 100)

        from_class = Abstractlocus.overlap(interval, interval)
        from_function = overlap(interval, interval)
        self.assertEqual(from_class, from_function)
Example #4
0
    def test_noCDSOverlap(self):
        """
        Two coding transcripts whose combined CDS spans have a non-positive
        overlap must be reported as intersecting by default, and as
        non-intersecting when only the CDS is considered.
        """

        # Replace the CDS of the fixture transcript with a new one
        self.t1.strip_cds()
        self.assertEqual(self.t1.combined_cds_introns, set())
        self.t1.finalized = False
        self.t1.add_exons([(401, 500), (601, 700), (1001, 1100)], "CDS")
        self.t1.finalize()

        # Second transcript, built from scratch
        other = Transcript()
        other.logger = self.logger
        other.chrom, other.strand = "Chr1", "+"
        other.score = 1
        other.id, other.parent = "G2.1", "G2"
        other.start, other.end = 101, 1470
        other.add_exons([(101, 510), (601, 700), (960, 1350), (1421, 1470)])
        other.add_exons([(1201, 1350), (1421, 1450)], "CDS")
        other.finalize()

        self.assertTrue(self.t1.is_coding)
        self.assertTrue(other.is_coding)

        # The CDS spans are also used as the failure message of the assertion
        t1_cds = (self.t1.combined_cds_start, self.t1.combined_cds_end)
        other_cds = (other.combined_cds_start, other.combined_cds_end)
        self.assertGreaterEqual(0, overlap(t1_cds, other_cds),
                                [t1_cds, other_cds])

        whole_transcript = MonosublocusHolder.is_intersecting(
            self.t1, other, logger=self.logger)
        self.assertTrue(whole_transcript)

        cds_only = MonosublocusHolder.is_intersecting(
            self.t1, other, cds_only=True, logger=self.logger)
        self.assertFalse(cds_only)
Example #5
0
def __load_blast_hits(new_transcript, boundary, transcript):

    """
    Transfer onto a split transcript the BLAST hits of the original
    transcript that intersect its boundary, after recalculating them.

    :param new_transcript: the splitted transcript
    :type new_transcript: Mikado.loci_objects.Transcript
    :param boundary: tuple(start, end) of the boundary of the new transcript
    :type boundary: tuple(int, int)
    :param transcript:  the original transcript
    :type transcript: Mikado.loci_objects.Transcript
    :return:
    """

    # Whitespace inside this template is part of the logged message.
    recalculated_template = """Hit %s,
                                        previous id/query_al_length/t_al_length %f/%f/%f,
                                        novel %f/%f/%f"""

    for hit in transcript.blast_hits:
        query_span = (hit["query_start"], hit["query_end"])
        if overlap(query_span, boundary) <= 0:
            transcript.logger.debug("Ignoring hit %s as it is not intersecting", hit)
            continue

        blast_params = transcript.json_conf["pick"]["chimera_split"]["blast_params"]
        minimal_overlap = blast_params["minimal_hsp_overlap"]
        new_hit = __recalculate_hit(hit, boundary, minimal_overlap)

        if new_hit is None:
            transcript.logger.debug("Hit %s did not pass overlap checks for %s",
                                    hit["target"], new_transcript.id)
            continue

        transcript.logger.debug(recalculated_template,
                                new_hit["target"],
                                hit["global_identity"],
                                hit["query_aligned_length"],
                                hit["target_aligned_length"],
                                new_hit["global_identity"],
                                new_hit["query_aligned_length"],
                                new_hit["target_aligned_length"])
        new_transcript.blast_hits.append(new_hit)
Example #6
0
def __check_collisions(transcript, nspan, spans):
    """
    Ensure that the span of a new transcript does not collide with the span
    of any transcript defined earlier.

    :param transcript: object whose ``id`` and ``logger`` are used for reporting
    :param nspan: span of the new transcript
    :param spans: spans of the transcripts defined so far
    :return: None
    :raises InvalidTranscript: when ``nspan`` has a positive overlap with
        one of the spans in ``spans``
    """

    debug_template = "Comparing start-ends for split of %s. SpanA: %s SpanB: %s Overlap: %d"
    error_template = "Invalid overlap for {0}! T1: {1}. T2: {2}"

    if len(spans) < 1:
        return

    for old_span in spans:
        amount = overlap(old_span, nspan)
        transcript.logger.debug(debug_template,
                                transcript.id, old_span, nspan, amount)

        if amount > 0:
            err_message = error_template.format(transcript.id, old_span, nspan)
            transcript.logger.error(err_message)
            raise InvalidTranscript(err_message)
Example #7
0
def __load_blast_hits(new_transcript, boundary, transcript):
    """
    Load into the new splitted transcript the BLAST hits of the original
    transcript which intersect the boundary of the new one.

    Each intersecting hit is recalculated against the boundary; hits for
    which the recalculation returns None are discarded.

    :param new_transcript: the splitted transcript
    :type new_transcript: Mikado.loci_objects.Transcript
    :param boundary: tuple(start, end) of the boundary of the new transcript
    :type boundary: tuple(int, int)
    :param transcript:  the original transcript
    :type transcript: Mikado.loci_objects.Transcript
    :return:
    """

    for hit in transcript.blast_hits:
        intersecting = overlap((hit["query_start"], hit["query_end"]),
                               boundary) > 0
        if not intersecting:
            transcript.logger.debug(
                "Ignoring hit %s as it is not intersecting", hit)
            continue

        minimal_overlap = transcript.json_conf["pick"]["chimera_split"][
            "blast_params"]["minimal_hsp_overlap"]
        new_hit = __recalculate_hit(hit, boundary, minimal_overlap)

        if new_hit is None:
            transcript.logger.debug(
                "Hit %s did not pass overlap checks for %s", hit["target"],
                new_transcript.id)
            continue

        # Values reported: target, then old and new identity / aligned lengths
        old_values = (hit["global_identity"],
                      hit["query_aligned_length"],
                      hit["target_aligned_length"])
        new_values = (new_hit["global_identity"],
                      new_hit["query_aligned_length"],
                      new_hit["target_aligned_length"])
        transcript.logger.debug(
            """Hit %s,
                                        previous id/query_al_length/t_al_length %f/%f/%f,
                                        novel %f/%f/%f""", new_hit["target"],
            *old_values, *new_values)

        new_transcript.blast_hits.append(new_hit)
Example #8
0
def __recalculate_hit(hit, boundary, minimal_overlap):
    """Recalculate coverage/identity of a hit restricted to a boundary.

    Only the HSPs whose overlap with ``boundary`` (computed on the query
    coordinates) is at least ``minimal_overlap`` times the length of the
    boundary are retained. Aligned lengths, start/end coordinates, global
    identity/positives, bits and evalue are then recomputed from the
    retained HSPs only.

    :param hit: dictionary describing the hit. It must have an "hsps" key
        holding a list of HSP dictionaries; every other key is copied as-is
        into the result before the recalculated values are written.
    :param boundary: tuple(start, end) of the region to consider
    :param minimal_overlap: minimum overlap, as a fraction of the boundary length
    :return: a new hit dictionary, or None if no HSP passes the overlap check
    """

    # Characters of the HSP match line that denote an identical position
    valid_matches = set("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                        "abcdefghijklmnopqrstuvwxyz"
                        "|")

    hit_dict = {key: value for key, value in hit.items() if key != "hsps"}

    hsp_dict_list = []
    q_intervals = []
    t_intervals = []

    identical_positions, positives = set(), set()

    # The threshold does not depend on the HSP: compute it only once
    overlap_threshold = minimal_overlap * (boundary[1] + 1 - boundary[0])

    for hsp in hit["hsps"]:
        hsp_overlap = overlap((hsp["query_hsp_start"], hsp["query_hsp_end"]), boundary)
        if hsp_overlap < overlap_threshold:
            continue

        hsp_dict_list.append(hsp)
        q_intervals.append((hsp["query_hsp_start"], hsp["query_hsp_end"]))
        t_intervals.append((hsp["target_hsp_start"], hsp["target_hsp_end"]))

        # Walk along the match line, tracking the position on the query
        query_pos = hsp["query_hsp_start"] - 1
        for amino in hsp["match"]:
            if amino in valid_matches or amino == "+":
                query_pos += 1
                positives.add(query_pos)
                if amino != "+":
                    identical_positions.add(query_pos)
            elif amino == "_":  # Gap in the target sequence
                query_pos += 1

    if not hsp_dict_list:
        return None

    q_merged_intervals = sorted(merge(q_intervals), key=operator.itemgetter(0, 1))
    q_aligned = sum(tup[1] - tup[0] + 1 for tup in q_merged_intervals)
    hit_dict["query_aligned_length"] = q_aligned
    hit_dict["query_start"] = q_merged_intervals[0][0]
    hit_dict["query_end"] = q_merged_intervals[-1][1]

    t_merged_intervals = sorted(merge(t_intervals), key=operator.itemgetter(0, 1))
    t_aligned = sum(tup[1] - tup[0] + 1 for tup in t_merged_intervals)
    hit_dict["target_aligned_length"] = t_aligned
    hit_dict["target_start"] = t_merged_intervals[0][0]
    hit_dict["target_end"] = t_merged_intervals[-1][1]

    # Percentages are relative to the merged aligned length on the query
    hit_dict["global_identity"] = len(identical_positions) * 100 / q_aligned
    hit_dict["global_positives"] = len(positives) * 100 / q_aligned
    hit_dict["hsps"] = hsp_dict_list
    hit_dict["bits"] = max(x["hsp_bits"] for x in hsp_dict_list)
    hit_dict["evalue"] = min(x["hsp_evalue"] for x in hsp_dict_list)

    return hit_dict
Example #9
0
def check_split_by_blast(transcript, cds_boundaries):

    """
    This method verifies if a transcript with multiple ORFs has support by BLAST to
    NOT split it into its different components.

    The minimal overlap between ORF and HSP is defined inside the JSON at the key
        ["chimera_split"]["blast_params"]["minimal_hsp_overlap"]
    basically, we consider a HSP a hit only if the overlap is over a certain threshold
    and the HSP evalue under a certain threshold.

    The split by CDS can be executed in three different ways - PERMISSIVE, LENIENT, STRINGENT:

    - PERMISSIVE: split if two CDSs do not have hits in common,
    even when one or both do not have a hit at all.
    - STRINGENT: split only if two CDSs have hits and none
    of those is in common between them.
    - LENIENT: split if *both* lack hits, OR *both* have hits and none
    of those is in common.

    :param transcript: the transcript instance
    :type transcript: Mikado.loci_objects.transcript.Transcript
    :param cds_boundaries: mapping whose keys are the (start, end) tuples of
        the CDS runs and whose values are lists (they are concatenated when
        boundaries are merged)
    :return: cds_boundaries
    :rtype: dict
    """

    # Establish the minimum overlap between an ORF and a BLAST hit to consider it
    # to establish belongingness
    blast_params = transcript.json_conf["pick"]["chimera_split"]["blast_params"]
    minimal_overlap = blast_params["minimal_hsp_overlap"]

    # For each CDS run: {(target, target_length): [hsp, ...]}
    cds_hit_dict = SortedDict()
    for key in cds_boundaries.keys():
        cds_hit_dict[key] = collections.defaultdict(list)

    # BUG, this is a hacky fix
    if not hasattr(transcript, "blast_hits"):
        transcript.logger.warning(
            "BLAST hits store lost for %s! Creating a mock one to avoid a crash",
            transcript.id)
        transcript.blast_hits = []

    transcript.logger.debug("%s has %d possible hits", transcript.id, len(transcript.blast_hits))

    # Determine for each CDS which are the hits available
    min_eval = transcript.json_conf["pick"]['chimera_split']['blast_params']['hsp_evalue']
    for hit in transcript.blast_hits:
        valid_hsps = (_hsp for _hsp in hit["hsps"] if _hsp["hsp_evalue"] <= min_eval)
        for hsp in valid_hsps:
            hsp_span = (hsp['query_hsp_start'], hsp['query_hsp_end'])
            for cds_run in cds_boundaries:
                # If I have a valid hit b/w the CDS region and the hit,
                # add the name to the set
                overlap_threshold = minimal_overlap * (cds_run[1] + 1 - cds_run[0])
                overl = overlap(cds_run, hsp_span)

                # Log the computed overlap value ("overl"), not the
                # "overlap" function object.
                if overl >= overlap_threshold:
                    cds_hit_dict[cds_run][(hit["target"], hit["target_length"])].append(hsp)
                    transcript.logger.debug(
                        "Overlap %s passed for %s between %s CDS and %s HSP (threshold %s)",
                        overl,
                        transcript.id,
                        cds_run,
                        hsp_span,
                        overlap_threshold)
                else:
                    transcript.logger.debug(
                        "Overlap %s rejected for %s between %s CDS and %s HSP (threshold %s)",
                        overl,
                        transcript.id,
                        cds_run,
                        hsp_span,
                        overlap_threshold)

    transcript.logger.debug("Final cds_hit_dict for %s: %s", transcript.id, cds_hit_dict)

    final_boundaries = SortedDict()
    for boundary in __get_boundaries_from_blast(transcript, cds_boundaries, cds_hit_dict):
        if len(boundary) == 1:
            # A single CDS run: keep it unchanged
            assert len(boundary[0]) == 2
            boundary = boundary[0]
            final_boundaries[boundary] = cds_boundaries[boundary]
        else:
            # Several CDS runs to keep together: the new boundary goes from
            # the start of the first to the end of the last
            nboun = (boundary[0][0], boundary[-1][1])
            final_boundaries[nboun] = []
            for boun in boundary:
                final_boundaries[nboun].extend(cds_boundaries[boun])
    transcript.logger.debug("Final boundaries for %s: %s",
                            transcript.id, final_boundaries)

    return final_boundaries.copy()
def create_transcript(tid: str, parent: str, lines: List[GtfLine],
                      args: argparse.Namespace):
    """
    Generator yielding the Transcript object(s) built from the GTF lines of
    one transcript.

    Lines located on more than one chromosome are processed separately per
    chromosome, appending ".<chrom>" to both the transcript ID and the parent.
    Within a chromosome, exons are sorted and scanned in order:

    - exons overlapping the current one are merged into it; exons closer
      than ``args.min_intron`` are merged as well when ``args.split`` is False;
    - when the gap is within ``args.min_intron`` and ``args.split`` is True,
      or the gap exceeds ``args.max_intron``, the exons accumulated so far
      are emitted as a separate transcript with a letter suffix (".A", ".B", ...);
    - otherwise the exon is simply added to the transcript being built.

    The gap is evaluated as ``exon.start - current.end + 1``.

    :param tid: ID to assign to the transcript
    :param parent: parent (gene) ID to assign to the transcript
    :param lines: the GTF lines of the transcript; only those with a true
        ``is_exon`` attribute are used
    :param args: namespace providing ``min_intron``, ``max_intron`` and ``split``
    :return: yields finalized Transcript objects
    :raises IndexError: if ``lines`` contains no exon line
    """

    chroms = defaultdict(list)
    for line in lines:
        chroms[line.chrom].append(line)

    if len(chroms) > 1:
        # Recursively handle each chromosome on its own
        for chrom, chrom_lines in chroms.items():
            newtid = tid + "." + chrom
            newparent = parent + "." + chrom
            for transcript in create_transcript(newtid, newparent,
                                                chrom_lines, args):
                # NOTE(review): these checks look like they would fail if the
                # recursive call had to split the transcript further - confirm
                assert transcript.id == newtid, (newtid, transcript.id)
                assert transcript.parent[0] == newparent
                yield transcript
        return

    # Now we are sure that we only have one chromosome
    exons = sorted([line for line in lines if line.is_exon],
                   key=operator.attrgetter("chrom", "start", "end"))

    if len(exons) == 1:
        transcript = Transcript(exons[0])
        transcript.id = tid
        transcript.parent = parent
        transcript.finalize()
        yield transcript
        return

    new_exons = deque()
    first_identifier = ord("A") - 1
    identifier = first_identifier
    current = exons[0]

    for exon in exons[1:]:
        overlapping = overlap((exon.start, exon.end),
                              (current.start, current.end)) > 0
        gap = exon.start - current.end + 1
        if overlapping or (gap <= args.min_intron and args.split is False):
            # Merge the two exons. Exons are sorted by start, so the next
            # one may end *before* the current one (i.e. be contained in
            # it): never let the merged exon shrink.
            current.end = max(current.end, exon.end)
        elif ((gap <= args.min_intron and args.split is True)
              or gap > args.max_intron):
            # Emit what has been accumulated so far as a separate transcript
            new_exons.append(current)
            transcript = Transcript(new_exons.popleft())
            transcript.add_exons(new_exons)
            transcript.finalize()
            identifier += 1
            transcript.parent = parent + "." + chr(identifier)
            transcript.id = tid + "." + chr(identifier)
            yield transcript
            current = exon
            new_exons = deque()
        else:
            new_exons.append(current)
            current = exon

    # Emit the last (or only) transcript
    new_exons.append(current)
    transcript = Transcript(new_exons.popleft())
    transcript.add_exons(new_exons)

    if identifier == first_identifier:
        # No split happened: keep the original names
        transcript.id = tid
        transcript.parent = parent
    else:
        identifier += 1
        transcript.id = tid + "." + chr(identifier)
        transcript.parent = parent + "." + chr(identifier)

    transcript.finalize()
    yield transcript
Example #11
0
def main():
    """
    Command line entry point: extract the promoter regions of a list of genes.

    For every positive distance requested with ``--distances`` one FASTA file
    named ``<out>-<distance>bp.fasta`` (``.fasta.gz`` with ``--gzip``) is
    created. The output files are always closed before returning, also when
    the extraction fails.

    Exits with status 1 if no positive distance has been provided.
    """
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("-o", "--out", type=str, default="promoters")
    parser.add_argument("-l", "--log", default=None)
    parser.add_argument("-lv",
                        "--log-level",
                        default="WARN",
                        choices=["DEBUG", "INFO", "WARN", "ERROR", "CRITICAL"],
                        dest="log_level")
    parser.add_argument("-d",
                        "--distances",
                        nargs="+",
                        type=int,
                        default=[1000, 2000, 5000])
    parser.add_argument(
        "-nn",
        "--no-neighbours",
        dest="no_neighbours",
        action="store_true",
        default=False,
        help="Ignore the presence of neighbours when extracting genes.")
    parser.add_argument("-eu",
                        "--exclude-utr",
                        dest="exclude_utr",
                        default=False,
                        action="store_true")
    parser.add_argument("-z",
                        "--gzip",
                        default=False,
                        action="store_true",
                        help="Output will be compressed in GZip format.")
    parser.add_argument("genome")
    parser.add_argument("gff3")
    parser.add_argument("gene_list")
    args = parser.parse_args()

    logging.basicConfig(
        filename=args.log,
        format="{asctime} - {name} - {filename}:{lineno} - {levelname} - "
        "{funcName} - {processName} - {message}",
        style="{",
        level=args.log_level)
    logger = logging.getLogger('extract_promoter_regions')

    # The search window uses the maximum requested distance
    max_distance = max(args.distances)
    args.distances = sorted([_ for _ in args.distances if _ > 0])
    if not args.distances:
        # Not inside an exception handler: plain error, no traceback
        logger.error("I need at least one positive integer distance!")
        sys.exit(1)

    out_files = dict()
    try:
        prefix = os.path.splitext(args.out)[0]
        for distance in args.distances:
            if args.gzip is True:
                out_files[distance] = gzip.open(
                    "{}-{}bp.fasta.gz".format(prefix, distance), "wt")
            else:
                out_files[distance] = open(
                    "{}-{}bp.fasta".format(prefix, distance), "wt")

        _extract_promoters(args, out_files, max_distance, logger)
    finally:
        # Always flush and close the outputs (essential for gzip streams)
        for handle in out_files.values():
            handle.close()

    logger.info("Finished")
    return


def _extract_promoters(args, out_files, max_distance, logger):
    """
    Load the genome and the GFF3 index, then write the promoter sequences of
    the genes listed in ``args.gene_list`` to the handles in ``out_files``.

    :param args: the parsed command line arguments
    :param out_files: dictionary distance -> open text handle
    :param max_distance: distance used to look for neighbouring genes
    :param logger: logger used for reporting
    """

    logger.info("Starting to load the genome")
    genome = pyfaidx.Fasta(args.genome)
    logger.info("Loaded the genome")

    logger.info("Starting to load the GFF3 index")
    with open(args.gff3) as gff3:
        # Use an *instance*, so that the argparse.Namespace class itself
        # is not modified
        namespace = argparse.Namespace()
        namespace.reference = gff3
        namespace.exclude_utr = args.exclude_utr
        namespace.protein_coding = False
        # Use Mikado compare functions to load the index from the GFF3
        # "genes" is a dictionary of Gene objects, having as keys the gene names
        # "positions" is a dictionary of the form: [chrom][(start, end)] = [GID1, GID2, ...]
        genes, positions = load_index(namespace, logger)
        # Create a dictionary of interval trees, one per chromosome
        indexer = {chrom: IntervalTree.from_tuples(positions[chrom].keys())
                   for chrom in positions}
    logger.info("Loaded the index")

    def is_before(gid_coords, fkey, fstrand):
        """Select the neighbours which overlap the gene or are located on
        its promoter side."""
        if utilities.overlap(gid_coords, fkey) >= 0:
            return True
        if fstrand == "-":
            return gid_coords[1] < fkey[0]
        return gid_coords[0] > fkey[1]

    with open(args.gene_list) as gene_list:
        gids = [_.rstrip() for _ in gene_list]

    logger.info("Starting to extract sequences for {} genes".format(
        len(gids)))
    for gid in gids:
        if gid not in genes:
            logger.error("{} not found in the index!".format(gid))
            continue
        chrom, start, end, strand = (genes[gid].chrom, genes[gid].start,
                                     genes[gid].end, genes[gid].strand)
        if chrom not in genome:
            logger.error(
                "Chromosome {} not found in the genome!".format(chrom))
            continue

        # If the gene is on the minus strand, the promoter is further down
        if strand == "-":
            key = (start, min(end + max_distance, len(genome[chrom])))
        else:
            # otherwise it is on the 5' side
            key = (max(0, start - max_distance), end)

        # Find all genes which are near
        if args.no_neighbours is False:
            # This is a list of the form [((start, end), distance), ...]
            # where "(start, end)" is a key for the "positions" dictionary
            neighbours = Assigner.find_neighbours(
                indexer.get(chrom, IntervalTree()), key, distance=0)

            # Find all the genes which are in the neighbourhood,
            # remove the obvious case of the identity
            neighbours = [
                _[0] for _ in neighbours
                if is_before((start, end), _[0], strand)
                and gid not in positions[chrom][_[0]]
            ]
        else:
            neighbours = []

        if not neighbours:
            # No neighbours found, we can grab everything
            for distance in args.distances:
                try:
                    if strand == "-":
                        chunk = (max(0, end),
                                 min(end + distance, len(genome[chrom])))
                        seq = genome[chrom][
                            chunk[0]:chunk[1]].reverse.complement.seq
                    else:
                        chunk = (max(0, start - 1 - distance), start - 1)
                        seq = genome[chrom][chunk[0]:chunk[1]].seq

                    # NOTE(review): on the minus strand the coordinates are
                    # reported here as start-end, while the branch handling
                    # neighbours reports them as end-start - confirm which
                    # one is intended
                    seq = SeqRecord(Seq(seq),
                                    id="{}-prom-{}".format(gid, distance),
                                    description="{}{}:{}-{}".format(
                                        chrom, strand, chunk[0], chunk[1]))
                    print(seq.format("fasta"),
                          file=out_files[distance],
                          end='')
                except ValueError as err:
                    logger.error(
                        "Error extracting the promoter for %s, distance %d. Error:\n%s",
                        gid, distance, err)
                    continue
            continue

        # We have some neighbours, we have to select the maximum distance we can go to
        logger.warning("{} neighbours found for {}: {}".format(
            len(neighbours), gid, neighbours))
        if any(utilities.overlap((start, end), _) >= 0 for _ in neighbours):
            logger.warning(
                "Overlapping genes found for {}. Skipping".format(gid))
            continue

        for distance in args.distances:
            try:
                if strand == "-":
                    # Do not go beyond the start of the closest neighbour
                    max_point = min([_[0] for _ in neighbours])
                    if end + distance > max_point:
                        continue
                    chunk = (max(0, end),
                             min(max_point, end + distance,
                                 len(genome[chrom])))
                    seq = genome[chrom][
                        chunk[0]:chunk[1]].reverse.complement.seq
                    description = "{}{}:{}-{}".format(
                        chrom, strand, chunk[1], chunk[0])
                else:
                    # Do not go beyond the end of the closest neighbour
                    min_point = max([_[1] for _ in neighbours])
                    if start - distance < min_point:
                        continue
                    chunk = (max(0, start - 1 - distance), start - 1)
                    seq = genome[chrom][chunk[0]:chunk[1]].seq
                    description = "{}{}:{}-{}".format(
                        chrom, strand, chunk[0], chunk[1])
                seq = SeqRecord(Seq(seq),
                                id="{}-prom-{}".format(gid, distance),
                                description=description)
                print(seq.format("fasta"),
                      file=out_files[distance],
                      end='')

            except ValueError as err:
                logger.error(
                    "Error extracting the promoter for %s, distance %d. Error:\n%s",
                    gid, distance, err)
                continue
Example #12
0
def __recalculate_hit(hit, boundary, minimal_overlap):
    """Recalculate coverage/identity of a hit restricted to a boundary.

    Only the HSPs whose overlap with ``boundary`` (computed on the query
    coordinates) is at least ``minimal_overlap`` times the length of the
    boundary are retained. Aligned lengths, start/end coordinates, global
    identity/positives, bits and evalue are then recomputed from the
    retained HSPs only.

    :param hit: dictionary describing the hit. It must have an "hsps" key
        holding a list of HSP dictionaries; every other key is copied as-is
        into the result before the recalculated values are written.
    :param boundary: tuple(start, end) of the region to consider
    :param minimal_overlap: minimum overlap, as a fraction of the boundary length
    :return: a new hit dictionary, or None if no HSP passes the overlap check
    """

    # Characters of the HSP match line that denote an identical position
    valid_matches = set("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                        "abcdefghijklmnopqrstuvwxyz"
                        "|")

    hit_dict = {key: value for key, value in hit.items() if key != "hsps"}

    hsp_dict_list = []
    q_intervals = []
    t_intervals = []

    identical_positions, positives = set(), set()

    # The threshold does not depend on the HSP: compute it only once
    overlap_threshold = minimal_overlap * (boundary[1] + 1 - boundary[0])

    for hsp in hit["hsps"]:
        hsp_overlap = overlap((hsp["query_hsp_start"], hsp["query_hsp_end"]),
                              boundary)
        if hsp_overlap < overlap_threshold:
            continue

        hsp_dict_list.append(hsp)
        q_intervals.append((hsp["query_hsp_start"], hsp["query_hsp_end"]))
        t_intervals.append(
            (hsp["target_hsp_start"], hsp["target_hsp_end"]))

        # Walk along the match line, tracking the position on the query
        query_pos = hsp["query_hsp_start"] - 1
        for amino in hsp["match"]:
            if amino in valid_matches or amino == "+":
                query_pos += 1
                positives.add(query_pos)
                if amino != "+":
                    identical_positions.add(query_pos)
            elif amino == "_":  # Gap in the target sequence
                query_pos += 1

    if not hsp_dict_list:
        return None

    q_merged_intervals = sorted(merge(q_intervals),
                                key=operator.itemgetter(0, 1))
    q_aligned = sum(tup[1] - tup[0] + 1 for tup in q_merged_intervals)
    hit_dict["query_aligned_length"] = q_aligned
    hit_dict["query_start"] = q_merged_intervals[0][0]
    hit_dict["query_end"] = q_merged_intervals[-1][1]

    t_merged_intervals = sorted(merge(t_intervals),
                                key=operator.itemgetter(0, 1))
    t_aligned = sum(tup[1] - tup[0] + 1 for tup in t_merged_intervals)
    hit_dict["target_aligned_length"] = t_aligned
    hit_dict["target_start"] = t_merged_intervals[0][0]
    hit_dict["target_end"] = t_merged_intervals[-1][1]

    # Percentages are relative to the merged aligned length on the query
    hit_dict["global_identity"] = len(identical_positions) * 100 / q_aligned
    hit_dict["global_positives"] = len(positives) * 100 / q_aligned
    hit_dict["hsps"] = hsp_dict_list
    hit_dict["bits"] = max(x["hsp_bits"] for x in hsp_dict_list)
    hit_dict["evalue"] = min(x["hsp_evalue"] for x in hsp_dict_list)

    return hit_dict
Example #13
0
def check_split_by_blast(transcript, cds_boundaries):
    """
    This method verifies if a transcript with multiple ORFs has support by BLAST to
    NOT split it into its different components.

    The minimal overlap between ORF and HSP is defined inside the JSON at the key
        ["chimera_split"]["blast_params"]["minimal_hsp_overlap"]
    basically, we consider a HSP a hit only if the overlap is over a certain threshold
    and the HSP evalue under a certain threshold.

    The split by CDS can be executed in three different ways - PERMISSIVE, LENIENT, STRINGENT:

    - PERMISSIVE: split if two CDSs do not have hits in common,
    even when one or both do not have a hit at all.
    - STRINGENT: split only if two CDSs have hits and none
    of those is in common between them.
    - LENIENT: split if *both* lack hits, OR *both* have hits and none
    of those is in common.

    :param transcript: the transcript instance
    :type transcript: Mikado.loci_objects.transcript.Transcript
    :param cds_boundaries: mapping whose keys are the (start, end) tuples of
        the CDS runs and whose values are lists (they are concatenated when
        boundaries are merged)
    :return: cds_boundaries
    :rtype: dict
    """

    # Establish the minimum overlap between an ORF and a BLAST hit to consider it
    # to establish belongingness
    blast_params = transcript.json_conf["pick"]["chimera_split"][
        "blast_params"]
    minimal_overlap = blast_params["minimal_hsp_overlap"]

    # For each CDS run: {(target, target_length): [hsp, ...]}
    cds_hit_dict = SortedDict()
    for key in cds_boundaries.keys():
        cds_hit_dict[key] = collections.defaultdict(list)

    # BUG, this is a hacky fix
    if not hasattr(transcript, "blast_hits"):
        transcript.logger.warning(
            "BLAST hits store lost for %s! Creating a mock one to avoid a crash",
            transcript.id)
        transcript.blast_hits = []

    transcript.logger.debug("%s has %d possible hits", transcript.id,
                            len(transcript.blast_hits))

    # Determine for each CDS which are the hits available
    min_eval = transcript.json_conf["pick"]['chimera_split']['blast_params'][
        'hsp_evalue']
    for hit in transcript.blast_hits:
        valid_hsps = (_hsp for _hsp in hit["hsps"]
                      if _hsp["hsp_evalue"] <= min_eval)
        for hsp in valid_hsps:
            hsp_span = (hsp['query_hsp_start'], hsp['query_hsp_end'])
            for cds_run in cds_boundaries:
                # If I have a valid hit b/w the CDS region and the hit,
                # add the name to the set
                overlap_threshold = minimal_overlap * (cds_run[1] + 1 -
                                                       cds_run[0])
                overl = overlap(cds_run, hsp_span)

                # Log the computed overlap value ("overl"), not the
                # "overlap" function object.
                if overl >= overlap_threshold:
                    cds_hit_dict[cds_run][(hit["target"],
                                           hit["target_length"])].append(hsp)
                    transcript.logger.debug(
                        "Overlap %s passed for %s between %s CDS and %s HSP (threshold %s)",
                        overl, transcript.id, cds_run, hsp_span,
                        overlap_threshold)
                else:
                    transcript.logger.debug(
                        "Overlap %s rejected for %s between %s CDS and %s HSP (threshold %s)",
                        overl, transcript.id, cds_run, hsp_span,
                        overlap_threshold)

    transcript.logger.debug("Final cds_hit_dict for %s: %s", transcript.id,
                            cds_hit_dict)

    final_boundaries = SortedDict()
    for boundary in __get_boundaries_from_blast(transcript, cds_boundaries,
                                                cds_hit_dict):
        if len(boundary) == 1:
            # A single CDS run: keep it unchanged
            assert len(boundary[0]) == 2
            boundary = boundary[0]
            final_boundaries[boundary] = cds_boundaries[boundary]
        else:
            # Several CDS runs to keep together: the new boundary goes from
            # the start of the first to the end of the last
            nboun = (boundary[0][0], boundary[-1][1])
            final_boundaries[nboun] = []
            for boun in boundary:
                final_boundaries[nboun].extend(cds_boundaries[boun])
    transcript.logger.debug("Final boundaries for %s: %s", transcript.id,
                            final_boundaries)

    return final_boundaries.copy()