Esempio n. 1
0
 def split_match(fragments, start=0, end=-1):
     """Match fragments against the search target, most promising first.

     Candidates are tried in priority order. The first one whose score from
     ``search.find_best`` exceeds ``(n - 1) / (2 * n)`` (0 for a single
     fragment, approaching 0.5 as the list grows) is accepted: its match data
     is written onto the fragment and the fragments on either side of it are
     processed recursively inside the narrowed search window.

     :param fragments: Sliceable sequence of mappings that each provide a
         "transcript" entry; an accepted fragment is mutated in place
     :param start: Lower search bound, forwarded to ``search.find_best``
     :param end: Upper search bound, forwarded to ``search.find_best``
     :return: Generator producing exactly one item per input fragment, in the
         original order: the fragment itself when it was matched, ``None``
         for each fragment of a (sub-)list in which no candidate qualified
     """
     count = len(fragments)
     if count < 1:
         return
     if count == 1:
         candidates = [(0, fragments[0])]
     else:
         # Enumerate first so every fragment keeps its original index, then
         # score each one: long transcripts with a small positional weight win
         # (presumably enweight puts the smallest weights near the center of
         # the list -- confirm against its definition).
         scored = [
             (pair[0], (1 - pair[1]) * len(pair[0][1]["transcript"]))
             for pair in enweight(enumerate(fragments))
         ]
         # Best score first; the scores are dropped once the order is fixed.
         scored.sort(key=lambda pair: pair[1], reverse=True)
         candidates = [pair[0] for pair in scored]
     # The acceptance threshold only depends on the size of this (sub-)list.
     threshold = (count - 1) / (2 * count)
     for index, fragment in candidates:
         match_start, match_end, sws_score, match_substitutions = search.find_best(
             fragment["transcript"], start=start, end=end
         )
         if sws_score > threshold:
             fragment["match-start"] = match_start
             fragment["match-end"] = match_end
             fragment["sws"] = sws_score
             fragment["substitutions"] = match_substitutions
             # Everything before the anchor has to fit in front of the match,
             # everything after it behind the match.
             yield from split_match(fragments[0:index],
                                    start=start,
                                    end=match_start)
             yield fragment
             yield from split_match(fragments[index + 1:],
                                    start=match_end,
                                    end=end)
             return
     # No candidate qualified: emit one placeholder per fragment so the output
     # stays aligned with the input.
     for _ in candidates:
         yield None
Esempio n. 2
0
def weighted_ngrams(s, size, direction=0):
    """
    Decomposes a string into its N-grams and attaches a positional weight to each of them.
    Every appearance of every N-gram is listed in left-to-right order; the weights progress quadratically.
    :param s: String to split into N-grams
    :param size: Size of each N-gram
    :param direction: Controls how the positional weights are laid out:
        direction < 0: first N-gram gets weight 1.0, last one gets 0.0
        direction > 0: first N-gram gets weight 0.0, last one gets 1.0
        direction == 0: first and last N-gram get weight 1.0, center N-gram(s) get a weight near or equal 0
    :return: Produces (string, float) tuples pairing each N-gram with its positional weight value
    """
    grams = ngrams(s, size)
    return enweight(grams, direction=direction)
Esempio n. 3
0
 def split_match(fragments, start=0, end=-1):
     """Recursively anchor fragments inside the search window [start, end].

     The fragment with the highest priority whose ``search.find_best`` score
     beats ``(n - 1) / (2 * n)`` becomes the anchor. Its match data is stored
     on the fragment itself, the fragments left of it are matched in front of
     the anchor's match and the fragments right of it behind the match.

     :param fragments: Sliceable sequence of mappings with a "transcript"
         entry; anchors are mutated in place
     :param start: Lower bound handed to ``search.find_best``
     :param end: Upper bound handed to ``search.find_best``
     :return: Generator that produces one item per input fragment in input
         order -- the fragment when it was matched, otherwise ``None``
     """
     total = len(fragments)
     if total < 1:
         return

     def priority(weighted):
         # weighted is ((original index, fragment), positional weight); a long
         # transcript combined with a small positional weight ranks highest.
         return (1 - weighted[1]) * len(weighted[0][1]["transcript"])

     if total == 1:
         ordered = [(0, fragments[0])]
     else:
         # enumerate() preserves each fragment's original index through the
         # reordering; only the (index, fragment) pairs are kept afterwards.
         ranked = sorted(enweight(enumerate(fragments)),
                         key=priority,
                         reverse=True)
         ordered = [weighted[0] for weighted in ranked]

     min_score = (total - 1) / (2 * total)
     for position, candidate in ordered:
         found = search.find_best(candidate["transcript"],
                                  start=start,
                                  end=end)
         match_start, match_end, sws_score, match_substitutions = found
         if not (sws_score > min_score):
             # Not convincing enough; try the next best candidate.
             continue
         candidate["match-start"] = match_start
         candidate["match-end"] = match_end
         candidate["sws"] = sws_score
         candidate["substitutions"] = match_substitutions
         # Each recursive call re-ranks and re-searches only its own slice.
         yield from split_match(fragments[0:position],
                                start=start,
                                end=match_start)
         yield candidate
         yield from split_match(fragments[position + 1:],
                                start=match_end,
                                end=end)
         return

     # Nothing could be anchored: report every fragment as unmatched.
     for _ in ordered:
         yield None
Esempio n. 4
0
 def split_match(fragments, start=0, end=-1):
     """Recursively match fragments inside the search window [start, end].

     Fragments are tried in priority order. The first one whose score from
     ``search.find_best`` exceeds ``(n - 1) / (2 * n)`` is accepted: its match
     data is stored on the fragment, then the fragments before it are matched
     in front of that match and the fragments after it behind the match.

     :param fragments: Sliceable sequence of mappings with a 'transcript'
         entry; an accepted fragment is mutated in place
     :param start: Lower search bound, forwarded to ``search.find_best``
     :param end: Upper search bound, forwarded to ``search.find_best``
     :return: Generator producing one item per input fragment, in input
         order: the fragment when it was matched, ``None`` otherwise
     """
     n = len(fragments)
     if n < 1:
         # End the generator with a plain return: since PEP 479 a
         # StopIteration raised inside a generator body is converted into
         # RuntimeError instead of silently finishing the iteration.
         return
     elif n == 1:
         weighted_fragments = [(0, fragments[0])]
     else:
         # so we later know the original index of each fragment
         weighted_fragments = enumerate(fragments)
         # assigns high values to long statements near the center of the list
         weighted_fragments = enweight(weighted_fragments)
         weighted_fragments = map(
             lambda fw: (fw[0], (1 - fw[1]) * len(fw[0][1]['transcript'])),
             weighted_fragments)
         # fragments with highest weights first
         weighted_fragments = sorted(weighted_fragments,
                                     key=lambda fw: fw[1],
                                     reverse=True)
         # strip weights
         weighted_fragments = [fw[0] for fw in weighted_fragments]
     for index, fragment in weighted_fragments:
         match = search.find_best(fragment['transcript'],
                                  start=start,
                                  end=end)
         match_start, match_end, sws_score, match_substitutions = match
         if sws_score > (n - 1) / (2 * n):
             fragment['match-start'] = match_start
             fragment['match-end'] = match_end
             fragment['sws'] = sws_score
             fragment['substitutions'] = match_substitutions
             # Fragments left of the accepted one must end before its match,
             # fragments right of it must begin after its match.
             yield from split_match(fragments[0:index],
                                    start=start,
                                    end=match_start)
             yield fragment
             yield from split_match(fragments[index + 1:],
                                    start=match_end,
                                    end=end)
             # Done with this (sub-)list; again a return, not StopIteration.
             return
     # No fragment qualified: emit one None per fragment so the output stays
     # aligned with the input.
     for _ in weighted_fragments:
         yield None