Example #1
def traverse_span(span, entities_set):
    """
    Traverse a span of word tokens until we find a word which isn't part of any entity.
    :return: number of words in the entity and the entity found in the span
    """
    candidate = span[0]
    index = 1
    while index < len(span) and join_strings(candidate,
                                             span[index]) in entities_set:
        candidate = join_strings(candidate, span[index])
        index += 1
    return index, candidate
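None of these examples define join_strings itself; from the call sites in Examples #1, #2 and #4-#6 it behaves like a plain whitespace join over word tokens. A minimal sketch under that assumption (the real helper may differ):

def join_strings(*strings):
    # Hypothetical reconstruction: concatenate word tokens with single
    # spaces, matching calls like join_strings(candidate, span[index]) and
    # join_strings(*self._list_of_words) in these examples.
    return " ".join(strings)

With that stub, traverse_span(["Stephen", "Curry", "scored"], {"Stephen Curry"}) returns (2, 'Stephen Curry').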
Example #2
def traverse_span(span, entities_set):
    """
    Traverse a span of word tokens, concatenating them for as long as the
    concatenation is still part of entities_set.
    :return: number of words in the entity and the entity found in the span
    """
    candidate = span[0]
    index = 1
    while index < len(span) and join_strings(candidate,
                                             span[index]) in entities_set:
        candidate = join_strings(candidate, span[index])
        index += 1
    return index, candidate
Example #3
def look(self, player):
    # show name and description
    if self.name:
        player.tell('*** {name} ***'.format(name=self.name))
    player.tell(self.description or 'You see nothing here.')
    # show exits
    if self.exits:
        directions = self.exits.keys()
        player.tell('You can go {directions}.'.format(
            directions=join_strings(directions, 'or')))
    # show other players
    players = [p.name for p in self.players if p != player]
    if players:
        player.tell('{players} {are} here.'.format(
            players=join_strings(players, 'and'),
            are='are' if len(players) > 1 else 'is'))
    # show room contents
    things = [o.name for o in self.things]
    if things:
        player.tell('There is {names} here.'.format(
            names=join_strings(things, 'and')))
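Examples #3 and #7 call join_strings with a trailing conjunction instead, i.e. a human-readable list join ("a, b and c"). A hedged sketch of that variant (the exact comma rules are an assumption, not the project's verified code):

def join_strings(items, conjunction):
    # Hypothetical reconstruction: 'x', 'x and y', 'x, y and z'.
    items = list(items)
    if not items:
        return ''
    if len(items) == 1:
        return items[0]
    return '{} {} {}'.format(', '.join(items[:-1]), conjunction, items[-1])

The list() call also explains why directions can be passed as a dict_keys view: the helper normalizes the iterable before joining.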
Example #4
def get_entities_from_summary(self, entities_set):
    """
    Traverse the summary and try to extract all the named entities present in it.
    - problem: every prefix of a multi-word entity must be in entities_set, so
      if we search for "Luc Mbah a Moute" then {"Luc", "Luc Mbah", "Luc Mbah a",
      "Luc Mbah a Moute"} must be a subset of the entities set
    :return: list of all the extracted named entities
    """
    summary = join_strings(*self._list_of_words)
    extracted = []
    for s in nltk_tok.sent_tokenize(summary):
        extracted += self.extract_entities(s, entities_set)
    return extracted
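The "problem" noted in the docstring follows from the greedy one-word-at-a-time growth in traverse_span (Example #1), which presumably sits behind extract_entities: the candidate can only be extended while each intermediate prefix is itself in the set. A small demo with hypothetical data, reusing the sketches above:

entities = {"Luc", "Luc Mbah", "Luc Mbah a", "Luc Mbah a Moute"}
span = ["Luc", "Mbah", "a", "Moute", "scored"]
print(traverse_span(span, entities))   # -> (4, 'Luc Mbah a Moute')
# Drop "Luc Mbah a" from the set and the match stops early at (2, 'Luc Mbah').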
Example #5
def get_entities_from_summary(self, entities_set):
    """
    Traverse the summary and try to extract all the named entities present in it.
    - problem: every prefix of a multi-word entity must be in entities_set, so
      if we search for "Stephen Curry" both "Stephen" and "Stephen Curry" must be
      present in entities_set
    -----
    :return: list of all the extracted named entities
    """
    summary = join_strings(*self._list_of_words)
    extracted = []
    for s in nltk_tok.sent_tokenize(summary):
        extracted += self.extract_entities(s, entities_set)
    return extracted
Example #6
def _transform_words(list_of_words, words_limit=None):
    """ Traverse the summary and transform dataset faults

    E.g. we transform "Barea’s" to "Barea ’s", all the versions of the name
    "Luc Mbah A Moute" to "Moute", all the number words to numerals etc. """
    summary = join_strings(*list_of_words)
    sentences = [
        Summary.transform_numbers(s)
        for s in nltk_tok.sent_tokenize(summary)
    ]
    result = []
    for s in sentences:
        tokens = []
        # split possessives: "Barea’s" -> "Barea" + "’s"
        for token in s.strip().split():
            if token.endswith('’s'):
                tokens.append(token.replace('’s', ''))
                tokens.append("’s")
            else:
                tokens.append(token)
        ix = 0
        candidate_sentence = []
        # rewrite dataset faults via greedy longest match (up to 5 tokens)
        while ix < len(tokens):
            found = False
            for r in range(5, 0, -1):
                multi_tokens = " ".join(tokens[ix:ix + r])
                if multi_tokens in name_transformations:
                    candidate_sentence += name_transformations[
                        multi_tokens]
                    found = True
                    ix += r
                    break

            if not found:
                candidate_sentence.append(tokens[ix])
                ix += 1
        # stop once the sentence would push the output past the limit
        if (words_limit is not None) and (
                len(result) + len(candidate_sentence) > words_limit):
            break
        result += candidate_sentence

    return result
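name_transformations is not shown in these examples; from its use it maps a phrase of up to five tokens to a replacement token list. The core rewrite loop, isolated from the NLTK plumbing as a standalone sketch (the for-else replaces the found flag; the mapping entry is hypothetical):

name_transformations = {"Luc Mbah A Moute": ["Moute"]}  # hypothetical entry

def rewrite_tokens(tokens):
    # Greedy longest match: try 5-token windows first, then shorter ones,
    # falling back to emitting the single token unchanged.
    ix, out = 0, []
    while ix < len(tokens):
        for r in range(5, 0, -1):
            multi = " ".join(tokens[ix:ix + r])
            if multi in name_transformations:
                out += name_transformations[multi]
                ix += r
                break
        else:
            out.append(tokens[ix])
            ix += 1
    return out

print(rewrite_tokens(["Luc", "Mbah", "A", "Moute", "scored", "12"]))
# -> ['Moute', 'scored', '12']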
Example #7
def look(self, command):
    obj = command.direct_object
    # special case "look at object"
    if command.preposition == Preposition.AT and command.indirect_object:
        obj = command.indirect_object
    # check if the object was named but not found
    if not obj and command.direct_object_str:
        self.tell('There is no {name} here.'.format(
            name=command.direct_object_str))
        return
    # look at the object
    if not obj or obj == self.location:
        self.location.look(self)
    elif obj == self:
        self.tell(self.description or 'You see nothing special.')
        thing_names = [o.name for o in self.things]
        if thing_names:
            self.tell('You have {names}.'.format(
                names=join_strings(thing_names, 'and')))
    else:
        self.tell(obj.description or 'You see nothing special.')
Example #8
def ngram(n, string, minlen=3, maxlen=25):
    return tlz.pipe(string, ngram_tuples(n, minlen=minlen, maxlen=maxlen),
                    map_c(utils.join_strings("_")))
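Here tlz is conventionally toolz, map_c a curried map, and utils.join_strings("_") must therefore return a one-argument joiner, unlike the variants above. A self-contained sketch under those assumptions (ngram_tuples is a hypothetical stand-in; the real one may filter differently):

import toolz as tlz
from toolz.curried import map as map_c

def join_strings(sep):
    # Hypothetical curried variant: returns a function joining one tuple.
    return lambda parts: sep.join(parts)

def ngram_tuples(n, minlen=3, maxlen=25):
    # Hypothetical stand-in: slide an n-token window over the words of the
    # string, keeping words whose length falls within [minlen, maxlen].
    def inner(string):
        words = [w for w in string.split() if minlen <= len(w) <= maxlen]
        return zip(*(words[i:] for i in range(n)))
    return inner

def ngram(n, string, minlen=3, maxlen=25):
    return tlz.pipe(string,
                    ngram_tuples(n, minlen=minlen, maxlen=maxlen),
                    map_c(join_strings("_")))

print(list(ngram(2, "the quick brown fox")))
# -> ['the_quick', 'quick_brown', 'brown_fox']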