def __preprocess_sentence(sentence):
    """
    Prepare a sentence for the grammar parser.

    The sentence is tokenised on runs of whitespace, so every word keeps
    whatever punctuation is attached to it. When comma-delimited lists are
    found among the tokens, the sentence is expanded through
    ``utils.enumerate_sentence_with_list``.

    NOTE(review): the expanded sentences are computed but never used, and
    every path returns ``None``. Presumably this is unfinished; confirm the
    intended return value against the callers.

    :param str sentence: The sentence to preprocess.
    :return: Always ``None`` at present.
    """
    tokens = re.split(r"\s+", sentence)
    list_positions = utils.comma_delimited_list_indicies(tokens)

    # Nothing to expand when no comma-delimited list was detected.
    if not list_positions:
        return None

    # The expansion result is currently discarded (see NOTE above).
    expanded = utils.enumerate_sentence_with_list(tokens, list_positions)
    return None
def test_enumerate_sentence_with_list(self):
    """
    Check the first three sentences produced by
    ``utils.enumerate_sentence_with_list`` for the two-list fixture.

    Each expected sentence differs only in the word taken from the first
    list (token 5) and the word taken from the second list (token 13).
    """
    positions = utils.comma_delimited_list_indicies(self.sentence_with_two_lists)
    sens = utils.enumerate_sentence_with_list(self.sentence_with_two_lists, positions)

    # Tokens shared by every expected sentence.
    opening = ["This", "is", "a", "sentence", "listing"]
    bridge = ["and", "boolean,", "but", "it", "also", "lists", "the"]
    closing = ["and", "gloves."]

    # (word from the first list, word from the second list) per sentence.
    substitutions = [
        ("vestiges", "hats"),
        ("aardvarks", "shoes"),
        ("igloos", "carribean"),
    ]

    for position, (first_word, second_word) in enumerate(substitutions):
        expected = opening + [first_word] + bridge + [second_word] + closing
        eq_(sens[position], expected)
def test_comma_delimited_list_indicies(self):
    """
    Check the token-index sets that ``utils.comma_delimited_list_indicies``
    reports for the single-list and two-list fixtures.
    """
    single = utils.comma_delimited_list_indicies(self.sentence_with_list)
    double = utils.comma_delimited_list_indicies(self.sentence_with_two_lists)

    # (result, position within the result, expected set of token indices)
    checks = (
        (single, 0, {5, 6, 7}),
        (double, 0, {5, 6, 7}),
        (double, 1, {15, 16, 17}),
    )
    for found, position, expected in checks:
        eq_(found[position], expected)