Beispiel #1
0
def __preprocess_sentence(sentence):
    """
    Tokenize a sentence on whitespace and probe it for comma-delimited lists.

    The sentence is split on runs of whitespace and the resulting tokens are
    handed to ``utils.comma_delimited_list_indicies``. When that helper
    returns a truthy value, ``utils.enumerate_sentence_with_list`` is invoked
    with the tokens and that value; its result is currently discarded.

    NOTE(review): this looks unfinished -- every path returns ``None`` --
    confirm what callers are meant to receive before relying on it.

    :param str sentence: The sentence to preprocess.

    :rtype: None
    """
    tokens = re.split(r"\s+", sentence)
    list_positions = utils.comma_delimited_list_indicies(tokens)

    # Nothing list-like was reported for this sentence: nothing more to do.
    if not list_positions:
        return None

    # The expansion helper is still called exactly as before, but its return
    # value is not used by this function.
    utils.enumerate_sentence_with_list(tokens, list_positions)
    return None
Beispiel #2
0
    def test_enumerate_sentence_with_list(self):
        """
        Check the first three sentences produced for the two-list fixture.

        ``utils.enumerate_sentence_with_list`` is called with
        ``self.sentence_with_two_lists`` and the indices reported for it by
        ``utils.comma_delimited_list_indicies``. The first three results must
        equal the token lists built below, compared in order with ``eq_``.
        """
        fixture = self.sentence_with_two_lists
        list_positions = utils.comma_delimited_list_indicies(fixture)
        sens = utils.enumerate_sentence_with_list(fixture, list_positions)

        # The expected sentences differ only in two token slots, so they are
        # generated from a single template instead of being spelled out
        # three times.
        expected_items = [
            ("vestiges", "hats"),
            ("aardvarks", "shoes"),
            ("igloos", "carribean"),
        ]

        for position, (first_item, second_item) in enumerate(expected_items):
            expected_tokens = [
                "This",
                "is",
                "a",
                "sentence",
                "listing",
                first_item,
                "and",
                "boolean,",
                "but",
                "it",
                "also",
                "lists",
                "the",
                second_item,
                "and",
                "gloves.",
            ]
            eq_(sens[position], expected_tokens)