Example #1
    def parse(self, text, relationships=None, dependencies=None):
        """Tokenize and parse some text to create ``Sentence`` objects and
        extract dependencies, parse trees, etc.

        :param str text: The text to parse.
        :param relationships: Passed through to
            ``add_grammatical_relations()``.
        :param dependencies: Passed through to
            ``add_grammatical_relations()``.
        :return list: A list of ``Sentence`` objects.
        """

        start_time = datetime.now()
        parsed = self.parse_with_error_handling(text)
        end_time = datetime.now()

        # If the parse was unsuccessful, exit
        if parsed is None:
            return []

        # Accumulate the total parse time for the timing report
        parsetime = end_time - start_time
        self.parsetime += parsetime.total_seconds()
        
        sentences = []

        for parsed_sentence in parsed['sentences']:
            sentence = Sentence(text=parsed_sentence['text'],
                                project=self.project)
            sentence.save(False)

            self.add_words(sentence, parsed_sentence, text)
            self.add_grammatical_relations(sentence, parsed_sentence,
                                           relationships, dependencies)

            sentence.save(False)
            sentences.append(sentence)
            
        return sentences
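
As an aside, the timing bookkeeping above follows a simple accumulate-total-seconds pattern. Below is a minimal self-contained sketch of that pattern; the TimedParser class is an illustration, not part of the source:

from datetime import datetime

class TimedParser(object):
    """Illustrative stand-in (not from the source) for the timing
    pattern used in parse() above."""

    def __init__(self):
        self.parsetime = 0.0  # running total of parse time, in seconds

    def parse(self, text):
        start_time = datetime.now()
        result = text.split(".")  # stand-in for the real parser call
        end_time = datetime.now()
        self.parsetime += (end_time - start_time).total_seconds()
        return result
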
Example #2
    def test_process(self):
        """Test process()."""
        document = Document()
        sentence = Sentence(text="The quick brown fox jumped over the lazy dog",
                            document=document, project=self.project)
        words = [
            Word(lemma="the", surface="the"),
            Word(lemma="fox", surface="fox"),
            Word(lemma="jump", surface="jumped"),
            Word(lemma="over", surface="over"),
            Word(lemma="the", surface="the"),
            Word(lemma="dog", surface="dog")]
        for index, word in enumerate(words):
            word.save()
            sentence.add_word(word, index + 1, " ", word.surface, self.project)
        sentence.save()

        result = self.seq_proc.process(sentence)
        sequences = split_sequences(result)
        sequence_sequences = get_sequence_text(sequences)

        # Create four lists of sequences based on the categories and then
        # check the output
        key = {
            "words": {
                "stops": [
                    "the",
                    "the fox",
                    "the fox jumped",
                    "the fox jumped over",
                    "fox jumped over",
                    "fox jumped over the",
                    "jumped over",
                    "jumped over the",
                    "jumped over the dog",
                    "over",
                    "over the",
                    "over the dog",
                    "the",
                    "the dog"],
                "nostops": [
                    "fox",
                    "fox jumped",
                    "jumped",
                    "jumped dog",
                    "dog"]
            },
            "lemmas": {
                "stops": [
                    "the",
                    "the fox",
                    "the fox jump",
                    "the fox jump over",
                    "fox jump over",
                    "fox jump over the",
                    "jump over",
                    "jump over the",
                    "jump over the dog",
                    "over",
                    "over the",
                    "over the dog",
                    "the",
                    "the dog"],
                "nostops": [
                    "fox",
                    "fox jump",
                    "jump",
                    "jump dog",
                    "dog"]
            }
        }

        print(sequence_sequences)
        # TODO: the seqproc isn't making phrases of words separated by a
        # stopword, but this code expects it to.
        self.assertEqual(sequence_sequences, key)
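
For reference, the expected key above can be reproduced by enumerating every contiguous run of up to four words: runs containing a stop word go into "stops" verbatim, and "nostops" collects the stop-word-free versions, deduplicated. The sketch below is reconstructed from the expected output, not from the real sequence processor; the STOPS set and expand() helper are assumptions:

STOPS = {"the", "over"}  # assumed stop list; only these two occur in the test

def expand(words, max_len=4):
    # Toy reconstruction (not the real seq_proc) of the expansion that
    # the expected ``key`` above encodes.
    stops, nostops, seen = [], [], set()
    for i in range(len(words)):
        for j in range(i + 1, min(i + max_len, len(words)) + 1):
            seq = words[i:j]
            if any(w in STOPS for w in seq):
                stops.append(" ".join(seq))
            filtered = " ".join(w for w in seq if w not in STOPS)
            if filtered and filtered not in seen:
                seen.add(filtered)
                nostops.append(filtered)
    return stops, nostops

stops, nostops = expand("the fox jumped over the dog".split())
# stops and nostops now equal key["words"]["stops"] and key["words"]["nostops"]
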
Example #3
def tokenize_from_raw(parsed_text, txt, project):
    """Given the output of a call to raw_parse, produce a list of Sentences
    and find the part of speech, lemma, and space_before of each word in
    each sentence.

    This function does the same thing as tokenize(), but it accepts already
    parsed data.

    :param dict parsed_text: The return value of a call to raw_parse.
    :param str txt: The original text.
    :param project: The project the new sentences and words belong to.
    :return list: A list of document.Sentence objects.
    """

    # If parsed_text is the result of a failed parse, return an empty list
    if not parsed_text:
        return []

    paragraph = []  # a list of Sentences
    words = dict()  # cache of Word objects, keyed by (word, part_of_speech, lemma)

    count = 0
    sentence_count = len(parsed_text["sentences"])

    for sentence_data in parsed_text["sentences"]:
        sentence = Sentence(text=sentence_data["text"])
        position = 0

        for word_data in sentence_data["words"]:
            word = word_data[0]
            part_of_speech = word_data[1]["PartOfSpeech"]
            lemma = word_data[1]["Lemma"]

            key = (word, part_of_speech, lemma)

            space_before = " "

            try:
                if txt[int(word_data[1]["CharacterOffsetBegin"]) - 1] != " ":
                    space_before = ""
            except IndexError:
                pass

            if key in words:
                word = words[key]

            else:
                try:
                    word = Word.query.filter_by(
                        word=word,
                        lemma=lemma,
                        part_of_speech=part_of_speech
                    ).one()
                except MultipleResultsFound:
                    project_logger.warning("Duplicate records found for: %s",
                        str(key))
                    # Fall back to the first matching record so that ``word``
                    # is a Word object rather than the raw string.
                    word = Word.query.filter_by(
                        word=word,
                        lemma=lemma,
                        part_of_speech=part_of_speech
                    ).first()
                except NoResultFound:
                    word = Word(
                        word=word,
                        lemma=lemma,
                        part_of_speech=part_of_speech
                    )

                words[key] = word

            sentence.add_word(
                word=word,
                position=position,
                space_before=space_before,
                part_of_speech=word.part_of_speech,
                project=project,
                force=False
            )

            position += 1

        paragraph.append(sentence)

        count += 1

        # NOTE: the word cache can sometimes exhaust memory, so commit and
        # clear it periodically to prevent that.
        # TODO: make the 50 here and in documentparser a config
        if count % 50 == 0 or count == sentence_count:
            db.session.commit()
            words = dict()

    db.session.commit()
    return paragraph
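
For orientation, tokenize_from_raw() expects parsed_text shaped like the sketch below, reconstructed from the fields the function reads; the tokens, tags, and offsets are illustrative only:

txt = "Dogs bark."
parsed_text = {
    "sentences": [
        {
            "text": "Dogs bark.",
            # Each entry in "words" is a [surface, attributes] pair,
            # matching the word_data[0] / word_data[1] accesses above.
            "words": [
                ["Dogs", {"PartOfSpeech": "NNS", "Lemma": "dog",
                          "CharacterOffsetBegin": "0"}],
                ["bark", {"PartOfSpeech": "VBP", "Lemma": "bark",
                          "CharacterOffsetBegin": "5"}],
                [".", {"PartOfSpeech": ".", "Lemma": ".",
                       "CharacterOffsetBegin": "9"}],
            ],
        }
    ]
}

# With a live database session and a project record:
# sentences = tokenize_from_raw(parsed_text, txt, project)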