def parse(self, text, relationships=None, dependencies=None):
    """Tokenize and parse some text to create ``Sentence`` objects and
    extract dependencies, parse trees, etc.

    :param str text: The text to parse.
    :return list: A list of ``Sentence`` objects.
    """
    start_time = datetime.now()
    parsed = self.parse_with_error_handling(text)
    end_time = datetime.now()

    # If the parse was unsuccessful, exit
    if parsed is None:
        return []

    # Timing report: accumulate the total time spent parsing
    parsetime = end_time - start_time
    self.parsetime += parsetime.total_seconds()

    sentences = []

    for parsed_sentence in parsed['sentences']:
        sentence = Sentence(text=parsed_sentence['text'],
            project=self.project)
        sentence.save(False)

        self.add_words(sentence, parsed_sentence, text)
        self.add_grammatical_relations(sentence, parsed_sentence,
            relationships, dependencies)

        sentence.save(False)
        sentences.append(sentence)

    return sentences
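# A minimal usage sketch for parse() above, assuming a StringProcessor-like
# owner class and an existing project (both names are assumptions for
# illustration, not confirmed API):
def _example_parse_usage(project):
    processor = StringProcessor(project)  # hypothetical owning class
    sentences = processor.parse("The quick brown fox jumped over the lazy dog.")
    for sentence in sentences:
        print(sentence.text)  # each Sentence was saved with its parse data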
def test_process(self):
    """Test process()."""
    document = Document()
    sentence = Sentence(
        text="The quick brown fox jumped over the lazy dog",
        document=document,
        project=self.project)
    words = [
        Word(lemma="the", surface="the"),
        Word(lemma="fox", surface="fox"),
        Word(lemma="jump", surface="jumped"),
        Word(lemma="over", surface="over"),
        Word(lemma="the", surface="the"),
        Word(lemma="dog", surface="dog")]

    for index, word in enumerate(words):
        word.save()
        sentence.add_word(word, index + 1, " ", word.surface, self.project)

    sentence.save()

    result = self.seq_proc.process(sentence)
    sequences = split_sequences(result)
    sequence_sequences = get_sequence_text(sequences)

    # Create four lists of sequences based on the categories and then
    # check the output
    key = {
        "words": {
            "stops": [
                "the",
                "the fox",
                "the fox jumped",
                "the fox jumped over",
                "fox jumped over",
                "fox jumped over the",
                "jumped over",
                "jumped over the",
                "jumped over the dog",
                "over",
                "over the",
                "over the dog",
                "the",
                "the dog"],
            "nostops": [
                "fox",
                "fox jumped",
                "jumped",
                "jumped dog",
                "dog"]
        },
        "lemmas": {
            "stops": [
                "the",
                "the fox",
                "the fox jump",
                "the fox jump over",
                "fox jump over",
                "fox jump over the",
                "jump over",
                "jump over the",
                "jump over the dog",
                "over",
                "over the",
                "over the dog",
                "the",
                "the dog"],
            "nostops": [
                "fox",
                "fox jump",
                "jump",
                "jump dog",
                "dog"]
        }
    }

    # TODO: the seqproc isn't making phrases of words separated by a
    # stopword, but this code expects it to.
    self.assertEqual(sequence_sequences, key)
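# A minimal sketch of one of the helpers used above, inferred from the shape
# of ``key``: split_sequences is assumed to group the processor's output into
# a {"words"|"lemmas": {"stops"|"nostops": [Sequence, ...]}} dict, and
# get_sequence_text to map each Sequence to its text. The ``sequence``
# attribute is an assumption for illustration, not a confirmed model field,
# and the sketch is renamed with an underscore so it cannot shadow the real
# helper.
def _get_sequence_text_sketch(sequences):
    return dict(
        (category, dict(
            (subcategory, [seq.sequence for seq in seqs])
            for subcategory, seqs in subdict.items()))
        for category, subdict in sequences.items())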
def tokenize_from_raw(parsed_text, txt, project):
    """Given the output of a call to raw_parse, produce a list of Sentences
    and find the part of speech, lemma, and space_before of each word in
    each sentence.

    This method does the same thing as tokenize(), but it accepts already
    parsed data.

    :param dict parsed_text: The return value of a call to raw_parse.
    :param str txt: The original text.
    :return list: A list of document.Sentence objects.
    """
    # If parsed_text is the result of a failed parse, return an empty list
    if not parsed_text:
        return []

    paragraph = []  # a list of Sentences
    words = dict()  # cache of Word records keyed by (surface, pos, lemma)
    count = 0
    sentence_count = len(parsed_text["sentences"])

    for sentence_data in parsed_text["sentences"]:
        sentence = Sentence(text=sentence_data["text"])
        position = 0

        for word_data in sentence_data["words"]:
            surface = word_data[0]
            part_of_speech = word_data[1]["PartOfSpeech"]
            lemma = word_data[1]["Lemma"]
            key = (surface, part_of_speech, lemma)

            # A word has a space before it unless the character preceding
            # its offset in the original text is not a space.
            space_before = " "
            try:
                if txt[int(word_data[1]["CharacterOffsetBegin"]) - 1] != " ":
                    space_before = ""
            except IndexError:
                pass

            if key in words:
                word = words[key]
            else:
                try:
                    word = Word.query.filter_by(
                        word=surface,
                        lemma=lemma,
                        part_of_speech=part_of_speech).one()
                except MultipleResultsFound:
                    project_logger.warning("Duplicate records found for: %s",
                        str(key))
                    # Fall back to the first matching record so that word is
                    # a Word object rather than the raw surface string.
                    word = Word.query.filter_by(
                        word=surface,
                        lemma=lemma,
                        part_of_speech=part_of_speech).first()
                except NoResultFound:
                    word = Word(
                        word=surface,
                        lemma=lemma,
                        part_of_speech=part_of_speech)
                words[key] = word

            sentence.add_word(
                word=word,
                position=position,
                space_before=space_before,
                part_of_speech=word.part_of_speech,
                project=project,
                force=False)

            position += 1

        paragraph.append(sentence)
        count += 1

        # NOTE: the word cache can exhaust memory on large texts, so flush
        # it periodically.
        # TODO: make the 50 here and in documentparser a config
        if count % 50 == 0 or count == sentence_count:
            db.session.commit()
            words = dict()

    db.session.commit()

    return paragraph
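# A minimal usage sketch for tokenize_from_raw above. The dict literal shows
# the raw_parse output shape the function consumes; the values are
# illustrative, not real parser output, and ``project`` is supplied by the
# caller.
def _example_tokenize_from_raw(project):
    parsed = {
        "sentences": [{
            "text": "The fox jumped.",
            "words": [
                ("The", {"PartOfSpeech": "DT", "Lemma": "the",
                    "CharacterOffsetBegin": "0"}),
                ("fox", {"PartOfSpeech": "NN", "Lemma": "fox",
                    "CharacterOffsetBegin": "4"}),
                ("jumped", {"PartOfSpeech": "VBD", "Lemma": "jump",
                    "CharacterOffsetBegin": "8"}),
                (".", {"PartOfSpeech": ".", "Lemma": ".",
                    "CharacterOffsetBegin": "14"})]
        }]
    }
    return tokenize_from_raw(parsed, "The fox jumped.", project)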