def parse(self, sentence, relationships=None, dependencies=None,
        max_length=30):
    """Parse a ``Sentence`` and extract dependencies, parse trees, etc.

    Note that for max_length, a "word" is defined as something with a space
    on at least one side. This is not the typical definition of "word".
    This is done so that length can be checked before resources are
    committed to processing a very long sentence.

    :param Sentence sentence: The ``Sentence`` object.
    :param dict relationships: Optional cache mapping relationship name ->
        ``GrammaticalRelationship``; enables duplicate handling.
    :param dict dependencies: Optional cache mapping
        (relation name, governor id, dependent id) -> ``Dependency``.
    :param int max_length: The most amount of words to process.
    :return: The ``Sentence``, or ``None`` if parsing failed.
    """
    parsed = self.parse_with_error_handling(sentence.text)

    # If the parse was unsuccessful, exit
    if parsed is None:
        return

    parsed_sentence = parsed["sentences"][0]

    if len(parsed["sentences"]) > 1:
        project_logger.warning("More than one sentence passed in to"
            " StringProcessor.parse().")
        # NOTE(review): only the second sentence's text is appended;
        # sentences beyond index 1 are silently dropped — confirm intent.
        parsed_sentence["text"] += parsed["sentences"][1]["text"]

    for dependency in parsed_sentence["dependencies"]:
        # We don't want to make a dependency involving ROOT
        if int(dependency[2]) > 0 and int(dependency[4]) > 0:
            governor = dependency[1]
            dependent = dependency[3]
            # CoreNLP indices are 1-based; word list is 0-based.
            governor_index = int(dependency[2]) - 1
            dependent_index = int(dependency[4]) - 1

            governor_pos = parsed_sentence["words"][governor_index][1]\
                ["PartOfSpeech"]
            governor_lemma = parsed_sentence["words"][governor_index][1]\
                ["Lemma"]
            dependent_pos = parsed_sentence["words"][dependent_index][1]\
                ["PartOfSpeech"]
            dependent_lemma = parsed_sentence["words"][dependent_index][1]\
                ["Lemma"]
            grammatical_relationship = dependency[0]

            # If dictionaries are present, run with duplication handling
            if relationships is not None and dependencies is not None:
                key = grammatical_relationship

                if key in relationships:
                    relationship = relationships[key]
                else:
                    try:
                        relationship = GrammaticalRelationship.query.\
                            filter_by(name=grammatical_relationship).\
                            one()
                    except MultipleResultsFound:
                        # Duplicates shouldn't exist; log and fall back to
                        # the first match so we can keep processing.
                        project_logger.error("duplicate records found "
                            "for: %s", str(key))
                        relationship = GrammaticalRelationship.query.\
                            filter_by(name=grammatical_relationship).\
                            first()
                    except NoResultFound:
                        relationship = GrammaticalRelationship(
                            name=grammatical_relationship)
                    relationships[key] = relationship

                # Read the data for the governor, and find the
                # corresponding word
                governor = Word.query.filter_by(
                    word=governor,
                    lemma=governor_lemma,
                    part_of_speech=governor_pos
                ).first()

                # Same as above for the dependent in the relationship
                dependent = Word.query.filter_by(
                    word=dependent,
                    lemma=dependent_lemma,
                    part_of_speech=dependent_pos
                ).first()

                # .first() returns None when the word is missing; that
                # indicates the preprocessor was not run on this text.
                if governor is None or dependent is None:
                    project_logger.error("Governor or dependent not "
                        "found; giving up on parse. This likely indicates"
                        " an error in the preprocessing; rerunning the "
                        "preprocessor is recommended.")
                    project_logger.info(sentence)
                    return sentence

                key = (relationship.name, governor.id, dependent.id)

                if key in dependencies:
                    dependency = dependencies[key]
                else:
                    try:
                        dependency = Dependency.query.filter_by(
                            grammatical_relationship=relationship,
                            governor=governor,
                            dependent=dependent
                        ).one()
                    except MultipleResultsFound:
                        # Log and fall back to the first match (the
                        # original called a nonexistent self.logg_error).
                        project_logger.error(
                            "duplicate records found for: %s", str(key))
                        dependency = Dependency.query.filter_by(
                            grammatical_relationship=relationship,
                            governor=governor,
                            dependent=dependent
                        ).first()
                    except NoResultFound:
                        dependency = Dependency(
                            grammatical_relationship=relationship,
                            governor=governor,
                            dependent=dependent
                        )
                    dependencies[key] = dependency

                # Add the dependency to the sentence
                sentence.add_dependency(
                    dependency=dependency,
                    governor_index=governor_index,
                    dependent_index=dependent_index,
                    project=self.project,
                    force=False
                )

                dependency.save(False)
            else:
                # TODO: fill
                pass

    return sentence
def add_grammatical_relations(self, sentence, parsed_sentence,
        relationships, dependencies):
    """Create ``Dependency`` records for each non-ROOT dependency in a
    parsed sentence and attach them to ``sentence``.

    NOTE(review): this method appears to be defined twice in this file;
    the later definition shadows this one — consider removing one copy.

    :param Sentence sentence: The sentence to attach dependencies to.
    :param dict parsed_sentence: One CoreNLP-style sentence dict with
        "dependencies" and "words" entries.
    :param dict relationships: Cache mapping relationship name ->
        ``GrammaticalRelationship``; may be ``None`` to skip processing.
    :param dict dependencies: Cache mapping
        (relation name, governor id, dependent id) -> ``Dependency``.
    """
    for dependency in parsed_sentence["dependencies"]:
        # We don't want to make a dependency involving ROOT
        if int(dependency[2]) > 0 and int(dependency[4]) > 0:
            governor = dependency[1]
            dependent = dependency[3]
            # CoreNLP indices are 1-based; word list is 0-based.
            governor_index = int(dependency[2]) - 1
            dependent_index = int(dependency[4]) - 1

            governor_pos = parsed_sentence["words"][governor_index][1]\
                ["PartOfSpeech"]
            try:
                governor_lemma = parsed_sentence["words"][governor_index][1]\
                    ["Lemma"].lower()
            except AttributeError:
                # this word wasn't recognized as a word by the parser,
                # it's probably a weird character or something; substitute
                # one "*" per character of the original token.
                governor_lemma = "*" * (
                    int(parsed_sentence["words"][governor_index][1]
                        ["CharacterOffsetEnd"]) -
                    int(parsed_sentence["words"][governor_index][1]
                        ["CharacterOffsetBegin"]))
                governor = governor_lemma

            dependent_pos = parsed_sentence["words"][dependent_index][1]\
                ["PartOfSpeech"]
            try:
                dependent_lemma = parsed_sentence["words"][dependent_index][1]\
                    ["Lemma"].lower()
            except AttributeError:
                # this word wasn't recognized as a word by the parser,
                # it's probably a weird character or something
                dependent_lemma = "*" * (
                    int(parsed_sentence["words"][dependent_index][1]
                        ["CharacterOffsetEnd"]) -
                    int(parsed_sentence["words"][dependent_index][1]
                        ["CharacterOffsetBegin"]))
                dependent = dependent_lemma

            grammatical_relationship = dependency[0]

            # If dictionaries are present, run with duplication handling
            if relationships is not None and dependencies is not None:
                key = grammatical_relationship

                if key in relationships:
                    relationship = relationships[key]
                else:
                    try:
                        relationship = GrammaticalRelationship.query.\
                            filter_by(name=grammatical_relationship,
                                project=self.project).one()
                    except MultipleResultsFound:
                        # Duplicates shouldn't exist; log and fall back to
                        # the first match so we can keep processing.
                        project_logger.error("duplicate records found "
                            "for: %s", str(key))
                        relationship = GrammaticalRelationship.query.\
                            filter_by(name=grammatical_relationship,
                                project=self.project).first()
                    except NoResultFound:
                        relationship = GrammaticalRelationship(
                            name=grammatical_relationship,
                            project=self.project)
                    relationships[key] = relationship

                # Read the data for the governor, and find the
                # corresponding word
                governor = Word.query.filter_by(
                    lemma=governor_lemma,
                    surface=governor.lower(),
                    part_of_speech=governor_pos).first()

                # Same as above for the dependent in the relationship
                dependent = Word.query.filter_by(
                    lemma=dependent_lemma,
                    surface=dependent.lower(),
                    part_of_speech=dependent_pos).first()

                # .first() returns None when the word is missing; that
                # indicates the preprocessor was not run on this text.
                if governor is None or dependent is None:
                    project_logger.error(
                        "Governor or dependent not "
                        "found; giving up on parse. This likely indicates "
                        "an error in the preprocessing; rerunning the "
                        "preprocessor is recommended.")
                    project_logger.info(sentence.text)
                    return  # die

                key = (relationship.name, governor.id, dependent.id)

                if key in dependencies:
                    dependency = dependencies[key]
                else:
                    try:
                        dependency = Dependency.query.filter_by(
                            grammatical_relationship=relationship,
                            governor=governor,
                            dependent=dependent
                        ).one()
                    except MultipleResultsFound:
                        project_logger.error("duplicate records found for: %s",
                            str(key))
                        dependency = Dependency.query.filter_by(
                            grammatical_relationship=relationship,
                            governor=governor,
                            dependent=dependent
                        ).first()
                    except NoResultFound:
                        dependency = Dependency(
                            grammatical_relationship=relationship,
                            governor=governor,
                            dependent=dependent
                        )
                    dependencies[key] = dependency

                # Add the dependency to the sentence
                sentence.add_dependency(
                    dependency=dependency,
                    governor_index=governor_index,
                    dependent_index=dependent_index,
                    project=self.project,
                    force=False
                )

                dependency.save(False)
            else:
                # TODO: fill
                pass

    db.session.commit()
def add_grammatical_relations(self, sentence, parsed_sentence,
        relationships, dependencies):
    """Create ``Dependency`` records for each non-ROOT dependency in a
    parsed sentence, attach them to ``sentence``, and commit the session.

    NOTE(review): this is the second definition of this method in the
    file; it shadows the earlier one — the duplicate should be removed.

    :param Sentence sentence: The sentence to attach dependencies to.
    :param dict parsed_sentence: One CoreNLP-style sentence dict with
        "dependencies" and "words" entries.
    :param dict relationships: Cache mapping relationship name ->
        ``GrammaticalRelationship``; may be ``None`` to skip processing.
    :param dict dependencies: Cache mapping
        (relation name, governor id, dependent id) -> ``Dependency``.
    """
    for dependency in parsed_sentence["dependencies"]:
        # We don't want to make a dependency involving ROOT
        if int(dependency[2]) > 0 and int(dependency[4]) > 0:
            governor = dependency[1]
            dependent = dependency[3]
            # CoreNLP indices are 1-based; word list is 0-based.
            governor_index = int(dependency[2]) - 1
            dependent_index = int(dependency[4]) - 1

            governor_pos = parsed_sentence["words"][governor_index][1]\
                ["PartOfSpeech"]
            try:
                governor_lemma = parsed_sentence["words"][governor_index][1]\
                    ["Lemma"].lower()
            except AttributeError:
                # this word wasn't recognized as a word by the parser,
                # it's probably a weird character or something; substitute
                # one "*" per character of the original token.
                governor_lemma = "*" * (
                    int(parsed_sentence["words"][governor_index][1]
                        ["CharacterOffsetEnd"]) -
                    int(parsed_sentence["words"][governor_index][1]
                        ["CharacterOffsetBegin"]))
                governor = governor_lemma

            dependent_pos = parsed_sentence["words"][dependent_index][1]\
                ["PartOfSpeech"]
            try:
                dependent_lemma = parsed_sentence["words"][dependent_index][1]\
                    ["Lemma"].lower()
            except AttributeError:
                # this word wasn't recognized as a word by the parser,
                # it's probably a weird character or something
                dependent_lemma = "*" * (
                    int(parsed_sentence["words"][dependent_index][1]
                        ["CharacterOffsetEnd"]) -
                    int(parsed_sentence["words"][dependent_index][1]
                        ["CharacterOffsetBegin"]))
                dependent = dependent_lemma

            grammatical_relationship = dependency[0]

            # If dictionaries are present, run with duplication handling
            if relationships is not None and dependencies is not None:
                key = grammatical_relationship

                if key in relationships:
                    relationship = relationships[key]
                else:
                    try:
                        relationship = GrammaticalRelationship.query.\
                            filter_by(name=grammatical_relationship,
                                project=self.project).one()
                    except MultipleResultsFound:
                        # Duplicates shouldn't exist; log and fall back to
                        # the first match so we can keep processing.
                        project_logger.error(
                            "duplicate records found "
                            "for: %s", str(key))
                        relationship = GrammaticalRelationship.query.\
                            filter_by(name=grammatical_relationship,
                                project=self.project).first()
                    except NoResultFound:
                        relationship = GrammaticalRelationship(
                            name=grammatical_relationship,
                            project=self.project)
                    relationships[key] = relationship

                # Read the data for the governor, and find the
                # corresponding word
                governor = Word.query.filter_by(
                    lemma=governor_lemma,
                    surface=governor.lower(),
                    part_of_speech=governor_pos).first()

                # Same as above for the dependent in the relationship
                dependent = Word.query.filter_by(
                    lemma=dependent_lemma,
                    surface=dependent.lower(),
                    part_of_speech=dependent_pos).first()

                # .first() returns None when the word is missing; that
                # indicates the preprocessor was not run on this text.
                if governor is None or dependent is None:
                    project_logger.error(
                        "Governor or dependent not "
                        "found; giving up on parse. This likely indicates "
                        "an error in the preprocessing; rerunning the "
                        "preprocessor is recommended.")
                    project_logger.error("Failed to process : %s",
                        sentence.text)
                    return  # die

                key = (relationship.name, governor.id, dependent.id)

                if key in dependencies:
                    dependency = dependencies[key]
                else:
                    try:
                        dependency = Dependency.query.filter_by(
                            grammatical_relationship=relationship,
                            governor=governor,
                            dependent=dependent).one()
                    except MultipleResultsFound:
                        project_logger.error(
                            "duplicate records found for: %s", str(key))
                        dependency = Dependency.query.filter_by(
                            grammatical_relationship=relationship,
                            governor=governor,
                            dependent=dependent).first()
                    except NoResultFound:
                        dependency = Dependency(
                            grammatical_relationship=relationship,
                            governor=governor,
                            dependent=dependent)
                    dependencies[key] = dependency

                # Add the dependency to the sentence
                sentence.add_dependency(dependency=dependency,
                    governor_index=governor_index,
                    dependent_index=dependent_index,
                    project=self.project,
                    force=False)

                dependency.save(False)
            else:
                # TODO: fill
                pass

    db.session.commit()