def parse(self, sentence, relationships=None, dependencies=None,
            max_length=30):
        """Parse a ``Sentence`` and extract dependencies, parse trees, etc.

        The sentence text is passed to ``self.parse_with_error_handling``.
        Each dependency of the first parsed sentence that does not involve
        ROOT is resolved to a ``GrammaticalRelationship``, a governor and a
        dependent ``Word``, and a ``Dependency``; the dependency is then
        attached to ``sentence`` and saved.

        Note that for max_length, a "word" is defined as something with a space
        on at least one side. This is not the typical definition of "word".
        This is done so that length can be checked before resources are
        committed to processing a very long sentence.
        NOTE(review): ``max_length`` is accepted but never read in this
        method -- presumably the length check lives elsewhere; confirm.

        :param Sentence sentence: The ``Sentence`` object.
        :param dict relationships: Cache mapping a relationship name to its
            ``GrammaticalRelationship``; new entries are added in place.
        :param dict dependencies: Cache mapping
            ``(relationship name, governor id, dependent id)`` to its
            ``Dependency``; new entries are added in place.
            If either cache is ``None`` no dependency is recorded.
        :param int max_length: The most amount of words to process.
        :returns: ``None`` if the parse failed, otherwise ``sentence`` (also
            when a word lookup fails and processing stops part-way).
        """

        parsed = self.parse_with_error_handling(sentence.text)

        # If the parse was unsuccessful, exit
        if parsed is None:
            return None

        parsed_sentence = parsed["sentences"][0]

        if len(parsed["sentences"]) > 1:
            project_logger.warning("More than one sentence passed in to"
                " StringProcessor.parse().")
            parsed_sentence["text"] += parsed["sentences"][1]["text"]

        use_caches = relationships is not None and dependencies is not None

        for parsed_dependency in parsed_sentence["dependencies"]:
            # We don't want to make a dependency involving ROOT
            if int(parsed_dependency[2]) <= 0 or int(parsed_dependency[4]) <= 0:
                continue

            relationship_name = parsed_dependency[0]
            governor_surface = parsed_dependency[1]
            dependent_surface = parsed_dependency[3]
            # The parser's token indices are 1-based; "words" is 0-based.
            governor_index = int(parsed_dependency[2]) - 1
            dependent_index = int(parsed_dependency[4]) - 1

            governor_info = parsed_sentence["words"][governor_index][1]
            dependent_info = parsed_sentence["words"][dependent_index][1]
            governor_pos = governor_info["PartOfSpeech"]
            governor_lemma = governor_info["Lemma"]
            dependent_pos = dependent_info["PartOfSpeech"]
            dependent_lemma = dependent_info["Lemma"]

            if not use_caches:
                # TODO: fill
                continue

            # Resolve the grammatical relationship: cache first, then the
            # database, creating a new record if none exists.
            if relationship_name in relationships:
                relationship = relationships[relationship_name]
            else:
                try:
                    relationship = GrammaticalRelationship.query.\
                        filter_by(name=relationship_name).\
                        one()
                except MultipleResultsFound:
                    project_logger.error("duplicate records found "
                        "for: %s", str(relationship_name))
                    # No unambiguous relationship is available; skip this
                    # dependency instead of reusing a stale or unbound one.
                    continue
                except NoResultFound:
                    relationship = GrammaticalRelationship(
                        name=relationship_name)

                relationships[relationship_name] = relationship

            # Read the data for the governor, and find the
            # corresponding word
            governor = Word.query.filter_by(
                word=governor_surface,
                lemma=governor_lemma,
                part_of_speech=governor_pos
            ).first()

            # Same as above for the dependent in the relationship
            dependent = Word.query.filter_by(
                word=dependent_surface,
                lemma=dependent_lemma,
                part_of_speech=dependent_pos
            ).first()

            # ``first()`` yields None when nothing matched.
            if governor is None or dependent is None:
                project_logger.error("Governor or dependent not "
                    "found; giving up on parse. This likely indicates"
                    " an error in the preprocessing; rerunning the "
                    "preprocessor is recommended.")
                project_logger.info(sentence)
                return sentence

            key = (relationship.name, governor.id, dependent.id)

            if key in dependencies:
                dependency = dependencies[key]
            else:
                try:
                    dependency = Dependency.query.filter_by(
                        grammatical_relationship=relationship,
                        governor=governor,
                        dependent=dependent
                    ).one()
                except MultipleResultsFound:
                    project_logger.error("duplicate records found for: %s",
                        str(key))
                    # Do not cache or attach anything for an ambiguous match.
                    continue
                except NoResultFound:
                    dependency = Dependency(
                        grammatical_relationship=relationship,
                        governor=governor,
                        dependent=dependent
                    )

                dependencies[key] = dependency

            # Add the dependency to the sentence
            sentence.add_dependency(
                dependency=dependency,
                governor_index=governor_index,
                dependent_index=dependent_index,
                project=self.project,
                force=False
            )

            dependency.save(False)

        return sentence
    def add_grammatical_relations(self, sentence, parsed_sentence, relationships, dependencies):
        """Attach the dependencies of ``parsed_sentence`` to ``sentence``.

        For every entry of ``parsed_sentence["dependencies"]`` that does not
        involve ROOT, the grammatical relationship, the governor and dependent
        ``Word`` records and the ``Dependency`` are looked up (or created),
        the dependency is added to ``sentence`` and saved. The database
        session is committed once all entries have been processed.

        :param Sentence sentence: The ``Sentence`` receiving the dependencies.
        :param dict parsed_sentence: Parser output; the ``"dependencies"`` and
            ``"words"`` entries are read.
        :param dict relationships: Cache mapping a relationship name to its
            ``GrammaticalRelationship``; new entries are added in place.
        :param dict dependencies: Cache mapping
            ``(relationship name, governor id, dependent id)`` to its
            ``Dependency``; new entries are added in place.
            If either cache is ``None`` no dependency is recorded.
        :returns: ``None``. When a governor or dependent word cannot be
            found, the method returns early without committing.
        """

        def lemma_and_surface(token_info, surface):
            """Return ``(lemma, surface)`` for one token of the parse."""
            try:
                return token_info["Lemma"].lower(), surface
            except AttributeError:
                # this word wasn't recognized as a word by the parser,
                # it's probably a weird character or something; stand in a
                # run of "*" as long as the token's character span
                placeholder = "*" * (
                    int(token_info["CharacterOffsetEnd"]) -
                    int(token_info["CharacterOffsetBegin"]))
                return placeholder, placeholder

        use_caches = relationships is not None and dependencies is not None

        for parsed_dependency in parsed_sentence["dependencies"]:
            # We don't want to make a dependency involving ROOT
            if int(parsed_dependency[2]) <= 0 or int(parsed_dependency[4]) <= 0:
                continue

            relationship_name = parsed_dependency[0]
            # The parser's token indices are 1-based; "words" is 0-based.
            governor_index = int(parsed_dependency[2]) - 1
            dependent_index = int(parsed_dependency[4]) - 1

            governor_info = parsed_sentence["words"][governor_index][1]
            dependent_info = parsed_sentence["words"][dependent_index][1]

            governor_pos = governor_info["PartOfSpeech"]
            governor_lemma, governor_surface = lemma_and_surface(
                governor_info, parsed_dependency[1])
            dependent_pos = dependent_info["PartOfSpeech"]
            dependent_lemma, dependent_surface = lemma_and_surface(
                dependent_info, parsed_dependency[3])

            if not use_caches:
                # TODO: fill
                continue

            # Resolve the grammatical relationship: cache first, then the
            # database, creating a new record if none exists.
            if relationship_name in relationships:
                relationship = relationships[relationship_name]
            else:
                try:
                    relationship = GrammaticalRelationship.query.\
                        filter_by(name=relationship_name,
                                  project=self.project).one()
                except MultipleResultsFound:
                    project_logger.error("duplicate records found "
                                         "for: %s", str(relationship_name))
                    # No unambiguous relationship is available; skip this
                    # dependency instead of reusing a stale or unbound one.
                    continue
                except NoResultFound:
                    relationship = GrammaticalRelationship(
                        name=relationship_name,
                        project=self.project)

                relationships[relationship_name] = relationship

            # Read the data for the governor, and find the
            # corresponding word
            governor = Word.query.filter_by(
                lemma=governor_lemma,
                surface=governor_surface.lower(),
                part_of_speech=governor_pos).first()

            # Same as above for the dependent in the relationship
            dependent = Word.query.filter_by(
                lemma=dependent_lemma,
                surface=dependent_surface.lower(),
                part_of_speech=dependent_pos).first()

            # ``first()`` yields None when nothing matched.
            if governor is None or dependent is None:
                project_logger.error(
                    "Governor or dependent not "
                    "found; giving up on parse. This likely indicates "
                    "an error in the preprocessing; rerunning the "
                    "preprocessor is recommended.")
                project_logger.info(sentence.text)

                return  # die

            key = (relationship.name, governor.id, dependent.id)

            if key in dependencies:
                dependency = dependencies[key]
            else:
                try:
                    dependency = Dependency.query.filter_by(
                        grammatical_relationship=relationship,
                        governor=governor,
                        dependent=dependent
                    ).one()
                except MultipleResultsFound:
                    project_logger.error("duplicate records found for: %s",
                                         str(key))
                    # Do not cache or attach anything for an ambiguous match.
                    continue
                except NoResultFound:
                    dependency = Dependency(
                        grammatical_relationship=relationship,
                        governor=governor,
                        dependent=dependent
                    )

                dependencies[key] = dependency

            # Add the dependency to the sentence
            sentence.add_dependency(
                dependency=dependency,
                governor_index=governor_index,
                dependent_index=dependent_index,
                project=self.project,
                force=False
            )

            dependency.save(False)

        db.session.commit()
Exemple #3
0
    def add_grammatical_relations(self, sentence, parsed_sentence,
                                  relationships, dependencies):
        """Record the grammatical dependencies of a parsed sentence.

        Walks ``parsed_sentence["dependencies"]``, skipping entries that
        involve ROOT. For each remaining entry the relationship, the governor
        and dependent ``Word`` records and the ``Dependency`` are fetched from
        the caches or the database (and created when missing); the dependency
        is then added to ``sentence`` and saved. The database session is
        committed after the loop.

        :param Sentence sentence: The ``Sentence`` receiving the dependencies.
        :param dict parsed_sentence: Parser output; the ``"dependencies"`` and
            ``"words"`` entries are read.
        :param dict relationships: Cache mapping a relationship name to its
            ``GrammaticalRelationship``; new entries are added in place.
        :param dict dependencies: Cache mapping
            ``(relationship name, governor id, dependent id)`` to its
            ``Dependency``; new entries are added in place.
            If either cache is ``None`` no dependency is recorded.
        :returns: ``None``. When a governor or dependent word cannot be
            found, the method returns early without committing.
        """

        def resolve_token(index, surface):
            """Return ``(part of speech, lemma, surface)`` for a token."""
            info = parsed_sentence["words"][index][1]
            part_of_speech = info["PartOfSpeech"]
            try:
                lemma = info["Lemma"].lower()
            except AttributeError:
                # this word wasn't recognized as a word by the parser,
                # it's probably a weird character or something; use a run
                # of "*" spanning the token's character offsets instead
                lemma = "*" * (int(info["CharacterOffsetEnd"]) -
                               int(info["CharacterOffsetBegin"]))
                surface = lemma
            return part_of_speech, lemma, surface

        caches_present = (relationships is not None and
                          dependencies is not None)

        for entry in parsed_sentence["dependencies"]:
            # We don't want to make a dependency involving ROOT
            if not (int(entry[2]) > 0 and int(entry[4]) > 0):
                continue

            # The parser's token indices are 1-based; "words" is 0-based.
            governor_index = int(entry[2]) - 1
            dependent_index = int(entry[4]) - 1
            governor_pos, governor_lemma, governor_surface = resolve_token(
                governor_index, entry[1])
            dependent_pos, dependent_lemma, dependent_surface = resolve_token(
                dependent_index, entry[3])
            grammatical_relationship = entry[0]

            if not caches_present:
                # TODO: fill
                continue

            # Resolve the grammatical relationship: cache first, then the
            # database, creating a new record if none exists.
            if grammatical_relationship in relationships:
                relationship = relationships[grammatical_relationship]
            else:
                try:
                    relationship = GrammaticalRelationship.query.\
                        filter_by(name=grammatical_relationship,
                                  project=self.project).one()
                except MultipleResultsFound:
                    project_logger.error(
                        "duplicate records found "
                        "for: %s", str(grammatical_relationship))
                    # Ambiguous relationship: skip this dependency rather
                    # than continue with a stale or unbound value.
                    continue
                except NoResultFound:
                    relationship = GrammaticalRelationship(
                        name=grammatical_relationship,
                        project=self.project)

                relationships[grammatical_relationship] = relationship

            # Read the data for the governor, and find the
            # corresponding word
            governor = Word.query.filter_by(
                lemma=governor_lemma,
                surface=governor_surface.lower(),
                part_of_speech=governor_pos).first()

            # Same as above for the dependent in the relationship
            dependent = Word.query.filter_by(
                lemma=dependent_lemma,
                surface=dependent_surface.lower(),
                part_of_speech=dependent_pos).first()

            # ``first()`` yields None when nothing matched.
            if governor is None or dependent is None:
                project_logger.error(
                    "Governor or dependent not "
                    "found; giving up on parse. This likely indicates "
                    "an error in the preprocessing; rerunning the "
                    "preprocessor is recommended.")
                project_logger.error("Failed to process : %s",
                                     sentence.text)

                return  # die

            key = (relationship.name, governor.id, dependent.id)

            if key in dependencies:
                dependency = dependencies[key]
            else:
                try:
                    dependency = Dependency.query.filter_by(
                        grammatical_relationship=relationship,
                        governor=governor,
                        dependent=dependent).one()
                except MultipleResultsFound:
                    project_logger.error(
                        "duplicate records found for: %s", str(key))
                    # Do not cache or attach anything for an ambiguous match.
                    continue
                except NoResultFound:
                    dependency = Dependency(
                        grammatical_relationship=relationship,
                        governor=governor,
                        dependent=dependent)

                dependencies[key] = dependency

            # Add the dependency to the sentence
            sentence.add_dependency(dependency=dependency,
                                    governor_index=governor_index,
                                    dependent_index=dependent_index,
                                    project=self.project,
                                    force=False)

            dependency.save(False)

        db.session.commit()