Example #1
import os

from nltk import pos_tag, sent_tokenize
from nltk.parse.stanford import StanfordDependencyParser


def dependency_relation2(f2):
    # Jar locations come from the environment; the PCFG model is loaded from
    # inside the models jar.
    dep_parser = StanfordDependencyParser(
        os.environ['STANFORD_PARSER'], os.environ['STANFORD_MODELS'],
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    # Sentence-split the text, then POS-tag the whitespace tokens of each sentence.
    sentences = []
    for sentence in sent_tokenize(f2):
        sentences.append(pos_tag(sentence.split()))
    # Parse all tagged sentences and flatten the per-sentence lists of
    # dependency triples into a single flat list.
    triples = sum([[list(parse.triples()) for parse in dep_graphs]
                   for dep_graphs in dep_parser.tagged_parse_sents(sentences)],
                  [])
    return [item for sublist in triples for item in sublist]
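
A minimal usage sketch, assuming STANFORD_PARSER and STANFORD_MODELS point at a local stanford-parser.jar and its companion models jar (the paths below are placeholders):

os.environ['STANFORD_PARSER'] = '/opt/stanford/stanford-parser.jar'         # placeholder path
os.environ['STANFORD_MODELS'] = '/opt/stanford/stanford-parser-models.jar'  # placeholder path

# Each triple has the form ((governor, tag), relation, (dependent, tag)),
# e.g. (('jumps', 'VBZ'), 'nsubj', ('fox', 'NN')).
for governor, relation, dependent in dependency_relation2(
        "The quick brown fox jumps over the lazy dog."):
    print(governor, relation, dependent)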
Example #2
import json

from nltk.parse.corenlp import CoreNLPParser
from nltk.parse.stanford import StanfordDependencyParser
from nltk.tag.stanford import StanfordPOSTagger


def read_dict_from_json_file(path):
    # The original imports this project-local helper; a minimal stand-in is
    # assumed here so the example is self-contained.
    with open(path) as f:
        return json.load(f)


class StanfordNLTKWrapper:
    def __init__(self, config_file_path='aida_event/config/xmie.json'):
        self._config = read_dict_from_json_file(config_file_path)
        self._domain_name = self._config['common_tools']['stanford_url']
        self._port_number = self._config['common_tools']['stanford_port']
        self._pos_model = self._config['common_tools']['stanford_pos_model']
        self._pos_jar = self._config['common_tools']['stanford_pos_jar']
        self._parser_model = self._config['common_tools']['stanford_parser_model']
        self._parser_jar = self._config['common_tools']['stanford_parser_jar']

        # The tokenizer talks to a running CoreNLP server over HTTP; the POS
        # tagger and the dependency parser shell out to the local Stanford jars.
        self._core_nlp_parser = CoreNLPParser(
            url='%s:%s' % (self._domain_name, self._port_number))
        self._pos_tagger = StanfordPOSTagger(model_filename=self._pos_model,
                                             path_to_jar=self._pos_jar)
        self._dep_parser = StanfordDependencyParser(
            path_to_jar=self._parser_jar,
            path_to_models_jar=self._parser_model,
            java_options='-Xmx16G')

    def tokenizer(self, input_text):
        return list(self._core_nlp_parser.tokenize(input_text))

    def pos_tag(self, input_tokenized_sentence):
        return self._pos_tagger.tag(input_tokenized_sentence)

    def pos_tag_sentences(self, input_tokenized_sentences):
        return self._pos_tagger.tag_sents(input_tokenized_sentences)

    def dependency_parser(self, input_tokenized_pos_tagged_sentence):
        return self._dep_parser.tagged_parse(
            input_tokenized_pos_tagged_sentence)

    def dependency_parser_sentences(self,
                                    input_tokenized_pos_tagged_sentences):
        return self._dep_parser.tagged_parse_sents(
            input_tokenized_pos_tagged_sentences)
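
A minimal usage sketch for the wrapper; it assumes the config file exists and a CoreNLP server is reachable at the configured URL and port:

wrapper = StanfordNLTKWrapper('aida_event/config/xmie.json')

tokens = wrapper.tokenizer('The quick brown fox jumps over the lazy dog.')
tagged = wrapper.pos_tag(tokens)

# dependency_parser() yields DependencyGraph objects for the sentence.
for graph in wrapper.dependency_parser(tagged):
    print(graph.to_conll(10))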
Example #3

import datetime
import os
import sys

from nltk.parse.stanford import StanfordDependencyParser
from nltk.stem.porter import PorterStemmer
from nltk.tag.stanford import StanfordPOSTagger
from nltk.tokenize import sent_tokenize, word_tokenize


def main(input_file, output_file, language):
    sys.stderr.write("Starting to process text in file {0} at {1}\n".format(
        input_file.name, str(datetime.datetime.now())))
    text = input_file.read()
    print "============= Raw text: ============="
    print text

    # Needed to figure out the path of dependencies when executing from
    # different directories.
    script_path = os.path.dirname(os.path.realpath(__file__))

    # This example is only for English (as ... NLTK has no models for Latvian).
    if language == "en":
        # First, we perform sentence breaking.
        sentences = sent_tokenize(text.strip(), language='english')
        print("============= Sentences: ============")
        print(sentences)
        # Then, we perform tokenization.
        tokens = [word_tokenize(s, language='english') for s in sentences]
        print("============== Tokens: ==============")
        print(tokens)
        # In some cases (e.g., for indexing and search-related tasks) it may be
        # enough to perform stemming of the text. This is, however, needed
        # neither for tagging nor for parsing; it is included only as an example.
        stemmer = PorterStemmer(mode='NLTK_EXTENSIONS')
        stemmed_data = [[stemmer.stem(t) for t in s] for s in tokens]
        print("========== Stemmed tokens: ==========")
        print(stemmed_data)
        # Then, we execute the Stanford log-linear (maximum entropy-based)
        # part-of-speech tagger.
        tagger_jar = os.path.join(script_path, "dependencies",
                                  "stanford-postagger-2016-10-31",
                                  "stanford-postagger.jar")
        tagger_model = os.path.join(script_path, "dependencies",
                                    "stanford-postagger-2016-10-31", "models",
                                    "english-bidirectional-distsim.tagger")
        pos_tagger = StanfordPOSTagger(tagger_model,
                                       tagger_jar,
                                       encoding='utf8')
        tagged_data = [pos_tagger.tag(s) for s in tokens]
        print("========= Tagged sentences: =========")
        print(tagged_data)
        # When the data is tagged, we perform syntactic parsing using the
        # Stanford parser.
        parser_jar = os.path.join(script_path, "dependencies",
                                  "stanford-parser-full-2016-10-31",
                                  "stanford-parser.jar")
        parser_model = os.path.join(script_path, "dependencies",
                                    "stanford-parser-full-2016-10-31",
                                    "stanford-parser-3.7.0-models.jar")
        parser = StanfordDependencyParser(
            model_path="edu/stanford/nlp/models/lexparser/englishFactored.ser.gz",
            path_to_models_jar=parser_model,
            path_to_jar=parser_jar)
        parsed_data = parser.tagged_parse_sents(tagged_data)
        # Finally, we print the result to the output file.
        # Note that the Stanford parser deletes all punctuation marks, and the
        # output also lacks lemmas. There is a way to get the punctuation back:
        # create a class that inherits from StanfordDependencyParser and add
        # "-outputFormatOptions includePunctuationDependencies" to the cmd that
        # executes the parser (see the sketch after this example) ... or use
        # the Stanford Neural Dependency Parser instead! For this example, I
        # did not want to overly complicate the code.
        print("========= Parsed sentences: =========")
        for parsed_sentence in parsed_data:
            for dependency_graph in parsed_sentence:
                output_file.write(dependency_graph.to_conll(10))
                print(dependency_graph.to_conll(10))
            output_file.write("\n")

    sys.stderr.write("... processing completed at {0}\n".format(
        str(datetime.datetime.now())))
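
As the comment above suggests, punctuation can be kept by subclassing the parser. A minimal sketch; the _execute hook is an internal NLTK detail, so this override point is an assumption about NLTK's implementation:

class PunctuationKeepingDependencyParser(StanfordDependencyParser):
    # Assumption: NLTK builds the Java command line as a list and runs it
    # through the internal _execute() hook; we append the extra option there.
    def _execute(self, cmd, input_, verbose=False):
        cmd.extend(['-outputFormatOptions', 'includePunctuationDependencies'])
        return super()._execute(cmd, input_, verbose)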
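
A possible command-line entry point for main(); the argparse flags below are illustrative, not part of the original:

if __name__ == '__main__':
    import argparse

    arg_parser = argparse.ArgumentParser(description='NLTK + Stanford parser demo')
    arg_parser.add_argument('--input', type=argparse.FileType('r'), required=True)
    arg_parser.add_argument('--output', type=argparse.FileType('w'), required=True)
    arg_parser.add_argument('--language', default='en')
    args = arg_parser.parse_args()

    main(args.input, args.output, args.language)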