def dependency_relation2(f2):
    """Parse raw text into a flat list of Stanford dependency triples.

    The text is sentence-split, whitespace-tokenized and POS-tagged with
    NLTK, then handed to the Stanford dependency parser configured from
    the STANFORD_PARSER / STANFORD_MODELS environment variables.
    """
    dep_parser = StanfordDependencyParser(
        os.environ['STANFORD_PARSER'],
        os.environ['STANFORD_MODELS'],
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

    # POS-tag every sentence up front; the parser consumes tagged sentences.
    tagged_sentences = []
    for sentence in sent_tokenize(f2):
        tagged_sentences.append(pos_tag(sentence.split()))

    # tagged_parse_sents yields, per sentence, an iterator of parse graphs;
    # collect each graph's triples and flatten the sentence level away.
    per_graph_triples = sum(
        [[list(graph.triples()) for graph in graphs]
         for graphs in dep_parser.tagged_parse_sents(tagged_sentences)],
        [])

    # Flatten one more level so the caller gets a single list of
    # (head, relation, dependent) triples.
    return [triple for triples in per_graph_triples for triple in triples]
class StanfordNLTKWrapper:
    """Bundle the CoreNLP tokenizer, the Stanford POS tagger and the
    Stanford dependency parser behind a single configured object.

    All tool locations (server URL/port, model and jar paths) are read
    from the 'common_tools' section of the given JSON config file.
    """

    def __init__(self, config_file_path='aida_event/config/xmie.json'):
        self._config = read_dict_from_json_file(config_file_path)
        tools = self._config['common_tools']
        self._domain_name = tools['stanford_url']
        self._port_number = tools['stanford_port']
        self._pos_model = tools['stanford_pos_model']
        self._pos_jar = tools['stanford_pos_jar']
        self._parser_model = tools['stanford_parser_model']
        self._parser_jar = tools['stanford_parser_jar']
        # Tokenization goes through a running CoreNLP server; tagging and
        # parsing shell out to the local Stanford jars.
        self._core_nlp_parser = CoreNLPParser(
            url='%s:%s' % (self._domain_name, self._port_number))
        self._pos_tagger = StanfordPOSTagger(
            model_filename=self._pos_model, path_to_jar=self._pos_jar)
        self._dep_parser = StanfordDependencyParser(
            path_to_jar=self._parser_jar,
            path_to_models_jar=self._parser_model,
            java_options='-Xmx16G')

    def tokenizer(self, input_text):
        """Tokenize raw text via the CoreNLP server; returns a token list."""
        return list(self._core_nlp_parser.tokenize(input_text))

    def pos_tag(self, input_tokenized_sentence):
        """POS-tag a single tokenized sentence."""
        return self._pos_tagger.tag(input_tokenized_sentence)

    def pos_tag_sentences(self, input_tokenized_sentences):
        """POS-tag a batch of tokenized sentences in one tagger call."""
        return self._pos_tagger.tag_sents(input_tokenized_sentences)

    def dependency_parser(self, input_tokenized_pos_tagged_sentence):
        """Dependency-parse a single POS-tagged sentence."""
        return self._dep_parser.tagged_parse(
            input_tokenized_pos_tagged_sentence)

    def dependency_parser_sentences(self,
                                    input_tokenized_pos_tagged_sentences):
        """Dependency-parse a batch of POS-tagged sentences."""
        return self._dep_parser.tagged_parse_sents(
            input_tokenized_pos_tagged_sentences)
def main(input_file, output_file, language):
    """Run the full demo pipeline on *input_file* and write CoNLL output.

    Steps (English only): sentence breaking, tokenization, stemming (shown
    purely as an example), Stanford POS tagging, and Stanford dependency
    parsing.  Progress messages go to stderr; intermediate results are
    printed to stdout; the parsed sentences are written to *output_file*
    in 10-column CoNLL format.
    """
    sys.stderr.write("Starting to process text in file {0} at {1}\n".format(
        input_file.name, str(datetime.datetime.now())))
    text = input_file.read()
    print("============= Raw text: =============")
    print(text)
    # Needed to figure out the path of dependencies when executing from
    # different directories.
    script_path = os.path.dirname(os.path.realpath(__file__))
    # This example is only for English (as ... NLTK has no models for Latvian).
    if language == "en":
        # First, we perform sentence breaking.
        sentences = sent_tokenize(text.strip(), language='english')
        print("============= Sentences: ============")
        print(sentences)
        # Then, we perform tokenization.
        tokens = [word_tokenize(s, language='english') for s in sentences]
        print("============== Tokens: ==============")
        print(tokens)
        # In some cases (e.g., for indexing and search related tasks) it may
        # be enough to perform stemming of the text.  This is, however, not
        # needed nor for tagging, nor for parsing.  It is included only as an
        # example.
        stemmer = PorterStemmer(mode='NLTK_EXTENSIONS')
        stemmed_data = [[stemmer.stem(t) for t in s] for s in tokens]
        print("========== Stemmed tokens: ==========")
        print(stemmed_data)
        # Then, we execute the Stanford log linear (maximum entropy-based)
        # part-of-speech tagger.
        tagger_jar = os.path.join(script_path, "dependencies",
                                  "stanford-postagger-2016-10-31",
                                  "stanford-postagger.jar")
        tagger_model = os.path.join(script_path, "dependencies",
                                    "stanford-postagger-2016-10-31", "models",
                                    "english-bidirectional-distsim.tagger")
        pos_tagger = StanfordPOSTagger(tagger_model, tagger_jar,
                                       encoding='utf8')
        tagged_data = [pos_tagger.tag(s) for s in tokens]
        print("========= Tagged sentences: =========")
        print(tagged_data)
        # When the data is tagged, we perform syntactic parsing using the
        # Stanford parser.
        parser_jar = os.path.join(script_path, "dependencies",
                                  "stanford-parser-full-2016-10-31",
                                  "stanford-parser.jar")
        parser_model = os.path.join(script_path, "dependencies",
                                    "stanford-parser-full-2016-10-31",
                                    "stanford-parser-3.7.0-models.jar")
        parser = StanfordDependencyParser(
            model_path="edu/stanford/nlp/models/lexparser/englishFactored.ser.gz",
            path_to_models_jar=parser_model,
            path_to_jar=parser_jar)
        parsed_data = parser.tagged_parse_sents(tagged_data)
        # Finally, we print the result to the output file.
        # Note that the Stanford parser deleted all punctuation marks and the
        # data also lacks lemmas.  There is a way to get them back - create a
        # class that inherits from StanfordDependencyParser and add
        # "-outputFormatOptions includePunctuationDependencies" to the cmd
        # that executes the parser ... or use the Stanford Neural Dependency
        # Parser instead!  For the example, I did not want to overly
        # complicate the code.
        print("========= Parsed sentences: =========")
        for sentence_graphs in parsed_data:
            for graph in sentence_graphs:
                output_file.write(graph.to_conll(10))
                print(graph.to_conll(10))
                output_file.write("\n")
    sys.stderr.write("... processing completed at {0}\n".format(
        str(datetime.datetime.now())))