def __init__(self, document, pipeline):
    """Capture the document and pipeline configuration, then start CoreNLP.

    NOTE(review): relies on ``StanfordCoreNLP`` and the class attribute
    ``PATH_OR_HOST`` being defined by the enclosing class/module --
    confirm against the full file.
    """
    # Plugin-specific settings live under tools/stanfordcorenlp in the config.
    tool_cfg = pipeline["tools"]["stanfordcorenlp"]
    self._processors = tool_cfg["processors"]
    self._lang = pipeline["lang"]
    self._document = document
    # Launch (or connect to) the CoreNLP backend.
    self.nlp = StanfordCoreNLP(self.PATH_OR_HOST)
def parse_each_sent(worker_id, refs):
    """Parse every sentence in *refs* with Stanford CoreNLP and pickle the result.

    Args:
        worker_id: identifier of this worker (used in the output file name
            and in the progress messages).
        refs: iterable of ref dicts, each containing 'ref_id' and a list of
            'sentences', where each sentence has 'sent_id' and 'sent'.

    Side effects:
        Writes ``tmp_folder/parse_result_<worker_id>.p`` containing a dict
        mapping sent_id -> the first parsed sentence returned by CoreNLP.

    NOTE(review): depends on module-level names ``parser_path``,
    ``tmp_folder``, ``StanfordCoreNLP``, ``osp`` and ``pickle`` -- confirm
    they are defined in the full file.
    """
    parser = StanfordCoreNLP(parser_path)
    parse_result = {}
    for ref in refs:
        for sent in ref['sentences']:
            sent_id = sent['sent_id']
            to_be_parse = sent['sent']
            # raw_parse returns a whole-document structure; each input here
            # is a single sentence, so take element 0.
            parse_result[sent_id] = parser.raw_parse(to_be_parse)['sentences'][0]
            # Parenthesized so the progress message works on Python 2 and 3.
            print('mpId_%s, refId_%s, sentId_%s done.' % (worker_id, ref['ref_id'], sent['sent_id']))
    # Fix: pickle streams must be written in *binary* mode ('wb'); text mode
    # corrupts the output on Python 3 and on Windows (newline translation).
    with open(osp.join(tmp_folder, 'parse_result_' + str(worker_id) + '.p'), 'wb') as outfile:
        pickle.dump(parse_result, outfile)
class Plugin(IPlugin):
    """deepnlpf plugin wrapping Stanford CoreNLP sentence annotation."""

    from os import path
    # Local CoreNLP installation (or host spec) bundled under ./resources.
    PATH_OR_HOST = path.abspath(path.dirname(__file__)) + "/resources"

    def __init__(self, document, pipeline):
        """Store the document/pipeline config and start the CoreNLP backend.

        @param document: the document whose sentences will be annotated.
        @param pipeline: pipeline config dict; reads
            tools.stanfordcorenlp.processors and the top-level 'lang'.
        """
        self._document = document
        self._processors = pipeline["tools"]["stanfordcorenlp"]["processors"]
        self._lang = pipeline["lang"]
        self.nlp = StanfordCoreNLP(self.PATH_OR_HOST)

    def run(self):
        """Annotate every sentence of the document in parallel, then shut down.

        @return: the annotated document produced by Boost.multithreading.
        """
        from deepnlpf.core.boost import Boost

        doc = Boost().multithreading(self.wrapper, self._document)
        # Release the CoreNLP process/connection once all sentences are done.
        self.nlp.close()
        return doc

    def wrapper(self, sentence):
        """Annotate a single sentence with the configured CoreNLP processors.

        @param annotators : more: https://stanfordnlp.github.io/CoreNLP/annotators.html
        @param pipelineLanguage : en, zh, ar, fr, de, es
        @param outputFormat : json, xml, text,
            more: https://stanfordnlp.github.io/CoreNLP/human-languages.html
        @param memory : 8g
        """
        props = {
            "timeout": "1500000",
            "annotators": ", ".join(self._processors),
            # Fix: honour the language configured in the pipeline instead of
            # hard-coding "en" -- self._lang was stored in __init__ but
            # previously never used.
            "pipelineLanguage": self._lang,
            "outputFormat": "json",
        }
        return self.nlp.annotate(sentence, properties=props)

    def out_format(self, doc):
        """Placeholder for output post-processing (intentionally unimplemented)."""
        pass
# self.r5 = ['none'] if len(self.r5) == 0 else self.r5 # self.r6 = ['none'] if len(self.r6) == 0 else self.r6 # self.r7 = ['none'] if len(self.r7) == 0 else self.r7 # # # left words -> r8 # left_wds = [word[0] for word in self.leftWords()] # self.r8 = ['none'] if len(left_wds) == 0 else left_wds # # return {'r1': self.r1, 'r2': self.r2, 'r3': self.r3, 'r4': self.r4, 'r5': self.r5, 'r6': self.r6, 'r7': self.r7, 'r8': self.r8} if __name__ == '__main__': import sys from pprint import pprint import os.path as osp ROOT_DIR = osp.abspath('/playpen/licheng/Documents/referit') sys.path.insert(0, osp.join(ROOT_DIR, 'lib', 'utils')) from corenlp.corenlp import StanfordCoreNLP parser_path = osp.join(ROOT_DIR, 'lib', 'utils', 'corenlp', 'stanford-corenlp-full-2015-01-30') stanfordParser = StanfordCoreNLP(parser_path) sent = 'woman in red shirt' parse = stanfordParser.raw_parse(sent)['sentences'][0] pprint(parse['dependencies']) attParser = ClefParser() attParser.reset(parse) pprint(attParser.decompose()) pprint(attParser.leftWords())