class Tokenizer:

    def __init__(self) -> None:
        os.environ['CORENLP_HOME'] = '{}/stanford-corenlp-full-2018-10-05'.format(
            os.environ['HOME'])
        self.client = CoreNLPClient(annotators=['ssplit'])
        self.client.ensure_alive()
        self.do_lower_case = '-cased' not in config.bert_model
        self.basic_tokenizer: BasicTokenizer \
            = BertTokenizer.from_pretrained(config.bert_model,
                                            do_lower_case=self.do_lower_case).basic_tokenizer

    def tokenize(self, doc: str) -> List[List[Token]]:
        corenlp_annotation = self.client.annotate(doc)
        sentences = []
        for sentence in corenlp_annotation.sentence:
            text = doc[sentence.characterOffsetBegin:sentence.characterOffsetEnd]
            if self.do_lower_case:
                text = text.lower()
            offset = sentence.characterOffsetBegin
            bert_tokens = self.basic_tokenizer.tokenize(text)
            begin = 0
            tokens = []
            for bert_token in bert_tokens:
                word = bert_token
                begin = text.index(word, begin)
                end = begin + len(word)
                tokens.append(Token(word, begin + offset, end + offset))
                begin = end
            if len(tokens) > 0:
                sentences.append(tokens)
        return sentences
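# Illustrative usage of the Tokenizer above (not part of the original snippet); it assumes
# CoreNLP is unpacked under $HOME and that `config.bert_model` and the `Token` container
# are defined by the surrounding module.
tokenizer = Tokenizer()
for sentence in tokenizer.tokenize('Barack Obama was born in Hawaii. He was elected in 2008.'):
    print(sentence)  # one list of Token(word, start_char, end_char) per sentence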
def get_corenlp_client(corenlp_path, corenlp_port):
    os.environ["CORENLP_HOME"] = corenlp_path
    assert not is_port_occupied(corenlp_port), \
        "Port {} is occupied by other process".format(corenlp_port)
    corenlp_client = CoreNLPClient(
        annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'depparse'],
        timeout=60000,
        memory='5G',
        endpoint="http://localhost:%d" % corenlp_port,
        start_server=True,
        be_quiet=False)
    # warm-up request so the server is started before the caller uses the client
    corenlp_client.annotate(
        "hello world",
        annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'depparse'],
        output_format="json")
    return corenlp_client
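# A hedged usage sketch for the helper above; the CoreNLP path and port are placeholders
# and `is_port_occupied` is assumed to be defined elsewhere in the module.
client = get_corenlp_client('/path/to/stanford-corenlp-full-2018-10-05', 9000)
ann = client.annotate('The quick brown fox jumps over the lazy dog.',
                      annotators=['tokenize', 'ssplit', 'pos'],
                      output_format='json')
print([token['word'] for token in ann['sentences'][0]['tokens']])
client.stop()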
class StanfordCoreferenceResolver(CoreferenceResolver):

    def __init__(self, start_server=True, endpoint=CoreNLPClient.DEFAULT_ENDPOINT):
        # resolve_coreferences reads protobuf fields (corefChain, sentence, token), so the
        # client is left on its default output format instead of output_format='json'.
        self.__client = CoreNLPClient(
            start_server=start_server,
            endpoint=endpoint,
            annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'coref'])
        self.__client.start()

    def __del__(self):
        self.__client.stop()

    def resolve_coreferences(self, text, entities):
        annotations = self.__client.annotate(text)
        entity_mention_indices = []
        for chain in annotations.corefChain:
            mention_indices = []
            for mention in chain.mention:
                sentence = annotations.sentence[mention.sentenceIndex]
                token_start = sentence.token[mention.beginIndex]
                token_end = sentence.token[mention.endIndex - 1]
                char_start = token_start.beginChar
                char_end = token_end.endChar
                mention_indices.append((char_start, char_end))
            entity_mention_indices.append(mention_indices)

        entity_sets = [list() for _ in range(len(entity_mention_indices))]
        for entity in entities:
            is_coreferred = False
            for i, mention_indices in enumerate(entity_mention_indices):
                for start_index, end_index in mention_indices:
                    if entity.start_offset >= start_index and entity.end_offset <= end_index:
                        entity_sets[i].append(entity)
                        is_coreferred = True
            if not is_coreferred:
                entity_sets.append([entity])
        return entity_sets
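# Possible call pattern for the resolver above (illustrative only); `entities` is whatever
# entity type the surrounding project uses, and only its .start_offset / .end_offset
# character offsets are read by resolve_coreferences.
resolver = StanfordCoreferenceResolver(start_server=True)
groups = resolver.resolve_coreferences(
    'Barack Obama was born in Hawaii. He was elected president in 2008.', entities=[])
for group in groups:
    print(group)  # entities grouped by the CoreNLP coreference chain they fall into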
class StanfordOpenIE: def __init__(self, core_nlp_version: str = '2018-10-05', threads: int = 5, close_after_finish: bool = True): self.remote_url = 'http://nlp.stanford.edu/software/stanford-corenlp-full-{}.zip'.format( core_nlp_version) self.install_dir = Path(os.environ['STANFORD_HOME']).expanduser() self.install_dir.mkdir(exist_ok=True) if not (self.install_dir / Path( 'stanford-corenlp-full-{}'.format(core_nlp_version))).exists(): print('Downloading to %s.' % self.install_dir) output_filename = wget.download(self.remote_url, out=str(self.install_dir)) print('\nExtracting to %s.' % self.install_dir) zf = ZipFile(output_filename) zf.extractall(path=self.install_dir) zf.close() os.environ['CORENLP_HOME'] = str(self.install_dir / 'stanford-corenlp-full-2018-10-05') from stanfordnlp.server import CoreNLPClient self.close_after_finish = close_after_finish self.client = CoreNLPClient(annotators=['openie'], memory='8G', threads=threads) def get_openie_with_boundary(self, annotation: Dict, remove_dup: bool = False) -> List[Triple]: triples: List[Triple] = [] dup: Set['unique'] = set() for sentence in annotation['sentences']: tokens = sentence['tokens'] for triple in sentence['openie']: new_triple = {} for field in ['subject', 'relation', 'object']: text = triple[field] s, e = triple[field + 'Span'] s = tokens[s]['characterOffsetBegin'] e = tokens[e - 1]['characterOffsetEnd'] new_triple[field] = Span(text=text, start=s, end=e) key = '\t'.join([ '{}-{}'.format(new_triple[field].start, new_triple[field].end) for field in ['subject', 'relation', 'object'] ]) if remove_dup and key in dup: continue triples.append(Triple(**new_triple)) dup.add(key) return triples def annotate(self, text: str, properties_key: str = None, properties: dict = None, simple_format: bool = True, remove_dup: bool = False, max_len: int = 15000): """ :param (str | unicode) text: raw text for the CoreNLPServer to parse :param (str) properties_key: key into properties cache for the client :param (dict) properties: additional request properties (written on top of defaults) :param (bool) simple_format: whether to return the full format of CoreNLP or a simple dict. :return: Depending on simple_format: full or simpler format of triples <subject, relation, object>. """ if len(text) >= max_len: return [] # https://stanfordnlp.github.io/CoreNLP/openie.html core_nlp_output = self.client.annotate(text=text, annotators=['openie'], output_format='json', properties_key=properties_key, properties=properties) if simple_format: return self.get_openie_with_boundary(core_nlp_output, remove_dup=remove_dup) else: return core_nlp_output def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): pass def __del__(self): if self.close_after_finish: self.client.stop() del os.environ['CORENLP_HOME']
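# Sketch of driving the boundary-aware wrapper above (not part of the original snippet);
# STANFORD_HOME is a placeholder directory for the CoreNLP download, and Span/Triple are
# the containers assumed by get_openie_with_boundary.
os.environ.setdefault('STANFORD_HOME', os.path.expanduser('~/.stanfordnlp_resources'))
with StanfordOpenIE(threads=2) as openie:
    for triple in openie.annotate('Barack Obama was born in Hawaii.', remove_dup=True):
        # each field is a Span with .text plus character offsets .start / .end
        print(triple.subject.text, '|', triple.relation.text, '|', triple.object.text)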
class RelationExtractor: def __init__(self, corenlp_home, endpoint='http://localhost:9000', timeout=15000, memory='2G'): print('Set up Stanford CoreNLP Server.') if os.path.exists(corenlp_home): os.environ['CORENLP_HOME'] = corenlp_home else: raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), corenlp_home) self.client = CoreNLPClient(annotators=['depparse'], endpoint=endpoint, timeout=timeout, memory=memory) self.client.annotate('Prepare.') def extract(self, text): """ extract relations from text. params: text: string return: sentences: [sentence] --format-- sentence: { 'tokens': [word], 'relations': [(subject word index, (predicate word index), object word index)] } word: string word index: int """ ann = self.client.annotate(text) sentences = [] for sentence in ann.sentence: # extract relations from sentence relations = self._extract_by_subj_obj(sentence) + self._extract_by_nmod(sentence) # deal with "of" dependency such as "a group of people", replace "group" with "people" in relations relations = self._replace_by_of(sentence, relations) sentences.append({ 'tokens': [token.word for token in sentence.token], 'relations': relations }) return sentences def _extract_by_subj_obj(self, sentence): """ extract action/verb relations by "nsubj"/"acl" and "dobj"/"acl:relcl" dependency. return: relations: [(subject word index, (predicate word index), object word index)] """ edges = sentence.enhancedPlusPlusDependencies.edge pred2insts = {} for edge in edges: subj, pred, obj = None, None, None if edge.dep == 'nsubj': subj, pred = edge.target, edge.source elif edge.dep in ['acl', 'acl:relcl']: subj, pred = edge.source, edge.target elif edge.dep == 'dobj': pred, obj = edge.source, edge.target else: continue if pred not in pred2insts: pred2insts[pred] = { 'subjs': [], 'objs': [] } if subj is None: pred2insts[pred]['objs'].append(obj) else: pred2insts[pred]['subjs'].append(subj) relations = [] for pred in pred2insts: insts = pred2insts[pred] for subj in insts['subjs']: for obj in insts['objs']: relations.append((subj - 1, tuple([pred - 1]), obj - 1)) return relations def _extract_by_nmod(self, sentence): """ extract preposition/preposition phrase and spatial relations by "nmod" dependency. return: relations: [(subject word index, (predicate word index), object word index)] """ edges = sentence.enhancedPlusPlusDependencies.edge # case: to find preposition index in tokens # mew: to concatenate preposition phrase, such as "in front of" # acl/nsubj: to concatenate verb and preposition phrase, such as "park in front of" # and: to expand relations by parallel subjects/objects dep_idx = {dep: {} for dep in ['case', 'mwe', 'acl', 'nsubj', 'conj:and']} for edge in edges: if edge.dep not in dep_idx: continue source, target = edge.source, edge.target if edge.dep == 'acl': source, target = target, source if source not in dep_idx[edge.dep]: dep_idx[edge.dep][source] = [] dep_idx[edge.dep][source].append(target) # exclude relations with "of", "for"...... 
exclude = ['of', 'for'] relations = [] for edge in edges: if not edge.dep.startswith('nmod:'): continue nmod = edge.dep[5:] if nmod in exclude: continue # target should be noun if not sentence.token[edge.target - 1].pos.startswith('NN'): continue # find preposition indice if edge.target not in dep_idx['case']: continue pred = None for case in dep_idx['case'][edge.target]: word_idc = [case] word = sentence.token[case - 1].word if case in dep_idx['mwe']: word_idc.extend(dep_idx['mwe'][case]) word = '_'.join([sentence.token[idx - 1].word for idx in word_idc]) if word == nmod: pred = word_idc break if pred is None: continue if sentence.token[edge.source - 1].pos.startswith('NN'): # add preposition relations relations.append((edge.source, tuple(pred), edge.target)) else: # add preposition phrase relations for dep in ['acl', 'nsubj']: if edge.source in dep_idx[dep]: if edge.source + 1 == pred[0]: # concatenate verb and preposition phrase pred.insert(0, edge.source) relations.extend([(source, tuple(pred), edge.target) for source in dep_idx[dep][edge.source]]) break # expand relations expanded_relations = set() for relation in relations: subj, pred, obj = relation equal_insts = { 'subjs': [subj], 'objs': [obj] } for typ in equal_insts: inst = equal_insts[typ][0] if inst in dep_idx['conj:and']: equal_insts[typ].extend(dep_idx['conj:and'][inst]) pred = tuple([idx - 1 for idx in pred]) for subj in equal_insts['subjs']: for obj in equal_insts['objs']: expanded_relations.add((subj - 1, pred, obj - 1)) return list(expanded_relations)
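# Rough usage sketch for the dependency-based extractor above; the CoreNLP directory is a
# placeholder and must exist locally, otherwise __init__ raises FileNotFoundError.
extractor = RelationExtractor('/path/to/stanford-corenlp-full-2018-10-05')
for parsed in extractor.extract('A man in a red shirt parks a car in front of the house.'):
    tokens = parsed['tokens']
    for subj, pred, obj in parsed['relations']:
        print(tokens[subj], ' '.join(tokens[i] for i in pred), tokens[obj])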
class StanfordOpenIE: def __init__(self, core_nlp_version: str = '2018-10-05'): self.remote_url = 'https://nlp.stanford.edu/software/stanford-corenlp-full-{}.zip'.format( core_nlp_version) self.install_dir = Path('~/.stanfordnlp_resources/').expanduser() self.install_dir.mkdir(exist_ok=True) if not (self.install_dir / Path( 'stanford-corenlp-full-{}'.format(core_nlp_version))).exists(): print('Downloading from %s.' % self.remote_url) output_filename = wget.download(self.remote_url, out=str(self.install_dir)) print('\nExtracting to %s.' % self.install_dir) zf = ZipFile(output_filename) zf.extractall(path=self.install_dir) zf.close() os.environ['CORENLP_HOME'] = str(self.install_dir / 'stanford-corenlp-full-2018-10-05') from stanfordnlp.server import CoreNLPClient self.client = CoreNLPClient(annotators=['openie'], memory='8G') def annotate(self, text: str, properties_key: str = None, properties: dict = None, simple_format: bool = True): """ :param (str | unicode) text: raw text for the CoreNLPServer to parse :param (str) properties_key: key into properties cache for the client :param (dict) properties: additional request properties (written on top of defaults) :param (bool) simple_format: whether to return the full format of CoreNLP or a simple dict. :return: Depending on simple_format: full or simpler format of triples <subject, relation, object>. """ # https://stanfordnlp.github.io/CoreNLP/openie.html core_nlp_output = self.client.annotate(text=text, annotators=['openie'], output_format='json', properties_key=properties_key, properties=properties) if simple_format: triples = [] for sentence in core_nlp_output['sentences']: for triple in sentence['openie']: triples.append({ 'subject': triple['subject'], 'relation': triple['relation'], 'object': triple['object'] }) return triples else: return core_nlp_output def generate_graphviz_graph(self, text: str, png_filename: str = './out/graph.png'): """ :param (str | unicode) text: raw text for the CoreNLPServer to parse :param (list | string) png_filename: list of annotators to use """ entity_relations = self.annotate(text, simple_format=True) """digraph G { # a -> b [ label="a to b" ]; # b -> c [ label="another label"]; }""" graph = list() graph.append('digraph {') for er in entity_relations: graph.append('"{}" -> "{}" [ label="{}" ];'.format( er['subject'], er['object'], er['relation'])) graph.append('}') output_dir = os.path.join('.', os.path.dirname(png_filename)) if not os.path.exists(output_dir): os.makedirs(output_dir) out_dot = os.path.join(tempfile.gettempdir(), 'graph.dot') with open(out_dot, 'w') as output_file: output_file.writelines(graph) command = 'dot -Tpng {} -o {}'.format(out_dot, png_filename) dot_process = Popen(command, stdout=stderr, shell=True) dot_process.wait() assert not dot_process.returncode, 'ERROR: Call to dot exited with a non-zero code status.' def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): pass def __del__(self): self.client.stop() del os.environ['CORENLP_HOME']
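# Minimal sketch of the simpler OpenIE wrapper above; generate_graphviz_graph additionally
# needs the Graphviz `dot` binary on PATH. Illustrative only.
with StanfordOpenIE() as openie:
    text = 'Barack Obama was born in Hawaii. He was elected president in 2008.'
    for triple in openie.annotate(text, simple_format=True):
        print(triple['subject'], '|', triple['relation'], '|', triple['object'])
    openie.generate_graphviz_graph(text, png_filename='./out/graph.png')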
class Evaluator: def __init__(self, tagged_dataset_path, database_path, corenlp_path): self.target_values_map = {} for filename in os.listdir(tagged_dataset_path): filename = os.path.join(tagged_dataset_path, filename) print(sys.stderr, 'Reading dataset from', filename) with open(filename, 'r', 'utf8') as fin: header = fin.readline().rstrip('\n').split('\t') for line in fin: stuff = dict(zip(header, line.rstrip('\n').split('\t'))) ex_id = stuff['id'] original_strings = tsv_unescape_list(stuff['targetValue']) canon_strings = tsv_unescape_list(stuff['targetCanon']) self.target_values_map[ex_id] = to_value_list( original_strings, canon_strings) os.environ['CORENLP_HOME'] = corenlp_path self.client = CoreNLPClient(annotators="ner".split()) self.db_path = database_path def evaluate(self, predictions): num_examples, num_correct = 0, 0 num_fail = 0 for pred in predictions: table_id = pred['table_id'] #### find the exact db file db_file = self.db_path + table_id + '.db' table_file = self.db_path + "../json/" + table_id + ".json" with open(table_file, "r") as f: table_json = json.load(f) connection = sqlite3.connect(db_file) c = connection.cursor() results = pred['result'] for result in results: ex_id = result['id'] sql = result['sql'] try: sql = requests.get("http://localhost:3000/", json={ "sql": sql, "is_list": table_json["is_list"] }).json() c.execute(sql) answer_list = list() nlp_list = list() for result, in c: result = str(result) ann = self.client.annotate(result) if len(ann.mentions) == 0: nlp_list.append(result) elif len(ann.mentions) > 1: #print('corenlp annotation wrong!', ann.mentions) nlp_list.append(result) else: nlp_list.append(ann.mentions[0].normalizedNER) answer_list.append(result) predicted_values = to_value_list(answer_list) except Exception as e: # print("Evaluation failure", e) num_fail += 1 predicted_values = list() if ex_id not in self.target_values_map: print('WARNING: Example ID "%s" not found' % ex_id) else: target_values = self.target_values_map[ex_id] correct = check_denotation(target_values, predicted_values) num_examples += 1 if correct: num_correct += 1 # acc = (num_correct + 1e-9) / (num_examples + 1e-9) #print("Failed:", num_fail, "out of", num_examples) return num_correct
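# Hedged sketch of how the evaluator above might be driven; every path, the table id and
# the SQL string are placeholders, and the SQL-normalisation service on localhost:3000 plus
# the sqlite databases must already be available.
evaluator = Evaluator(tagged_dataset_path='data/tagged/',
                      database_path='data/database/',
                      corenlp_path=os.path.expanduser('~/stanford-corenlp-full-2018-10-05'))
predictions = [{'table_id': 'example_table',
                'result': [{'id': 'example-0', 'sql': 'SELECT col0 FROM table LIMIT 1'}]}]
print('correct denotations:', evaluator.evaluate(predictions))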
print(text1)
cwd = os.getcwd()
version = 'stanford-corenlp-full-2018-10-05'
corenlp_path = re.findall(r'\S*/marta-v2', cwd)[0] + '/04_utils/' + version
os.environ["CORENLP_HOME"] = corenlp_path
corenlpclient_UD1 = CoreNLPClient(
    properties={'ssplit.isOneSentence': True},
    annotators=['tokenize', 'ssplit', 'pos', 'parse', 'depparse', 'udfeats'],
    memory='2G',
    be_quiet=False,
    max_char_length=100000,
    output_format='conllu')
_UD1_Auto = corenlpclient_UD1.annotate(text1)
# annotators = ['tokenize', 'ssplit', 'pos', 'parse', 'depparse', 'udfeats']
# _UD1_Auto = _UD1_Auto['sentences'][1]['basicDependencies']  # extract only basic dependencies
print(_UD1_Auto)
corenlpclient_UD1.stop()
print(
    convert_const2dep(
        LANG,
        dataset,
        filename='',
        readpath='/02_modelbuilding/02_output/input_temp.parser',
        writepath='/02_modelbuilding/02_output/output_temp.parser',
        format_='UD1',
        usage='experiments'))
class Preprocessor(object): """ The preprocessor wraps a corpus object (usually a `HTMLCorpusReader`) and manages the stateful tokenization and part of speech tagging into a directory that is stored in a format that can be read by the `HTMLPickledCorpusReader`. This format is more compact and necessarily removes a variety of fields from the document that are stored in the JSON representation dumped from the Mongo database. This format however is more easily accessed for common parsing activity. """ def __init__(self, corpus, target=None, **kwargs): """ The corpus is the `HTMLCorpusReader` to preprocess and pickle. The target is the directory on disk to output the pickled corpus to. """ self.corpus = corpus self.target = target self.tagger = pos_tagger('spacy') # Modification for dibutade if model == 'stanford': os.environ[ 'CORENLP_HOME'] = 'C:/Users/alain/OneDrive/Ateliers Dibutade/NLP/stanford-corenlp-full-2018-10-05' self.pos_tagger = CoreNLPClient(properties='french', annotators=[ 'pos', ], timeout=30000, memory='1G') elif model == 'spacy': self.nlp = spacy.load('fr_core_news_sm') def fileids(self, fileids=None, categories=None): """ Helper function access the fileids of the corpus """ fileids = self.corpus.resolve(fileids, categories) if fileids: return fileids return self.corpus.fileids() def abspath(self, fileid): """ Returns the absolute path to the target fileid from the corpus fileid. """ # Find the directory, relative from the corpus root. parent = os.path.relpath(os.path.dirname(self.corpus.abspath(fileid)), self.corpus.root) # Compute the name parts to reconstruct basename = os.path.basename(fileid) name, ext = os.path.splitext(basename) # Create the pickle file extension basename = name + '.pickle' # Return the path to the file relative to the target. return os.path.normpath(os.path.join(self.target, parent, basename)) def tokenize(self, fileid): """ Segments, tokenizes, and tags a document in the corpus. Returns a generator of paragraphs, which are lists of sentences, which in turn are lists of part of speech tagged words. """ for paragraph in self.corpus.paras(fileids=fileid): if model == 'original': for sent in sent_tokenize(paragraph): print(sent) print(wordpunct_tokenize(sent)) print(pos_tag(wordpunct_tokenize(sent))) key = input('Continue') yield [ pos_tag(wordpunct_tokenize(sent)) for sent in sent_tokenize(paragraph) ] elif model == 'stanford': # Modification for the CORE NLP package ann = self.pos_tagger.annotate(paragraph) for sentence in ann.sentence: #print(sentence) #for token in sentence.token : # print((token.word, token.pos)) yield [[(token.word, token.pos) for token in sentence.token]] elif model == 'spacy': yield [[(token.text, token.pos_) for token in self.nlp(sent)] for sent in sent_tokenize(paragraph)] else: # Default - still to test for sent in sent_tokenize(paragraph): yield self.tagger.pos_tag(sent) def process(self, fileid): """ For a single file does the following preprocessing work: 1. Checks the location on disk to make sure no errors occur. 2. Gets all paragraphs for the given text. 3. Segments the paragraphs with the sent_tokenizer 4. Tokenizes the sentences with the wordpunct_tokenizer 5. Tags the sentences using the default pos_tagger 6. Writes the document as a pickle to the target location. This method is called multiple times from the transform runner. """ # Compute the outpath to write the file to. 
target = self.abspath(fileid) parent = os.path.dirname(target) # Make sure the directory exists if not os.path.exists(parent): os.makedirs(parent) # Make sure that the parent is a directory and not a file if not os.path.isdir(parent): raise ValueError( "Please supply a directory to write preprocessed data to.") # Create a data structure for the pickle document = list(self.tokenize(fileid)) # Open and serialize the pickle to disk with open(target, 'wb') as f: pickle.dump(document, f, pickle.HIGHEST_PROTOCOL) # Clean up the document del document # Return the target fileid return target def transform(self, fileids=None, categories=None): """ Transform the wrapped corpus, writing out the segmented, tokenized, and part of speech tagged corpus as a pickle to the target directory. This method will also directly copy files that are in the corpus.root directory that are not matched by the corpus.fileids(). """ # Make the target directory if it doesn't already exist if not os.path.exists(self.target): os.makedirs(self.target) # Resolve the fileids to start processing and return the list of # target file ids to pass to downstream transformers. return [ self.process(fileid) for fileid in self.fileids(fileids, categories) ]
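# End-to-end usage sketch for the Preprocessor above; HTMLCorpusReader and the module-level
# `model` switch ('original' | 'stanford' | 'spacy') come from the surrounding project and
# are assumed here, and the corpus paths are placeholders.
corpus = HTMLCorpusReader('corpus/raw/')
preprocessor = Preprocessor(corpus, target='corpus/tagged/')
for written in preprocessor.transform():
    print('wrote', written)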
def __populate_Parses(lang, parsejson, new_parsedict): """ """ # start CoreNLP servers for UD1 from stanfordnlp.server import CoreNLPClient cwd = os.getcwd() version = 'stanford-corenlp-full-2018-10-05' corenlp_path = re.findall(r'\S*/marta-v2', cwd)[0] + '/04_utils/' + version os.environ["CORENLP_HOME"] = corenlp_path if lang == 'en': lang = {} # i.e. CoreNLP defaults to English model corenlpclient_UD1 = CoreNLPClient(properties={ 'ssplit.isOneSentence': True, 'tokenize.whitespace': True }, annotators=[ 'tokenize', 'ssplit', 'pos', 'parse', 'depparse', 'udfeats' ], memory='2G', be_quiet=True, max_char_length=100000, output_format='conllu') # parse annotator is necessary to obtain udfeats (for postags) if lang == 'fr': lang = 'french' corenlpclient_UD1 = CoreNLPClient( properties=lang, annotators=[ 'tokenize', 'ssplit', 'pos', 'parse', 'depparse', 'udfeats' ], memory='2G', be_quiet=True, max_char_length=100000, output_format='conllu' ) # note that udfeats (for postags) currently works for english only https://stanfordnlp.github.io/CoreNLP/udfeats.html if lang == 'zh': lang = 'chinese' corenlpclient_UD1 = CoreNLPClient(properties=lang, annotators=[ 'tokenize', 'ssplit', 'pos', 'parse', 'depparse', 'udfeats' ], memory='2G', be_quiet=True, max_char_length=100000, output_format='conllu') # note that udfeats (for postags) currently works for english only https://stanfordnlp.github.io/CoreNLP/udfeats.html # begin processing for DocID in parsejson: print('Now processing: ', dataset, DocID) sentence_offset = 0 # this is the 4th element in a TokenList # obtain the gold constituency parses for the document. ConstTrees = __obtain_ConstTrees_Gold( DocID, readpath='./03_data/{}/{}tbRoot/{}/', lang=LANG) for sentence in parsejson[DocID]['sentences']: # 1. create a ParsePDTB object __parsepdtb = ParsePDTB( lang=LANG, docid=DocID, sentid=sentence_offset, gold_consttree=ConstTrees[sentence_offset], pdtb_version=PDTB_VERSION) # 2. add to .RawText and .Words __parsepdtb.RawText = " ".join( [word[0] for word in sentence['words']]) __parsepdtb.Words = sentence['words'] # 3. add to ConstTree_Auto. generate parse if missing if sentence['parsetree'] == '(())\n': _parse = a2_parsers._parse_rawtext2consttree( LANG, __parsepdtb.RawText, tokenized=True) __parsepdtb.ConstTree_Auto = _parse else: __parsepdtb.ConstTree_Auto = sentence['parsetree'] # 3. write to temp file, for converting to SD/UD1 in next steps with open('./02_modelbuilding/02_output/input_temp.parser', 'w+') as f: f.write(__parsepdtb.ConstTree_Gold) # 4. 
convert constituency parse to gold UD 1.0 and add to DepTree_UD1_Gold a2_parsers.convert_const2dep( LANG, dataset, filename='', readpath='/02_modelbuilding/02_output/input_temp.parser', writepath='/02_modelbuilding/02_output/output_temp.parser', format_='UD1', usage='experiments') with open('./02_modelbuilding/02_output/output_temp.parser', 'r') as f: UD1_Gold_conllu = f.read() def __conllu2tuple(conllu_doc): """helper function to convert CoNLL format into 3-tuple used by CoNLL 2016 organisers to store dependency parses """ to_list = conllu_doc.split('\n') tokenlist = [ i.split('\t')[1] + '-' + i.split('\t')[0] for i in to_list if i != '' ] # convert CoNLL line to <wordform>-<token num> tokenlist.insert(0, 'ROOT-0') # add a root token to the start deptree_gold = [ [ i.split('\t')[7], tokenlist[int(i.split('\t')[6])], i.split('\t')[1] + '-' + i.split('\t')[0] ] for i in to_list if i != '' ] # convert to CoNLL 2016 dependencies format return deptree_gold __parsepdtb.DepTree_UD1_Gold = __conllu2tuple(UD1_Gold_conllu) # 5. automatically generate UD 1.0 constituency parse (from raw text), place into same 3-tuple format as CoNLL 2016 Shared Task,and add to DepTree_UD1_Auto UD1_Auto_conllu = corenlpclient_UD1.annotate( __parsepdtb.RawText) __parsepdtb.DepTree_UD1_Auto = __conllu2tuple(UD1_Auto_conllu) # 6. add PTB-style and UD pos tags to .Words. Each of the variable below contain a list comprising 2-tuples. each tuple is (<wordform>, <part of speech>) globals()['pos_PTBGold'] = [ i for i in ParentedTree.fromstring( __parsepdtb.ConstTree_Gold).pos() if i[-1] != '-NONE-' ] # gold PTB parses have traces and these causes misalignment with the surface form. we drop these since parsers don't predict traces (Johannsen & Søgaard, 2013) globals()['pos_PTBAuto'] = ParentedTree.fromstring( __parsepdtb.ConstTree_Auto).pos() globals()['pos_UDGold'] = [(i.split('\t')[1], i.split('\t')[3]) for i in UD1_Gold_conllu.split('\n') if i != ''] globals()['pos_UDAuto'] = [(i.split('\t')[1], i.split('\t')[3]) for i in UD1_Auto_conllu.split('\n') if i != ''] for postagset in ['PTBGold', 'PTBAuto', 'UDGold', 'UDAuto']: try: _tagset = globals()['pos_' + postagset] assert len(_tagset) == len(__parsepdtb.Words) for idx in range(len(__parsepdtb.Words)): # add the part of speech as a new key in the dictionary for the token in .Words __parsepdtb.Words[idx][1].update( {'PartOfSpeech_' + postagset: _tagset[idx][1]}) except AssertionError as e: e.args += ( postagset.upper() + " is not of the same size as the .Words attribute for this sentence.", ) print(e) print("Continuing to attempt alignment of tokens.") _words = [i[0] for i in __parsepdtb.Words] _words_maxidx = len(_words) - 1 #'drop' the additional tokens in _tagset _tagset = [i for i in _tagset if i[0] in _words] _words_curridx = -1 # start with -1 for idx in range(len(_tagset)): _words_curridx += 1 while __parsepdtb.Words[_words_curridx][ 0] != _tagset[idx][ 0] and _words_curridx < _words_maxidx: __parsepdtb.Words[_words_curridx][1].update( { 'PartOfSpeech_' + postagset: 'ParserError' } ) # place a marker identifying the missing pos tag as an error from parsing _words_curridx += 1 __parsepdtb.Words[_words_curridx][1].update( {'PartOfSpeech_' + postagset: _tagset[idx][1]}) continue # raise sentence_offset += 1 # increase sentence offset before moving to handle next sentence try: new_parsedict[DocID].append(__parsepdtb) except: new_parsedict[DocID] = [__parsepdtb] # shut down the CoreNLP servers corenlpclient_UD1.stop()
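# Self-contained illustration (added here, not in the original) of the CoNLL-U to
# CoNLL-2016 triple conversion performed by the nested __conllu2tuple helper above,
# restated as a standalone function on a hand-written two-token fragment.
def conllu_to_triples(conllu_doc):
    rows = [line.split('\t') for line in conllu_doc.split('\n') if line]
    tokens = ['ROOT-0'] + ['{}-{}'.format(r[1], r[0]) for r in rows]
    return [[r[7], tokens[int(r[6])], '{}-{}'.format(r[1], r[0])] for r in rows]

print(conllu_to_triples('1\tIt\tit\tPRON\tPRP\t_\t2\tnsubj\t_\t_\n'
                        '2\tworks\twork\tVERB\tVBZ\t_\t0\troot\t_\t_\n'))
# -> [['nsubj', 'works-2', 'It-1'], ['root', 'ROOT-0', 'works-2']]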
class JobSegmenter(object): """ default_heading_idx(int): level associated with default header headers are usually h1 -> h6 therefore any integer > 6 may be ok. verbose(bool): well... use_tags(bool): whether to tag data (with paragraphs, title, names) interventions(bool): switch to intervention segmentation i.e. map paragraphs to intervention in ctm """ def __init__(self, default_heading_idx=DEFAULT_HEADING_IDX, verbose=False, use_tags=False, interventions=False, sentence_slices=False, tfidf=False, sentence_min_length=1): # corenlp_port=9000): self.corenlp_client = None self.ses = None self.default_heading_idx = default_heading_idx self.n_headings = 6 self.paragraph_min_word_length = 2 self.verbose = verbose self.use_tags = use_tags self.interventions = interventions self.sentence_slices = sentence_slices self.tfidf = tfidf self.sentence_min_length = sentence_min_length # self.corenlp_port = corenlp_port # if self.interventions: # raise ValueError("Intervention mode not implemented") def __enter__(self): return self def __exit__(self, *args, **kwargs): if self.corenlp_client: self.corenlp_client.stop() def make_sentence_similarity(self, word2vec_path=DEFAULT_WORD2VEC_PATH, n_vectors=DEFAULT_N_VECTOR): if self.tfidf: from exp_tfidf_lsa_kmeans.tfidf import scoring self.tfidf_scoring = scoring else: from word2vec_fr.sentence_similarity import SentenceEmbeddingSimilarity self.ses = SentenceEmbeddingSimilarity(word2vec_path, n_vectors) def make_corenlp_client(self, annotators=["tokenize", "ssplit"], endpoint="http://localhost:9000", properties_name="french", properties_dict=None, quiet=True): LEGACY_PROPERTIES = {} FRENCH_PROPERTIES = { "tokenize.language": "French", "tokenize.options": "ptb3Dashes=true" } PROPERTIES = {"legacy": LEGACY_PROPERTIES, "french": FRENCH_PROPERTIES} if properties_dict is not None: properties = properties_dict else: if properties_name in PROPERTIES.keys(): properties = PROPERTIES[properties_name] else: raise ValueError("Unknow properties '%s'" % properties_name) devnull = open(os.devnull) stdout = devnull if quiet else sys.stdout stderr = devnull if quiet else sys.stderr self.corenlp_client = \ CoreNLPClient(annotators=annotators, endpoint=endpoint, stdout=stdout, stderr=stderr, memory="8G", heapsize="8G", threads=8, timeout=15000, properties=properties ) def heading_idx(self, style_name): if style_name is None: return self.default_heading_idx if style_name.lower().startswith( "heading") or style_name.lower().startswith("titre"): try: idx = int(style_name[-1]) - 1 return idx except ValueError as e: return self.default_heading_idx else: return self.default_heading_idx def is_toc(self, style_name): if style_name is None: return False style_name = style_name.lower() return (style_name.lower().startswith("contents") or style_name.lower().startswith("toc") or style_name.lower().startswith("en-t") or style_name.lower().startswith("tm")) def is_name(self, style_name): if style_name is None: return False style_name = style_name.lower() return (style_name.startswith('nom') or style_name.startswith('intervenant')) def section(self, level=None, content=[], parent=None): if level is None: level = self.self.default_heading_idx kwargs = locals() self = kwargs.pop("self") parent = kwargs.pop('parent') d = dict(kwargs) d["childs"] = [] if parent is not None: d["parent"] = parent parent["childs"].append(d) return d def annotate(self, text): """ Args: text(string) Returns: annotation object """ if self.corenlp_client is None: raise ValueError("'self.corenlp_client' is None. 
" "Use 'make_corenlp_client' before calling " "'annotate'") while True: try: r = self.corenlp_client.annotate(text) break except (requests.exceptions.ConnectionError, corenlp_client.PermanentlyFailedException, urllib3.exceptions.MaxRetryError): print("too many requests, sleeping") time.sleep(0.75) return r def get_sentences(self, words, lower=True, no_minimum=False, with_scores=False, debug=False): # return [_.split() for _ in text.split(".")] return self.corenlp_get_sentences(words, lower=lower, no_minimum=no_minimum, with_scores=with_scores, debug=debug) def corenlp_get_sentences(self, words, lower=True, no_minimum=False, with_scores=False, debug=False): """ Args: sentences: list[list[word]] if not with_scores list[list[ [word; score]] otherwise Returns: sentences: list of sentences (list of word (string) list[list[str]] or words is [word, score] if with_scores """ _debug = debug def debug(*args, **kwargs): if _debug: print(*args, **kwargs) def maybe_lower(t): return t.lower() if lower else t if not with_scores: # note replacing spe quote only needed for french for aujourd'hui ann = self.annotate(" ".join(words).replace("’", "'")) sentences = [[maybe_lower(token.word) for token in sentence.token] for sentence in ann.sentence] sentences = [ s for s in sentences if no_minimum or len(s) >= self.sentence_min_length ] return sentences else: words, scores = zip(*[(w, s) for w, s in words]) debug(words) debug(scores) text = " ".join(words) ann = self.annotate(text.replace("’", "'")) count = 0 sentences = [] prev_word = "" prev_score = "" word_done = True wip = "" for sentence in ann.sentence: sentences.append([]) for token in sentence.token: debug("'%s' ~= '%s'" % (words[count], token.word)) wip += token.word score = scores[count] if not wip == words[count]: debug("Incomplete word, wip='%s'" % wip) else: wip = "" count += 1 # if not token.word == words[count]: # if word_done and words[count].startswith(token.word): # word_done = False # debug("Incomplete word: begining") # elif words[count].endswith(token.word): # debug("Incomplete word: end") # word_done = True # count += 1 # elif token.word in words[count]: # debug("Incomplete word: middle") # pass # else: # raise ValueError("mismatch '%s' and '%s'" % (words[count], token.word)) # else: # count += 1 sentences[-1].append([maybe_lower(token.word), score]) def to_old_style(s): return " ".join([w[0] for w, _ in s]).replace("' ", "'").split() sentences = [ s for s in sentences if no_minimum or len(to_old_style(s)) >= self.sentence_min_length ] assert count == len(scores) == len(words) return sentences def flatten_document(self, document, implicit_nom=False, exclude_toc=True): """ Args: document(docx.Document) Returns: sections(list[section]) with: section: list[sentence] sentence: list[word(str)] finally sections is list[list[list[word(str)]]] NOTE: now word is actually [word, score] """ implicit_nom_file = open('implicit_nom.lst_', 'a') unique_noms = set() cur_section = [] sections = [cur_section] cur_lvl = -1 style_error_p = [] cur_txt_len = 0 last_tag = TAGLESS # for p in document.paragraphs: for elmt in docx_iter(document): tag = TAGLESS if is_p_elmt(elmt): self.log("p elmt, style=%s" % elmt.style) try: style = elmt.style if exclude_toc and self.is_toc(style): continue lvl = self.heading_idx(style) if lvl == self.default_heading_idx: tag = TAG_P else: tag = TAG_H if self.is_name(style): tag = TAG_NAME except AttributeError as e: style_error_p += [elmt] lvl = self.default_heading_idx tag = TAG_P raise e else: if is_tbl_elmt(elmt): tag = 
TAG_TABLE elif is_row_elmt(elmt): tag = TAG_ROW lvl = self.default_heading_idx self.log("lvl: %d" % lvl) # sentences = list[list[str]] # words = list[str] words = elmt2txt(elmt).split() self.log("'%s'\n" % [_.lower() for _ in words[:150]]) no_minimum = True sentences = self.get_sentences(words, no_minimum=no_minimum, with_scores=False) if implicit_nom: assert self.interventions def _add_implicit_nom(nom): __nom = " ".join(nom) if not __nom in unique_noms: print(__nom, file=implicit_nom_file) unique_noms.add(__nom) if len(sentences) > 0: s = " ".join(sentences[0]) if "--" in sentences[0]: # <nom> -- intervention count = sentences[0].count("--") pos = sentences[0].index("--") if count == 1 and pos < 7 and len(sentences) > 1: nom = s.split("--")[0].split() intervention = " ".join(s.split("--")[1:]).split() _nom = ["<nom>"] + nom + ["</nom>"] _intervention = ["<%s>" % tag] + intervention sentences[-1].append("</%s>" % tag) _sentences = [_nom, _intervention] + sentences[1:] cur_section = _sentences sections.append(cur_section) _add_implicit_nom(nom) continue elif len(sentences[0]) < 5 and any([ s.lower().startswith(_) for _ in ["monsieur", "madame", "m.", "mme.", "mr."] ]): # <monsieur|madame|..> <nom> \n text # print("monsieur|madame detected") nom = sentences[0] _nom = ["<nom>"] + nom + ["</nom>"] _sentences = [_nom] if len(sentences) > 1: sentences[1] = ["<%s>" % tag] + sentences[1] sentences[-1] = sentences[-1] + ["</%s>" % tag] _sentences += sentences[1:] cur_section = _sentences sections.append(cur_section) _add_implicit_nom(nom) continue words = flatten_list(sentences) cur_txt_len += len(words) if self.use_tags: if not len(sentences) > 0: sentences = [[]] # continue sentences[0] = ["<%s>" % tag] + sentences[0] sentences[-1] = sentences[-1] + ["</%s>" % tag] if "".join(words) == "e-customer": words = ["e", "-", "customer"] if len(words) == 0: continue if len(words) < self.paragraph_min_word_length: if not tag in [TAG_NAME, TAG_H]: continue if self.interventions: if tag in [TAG_NAME, TAG_H] and last_tag != TAG_H: cur_section = sentences sections += [cur_section] else: cur_section += sentences else: # sections mode: if lvl <= cur_lvl: # new section if cur_txt_len > 0: cur_section = sentences sections += [cur_section] cur_txt_len = 0 cur_lvl = lvl else: # appending content to current section cur_section += sentences if lvl < self.n_headings: # only updates the level in case of header cur_lvl = lvl last_tag = tag return sections def log(self, *args, **kwargs): if self.verbose: print(*args, **kwargs) def process_docx(self, docx_path, implicit_nom=False, verbose=False): """ Args: docx_path(str) Returns: sentences(list[str]) slices(list[slice]) """ document = docx.Document(docx_path) # structure = self.get_docx_structure(document) # flat_structure = self.flatten_section(structure) sections = self.flatten_document(document, implicit_nom=implicit_nom) sentences = flatten_list(sections) if self.sentence_slices and not self.interventions: slices = [slice(i, i + 1) for i in range(len(sentences))] else: slices = [] lower = 0 for section in sections: if len(section) == 0: continue upper = lower + len(section) slices += [slice(lower, upper)] lower = upper return sentences, slices def process_ctm(self, ctm_paths, get_scores=False, debug=False): """ Args: ctm_paths(list[string]) Returns: ctm_slices(list[slice]) ctm_sentences(list[string]) """ paroles = [] cur_sentence = "" ctm_sentences = [] # 1. 
CTM ->> List of paroles for ctm_path in ctm_paths: with open(ctm_path, 'rb') as f_ctm: for line in f_ctm: try: word, score = line.decode('utf-8').split("\t")[4:6] score = float(score) except UnicodeDecodeError as e: print("UnicodeDecodeError on file '%s'" % ctm_path) raise e if word.startswith("<start="): if len(paroles) == 0 or len(paroles[-1]) > 0: paroles.append([]) continue paroles[-1].append([word, score]) # 2. List of paroles ->> List of lists of sentences (and ranges) if self.sentence_slices: assert not get_scores, "Not implemented" ctm_sentences = [] for parole in paroles: sentences = [ _ for _ in self.get_sentences(parole, with_scores=True) if len(_) > 0 ] ctm_sentences += sentences parole_slices = [ slice(i, i + 1) for i in range(len(ctm_sentences)) ] else: ctm_sentences = [] parole_slices = [] lower = 0 for i_parole in range(len(paroles)): parole = paroles[i_parole] self.log("***") self.log(parole) ## ann = client.annotate(parole) # sentences = [[token.word.lower() # for token in sentence.token] # for sentence in ann.sentence] sentences = self.get_sentences(parole, with_scores=True, debug=debug) if len(sentences) == 0: continue self.log(sentences) ctm_sentences += sentences upper = lower + len(sentences) parole_slices.append(slice(lower, upper)) lower = upper return ctm_sentences, parole_slices
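# Possible end-to-end usage of JobSegmenter above; the .docx and .ctm paths are placeholders
# and a CoreNLP distribution (CORENLP_HOME) must be available for make_corenlp_client.
with JobSegmenter(use_tags=True, interventions=True) as segmenter:
    segmenter.make_corenlp_client(endpoint='http://localhost:9000', properties_name='french')
    doc_sentences, doc_slices = segmenter.process_docx('meeting.docx')
    ctm_sentences, ctm_slices = segmenter.process_ctm(['meeting.ctm'])
    print(len(doc_sentences), 'document sentences,', len(ctm_sentences), 'ASR sentences')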
from graph import Graph
import pickle

logging.getLogger("transformers.tokenization_utils").setLevel(logging.WARNING)

install_dir = Path('~/stanfordnlp_resources/').expanduser()
text = 'Barack Obama was born in Hawaii. He wrote this sentence.'
os.environ['CORENLP_HOME'] = str(install_dir / 'stanford-corenlp-full-2018-10-05')
properties = {}

from stanfordnlp.server import CoreNLPClient
client = CoreNLPClient(annotators=['openie'], memory='6G', properties=properties)
client.annotate(text="time pass", annotators=['openie'], output_format='json')

nlp = spacy.load('en')
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')
nlplem = spacy.load('en')

SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
ATTR_TO_SPECIAL_TOKEN = {
    'bos_token': '<bos>',
    'eos_token': '<eos>',
    'pad_token': '<pad>',
    'additional_special_tokens': ['<speaker1>', '<speaker2>']
}
MODEL_INPUTS = [
    "input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids"
# Start the background server and wait for some time
# Note that in practice this is totally optional, as by default the server will be started
# when the first annotation is performed
client.start()
import time; time.sleep(10)

!ps -o pid,cmd | grep java

from google.colab import drive
drive.mount('/content/gdrive')

with open('/content/gdrive/My Drive/Colab Notebooks/chapter1.txt', 'r') as file:
    data = file.read().replace('\n', '')
#data = "Such were some of various omens. Emperor Ling, greatly moved by these signs of the displeasure of Heaven, issued an edict asking his ministers for an explanation of the calamities and marvels."

document = client.annotate(data)
print(type(document))

# Iterate over all detected entity mentions
print("{:30s}\t{}".format("Mention", "Type"))
listofchar = []
for sent in document.sentence:
    for m in sent.mentions:
        #print(type(m.entityType))
        #print("{:30s}\t{}".format(m.entityMentionText, m.entityType))
        if m.entityType.startswith("P"):  # compare strings with startswith/==, not "is"
            if m.entityMentionText not in listofchar:
                listofchar.append(m.entityMentionText)

#!pip install pycorenlp
#from pycorenlp import StanfordCoreNLP
properties = {'ner.model': './stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz'}
'''

# In[ ]:

os.environ['CORENLP_HOME'] = './stanford-corenlp-full-2018-10-05'
properties = {
    'ner.model':
        './stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz,'
        './stanford-ner-2018-10-16/classifiers/english.muc.7class.distsim.crf.ser.gz,'
        './stanford-ner-2018-10-16/classifiers/english.conll.4class.distsim.crf.ser.gz'
}
client = CoreNLPClient(annotators=['tokenize', 'pos', 'lemma', 'ner'],
                       memory='8g', endpoint='http://localhost:9001')
doc = client.annotate(text)
for sent in doc.sentence:
    for m in sent.mentions:
        print(m.entityMentionText, '\t\t\t', m.entityType)

client.stop()  ## do not forget to stop the client

# In[ ]:

# In[ ]:

## nltk
nltk.download()  # d-punkt-q

st = StanfordNERTagger(
    'stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
    'stanford-ner-2018-10-16/stanford-ner.jar',
    encoding='utf-8')
class Tokenizer: def __init__(self) -> None: os.environ[ 'CORENLP_HOME'] = '{}/stanford-corenlp-full-2018-10-05'.format( os.environ['HOME']) self.client = CoreNLPClient() self.client.ensure_alive() self.do_lower_case = '-cased' not in config.bert_model self.basic_tokenizer: BasicTokenizer \ = BertTokenizer.from_pretrained(config.bert_model, do_lower_case=self.do_lower_case).basic_tokenizer def __del__(self) -> None: for p in glob.glob('corenlp_server-*.props'): if os.path.isfile(p): os.remove(p) def tokenize(self, doc: str) -> List[Sentence]: splitter_annotation \ = self.client.annotate(doc, annotators=['ssplit'], properties={'tokenize.options': 'ptb3Escaping=false,invertible=true'}) end = 0 sentences = [] for sentence in splitter_annotation.sentence: begin = doc.index(sentence.token[0].originalText, end) for token in sentence.token: end = doc.index(token.originalText, end) + len( token.originalText) text = doc[begin:end] sentences.append(Sentence(text, begin, end)) sentences = self.fix_split(sentences) for sentence in sentences: text = sentence.text if self.do_lower_case: text = text.lower() bert_tokens = self.basic_tokenizer.tokenize(text) end = 0 tokens = [] for bert_token in bert_tokens: word = bert_token begin = text.index(word, end) end = begin + len(word) tokens.append( Token(word, sentence.begin + begin, sentence.begin + end)) assert len(tokens) > 0 sentence.tokens = tokens return sentences @staticmethod def fix_split(sentences: List[Sentence]) -> List[Sentence]: result = [] i = 0 while i < len(sentences): sentence = sentences[i] while True: next_sentence = sentences[ i + 1] if i < len(sentences) - 1 else None if '\n\n' in sentence.text: index = sentence.text.index('\n\n') new_sentence = Sentence(sentence.text[:index], sentence.begin, sentence.begin + index) result.append(new_sentence) index += re.search(r'[\n\t ]+', sentence.text[index:]).end() sentence.text = sentence.text[index:] sentence.begin += index elif next_sentence is not None and next_sentence.begin == sentence.end: sentence.text += next_sentence.text sentence.end = next_sentence.end i += 1 else: result.append(sentence) break i += 1 return result
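# Small demonstration (not from the original) of what the fix_split staticmethod above does
# with a blank-line break; Sentence is assumed to be the same mutable (text, begin, end)
# container used by tokenize().
raw = 'First paragraph ends here.\n\nSecond paragraph starts.'
for span in Tokenizer.fix_split([Sentence(raw, 0, len(raw))]):
    print(repr(span.text), span.begin, span.end)
# the '\n\n' boundary yields two Sentence spans with offsets adjusted into the original text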
class pos_tagger(): """ Class to impement part of speech tagging (pos tagging) """ def __init__(self, tagger='spacy', language='french'): self.tagger = tagger self.tagmodule = None self.tagset = UTagSet # TAG Set by default self.language = language spacy_module = { 'french': 'fr_core_news_sm', 'english': 'en_core_web_sm' } if tagger == 'spacy': self.tagger = self.spacy_pos_tag self.tagset = UDTagSet try: self.tagmodule = spacy.load(spacy_module[language]) except: logger.warning( 'Module for language [{:s}] not installed for Spacy - using french by default' .format(language)) self.tagmodule = spacy.load(spacy_module['french']) elif tagger == 'stanford': self.tagger = self.stanford_pos_tag self.tagset = FTTagSet JAVAHOME = "C:/Program Files (x86)/Java/jre1.8.0_241/bin/java.exe" # Set a JAVAHOME environment variable if not present if not 'JAVAHOME' in os.environ: os.environ['JAVAHOME'] = JAVAHOME root_path = "./stanford-postagger/" # location of Stanford POS Tagger components # Launch the Stanford Pos Tagger (implemented in Java) self.tagmodule = StanfordPOSTagger( root_path + "models/" + language + ".tagger", root_path + "stanford-postagger.jar", encoding='utf8') elif tagger == 'core_nlp': self.tagger = self.corenlp_pos_tag os.environ['CORENLP_HOME'] = './stanford-corenlp-full-2018-10-05' try: self.tagmodule = CoreNLPClient(properties=language, annotators=[ 'pos', ], timeout=30000, memory='1G') except: logger.warning( 'Could not launch Stanford Core NLP for [{:s}]'.format( language)) elif tagger == 'nltk': self.tagger = self.nltk_pos_tag self.tagset = NLTKTagSet if language != 'english': logger.warning( 'nltk does not support [{:s}] language'.format(language)) else: logger.warning('POS tagger [{:s}] unknown'.format(tagger)) def pos_tag(self, sentence): assert (self.tagger) return self.tagger(sentence) def spacy_pos_tag(self, sentence): assert (self.tagmodule) return [(token.text, token.pos_) for token in self.tagmodule(sentence)] def stanford_pos_tag(self, sentence): assert (self.tagmodule) return self.tagmodule.tag(nltk.word_tokenize(sentence)) def corenlp_pos_tag(self, sentence): # Unchecked # DOes not seem to work assert (self.tagmodule) ann = self.tagmodule.annotate(sentence) return [(token.word, token.pos) for token in ann.sentence[0].token] def nltk_pos_tag(self, sentence): return nltk.pos_tag(nltk.word_tokenize(sentence)) def tag_label(self, tag): return self.tagset.get(tag, '??')
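# Brief usage sketch for the pos_tagger wrapper above; backends other than 'spacy' need
# their external components (Stanford jars, a CoreNLP install, nltk data), and UDTagSet is
# assumed to be the tag-description mapping defined in the surrounding module.
tagger = pos_tagger(tagger='spacy', language='french')
tagged = tagger.pos_tag('Le chat dort sur le canapé.')
print(tagged)                                   # [('Le', 'DET'), ('chat', 'NOUN'), ...]
print([tagger.tag_label(tag) for _, tag in tagged])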