class StanfordParser(object):

    def __init__(self, nlp=None, annots=None, props=None):
        if annots is None:
            annots = "tokenize pos lemma depparse"
        if nlp is None:
            self.nlp_client = CoreNLPClient(annotators=annots,
                                            output_format='json')
        else:
            self.nlp_client = nlp
        if props is not None:
            self.nlp_client.default_properties.update(props)
        _ = self.nlp_client.annotate("Let's get this party started!")
        del _

    def get_parse(self, sentence):
        return self.nlp_client.annotate(sentence)

    def get_deps(self, sentence, deptype='basicDependencies', ret='asis'):
        if isinstance(sentence, str):
            sentence = self.get_parse(sentence)['sentences'][0]
        deps = sentence[deptype]
        if ret == 'asis':
            retval = deps
        else:
            retval = {}
            retval['deps'] = {x['dep']: x['dependent'] for x in deps}
            retval['heads'] = {
                x['dependentGloss']: x['governorGloss'] for x in deps
            }
            retval['governors'] = {
                x['dependent']: x['governorGloss'] for x in deps
            }
            retval['dependents'] = {
                x['dependent']: x['dependentGloss'] for x in deps
            }
            retval['text'] = [
                "{}({}-{}, {}-{})".format(x['dep'], x['governorGloss'],
                                          x['governor'], x['dependentGloss'],
                                          x['dependent']) for x in deps
            ]
        return retval
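
# Usage sketch for StanfordParser (added for illustration, not part of the
# original snippet). Assumes a local CoreNLP server is available and that
# CoreNLPClient comes from the usual Python bindings (e.g. stanza.server);
# the sentence is arbitrary example input.
def _demo_stanford_parser():
    parser = StanfordParser()
    # Any value other than 'asis' returns the restructured dependency dict.
    deps = parser.get_deps("The quick brown fox jumps over the lazy dog.",
                           ret='dict')
    for line in deps['text']:
        print(line)  # e.g. "det(fox-4, The-1)"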
class StanfordService:

    def __init__(self, parser_path: str):
        os.environ['JAVANLP_HOME'] = parser_path
        print('starting CoreNLP server with JAVANLP_HOME {}'.format(
            parser_path))
        self.nlp = CoreNLPClient(annotators="tokenize ssplit".split(),
                                 timeout=1000000)

    def tokenize(self, text: str) -> List[Token]:
        # Retry a few times in case the server is still starting up.
        for _ in range(10):
            try:
                annotated_result = self.nlp.annotate(text)
                stanford_document = StanfordDocument.from_proto(
                    annotated_result)
                return StanfordService.idiomatic_tokens(stanford_document)
            except Exception:
                print('exception while annotating result')
                sleep(10)

    @staticmethod
    def idiomatic_tokens(doc: StanfordDocument):
        stanford_tokens = [
            token for sentence in doc.sentences for token in sentence.tokens
        ]
        return [
            StanfordService.idiomatic_token(token, index)
            for index, token in enumerate(stanford_tokens)
        ]

    @staticmethod
    def idiomatic_token(token: StanfordToken, token_index: int) -> Token:
        return Token(token.originalText, token_index,
                     token.characterOffsetBegin)
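
# Usage sketch for StanfordService (added for illustration, not part of the
# original snippet). Assumes the CoreNLP distribution lives under the given
# parser_path and that Token / StanfordDocument / StanfordToken are the
# project's own wrapper types; the path and sentence below are placeholders.
def _demo_stanford_service():
    service = StanfordService(parser_path='/opt/corenlp')  # hypothetical path
    tokens = service.tokenize("Barack Obama was born in Hawaii.")
    for token in tokens:
        print(token)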
class StanfordCoreferenceResolution:
    """
    Stanford CoreNLP co-reference resolution.

    Parameters
    ----------
    timeout : int
        The timeout for the parser. Defaults to 30000.
    memory : str
        The memory allocation. Defaults to '6G'.
    """

    def __init__(self, timeout=30000, memory='6G'):
        self.detok = TreebankWordDetokenizer()
        self.client = CoreNLPClient(
            annotators=['tokenize', 'ssplit', 'dcoref'],
            output_format='json',
            timeout=timeout,
            memory=memory)

    def resolve(self, doc, raise_errors=True):
        """
        Resolve the co-references for a single document.

        Parameters
        ----------
        doc : str
            A document whose co-references will be resolved.
        raise_errors : bool, optional
            Whether to raise errors.
            Defaults to True.

        Returns
        -------
        resolved_doc : str or None
            A document whose co-references have been resolved.
            If there was a problem and `raise_errors=False`,
            then `None` will be returned.
        """
        try:
            parsed = self.client.annotate(doc)
        except Exception as error:
            if raise_errors:
                raise error
            return
        return self.replace_coreferences(parsed)

    def resolve_all(self, docs, raise_errors=True):
        """
        Resolve co-references for all the documents.

        Parameters
        ----------
        docs : list of str
            A list of documents.
        raise_errors : bool, optional
            Whether to raise errors.
            Defaults to True.

        Returns
        -------
        resolved_docs : list of str
            A list of documents, with co-references resolved.
        """
        resolved_docs = []
        for doc in tqdm(docs):
            resolved_docs.append(self.resolve(doc, raise_errors))
        return resolved_docs

    @staticmethod
    def restructure_coreference_dict(corefs_dict):
        """
        Given a dictionary of co-references, restructure it into a new
        dictionary where the keys are sentence numbers and the values
        are lists of references that need to be resolved.

        Parameters
        ----------
        corefs_dict : dict
            A co-reference dictionary, output from Stanford.
        """
        corefs_list = [
            corefs_dict[key] for key in corefs_dict
            if len(corefs_dict[key]) > 1 and any(
                not co['isRepresentativeMention'] for co in corefs_dict[key])
        ]
        corefs_dict = defaultdict(list)
        for i, coref in enumerate(corefs_list):
            # get the first representative mention from the list;
            # if there are no representative mentions, continue
            represent = [
                co['text'] for co in coref if co['isRepresentativeMention']
            ]
            if len(represent) >= 1:
                represent = represent[0]
            else:
                continue
            # loop through the (non-representative) mentions,
            # add to the dictionary list for that sentence
            for co in coref:
                if not co['isRepresentativeMention']:
                    mention = {
                        'represent': represent,
                        'text': co['text'],
                        'startIndex': co['startIndex'],
                        'endIndex': co['endIndex'],
                        'sentNum': co['sentNum']
                    }
                    corefs_dict[co['sentNum']].append(mention)
        return corefs_dict

    def replace_coreferences(self, parsed):
        """
        Replace all the references with their representative mention.

        Parameters
        ----------
        parsed : dict
            The full output from Stanford, with co-references and sentences.
        """
        corefs = parsed['corefs']
        sents = parsed['sentences']
        corefs_dict = self.restructure_coreference_dict(corefs)
        sents = [[s['word'] for s in sent['tokens']] for sent in sents]
        sents_new = []
        # we do this on a sentence-by-sentence basis
        for sent_i, sent in enumerate(sents, start=1):
            sent_new = []
            # we check to see if the sentence is in the co-reference
            # dictionary; if it's not, we won't need to do anything
            if sent_i in corefs_dict:
                last_end = 0
                # we loop through the (sorted) references and add them
                # to our new sentence list one-by-one, being careful to
                # capture any preceding or ending text
                sorted_sent = sorted(corefs_dict[sent_i],
                                     key=lambda x: x['startIndex'])
                for co_i, co in enumerate(sorted_sent):
                    start = co['startIndex'] - 1
                    end = co['endIndex'] - 1
                    represent = co['represent']
                    # here we want to check whether this is the first
                    # co-reference; if it is, then we need to get any
                    # text *before* it
                    if co_i == 0:
                        sent_new.extend(sent[:start])
                        sent_new.append(represent[0].upper() + represent[1:]
                                        if start == 0 else represent)
                    # otherwise, we just get the co-reference and anything
                    # between it and the end of the previous co-reference
                    else:
                        sent_new.extend(sent[last_end:start])
                        sent_new.append(represent)
                    last_end = end
                sent_new.extend(sent[last_end:])
            else:
                sent_new = sent
            sents_new.append(sent_new)
        # we need to detokenize the sentences; basically this handles
        # putting punctuation and parenthesis symbols back together
        sents = ' '.join([
            self.detok.detokenize(sent, convert_parentheses=True)
            for sent in sents_new
        ])
        return sents
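
# Usage sketch for StanfordCoreferenceResolution (added for illustration, not
# part of the original snippet). Assumes CoreNLP with the dcoref annotator is
# available; the exact resolved wording depends on the coreference model.
def _demo_coreference_resolution():
    resolver = StanfordCoreferenceResolution()
    doc = "John went home early because he was tired."
    print(resolver.resolve(doc, raise_errors=False))
    # Expected shape of the output:
    # "John went home early because John was tired."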
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0,
                out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk.tokenize as nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from corenlp import CoreNLPClient
        interface = CoreNLPClient(annotators="tokenize ssplit".split())
    else:
        raise Exception()
    """
    if not args.split:
        sent_tokenize = lambda para: [para]
    """
    source_path = in_path or os.path.join(
        args.source_dir, "{}-{}v1.1.json".format(data_type, args.suffix))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    na = []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = \
        Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            while True:
                try:
                    temp = interface.annotate(context)
                    break
                except Exception as e:
                    time.sleep(0.2)
            context_s = []
            for sent in temp.sentence:
                sent = [word.originalText for word in sent.token]
                # manual workaround for \xa0, a character that CoreNLP
                # cannot handle
                for wi in range(len(sent)):
                    if "\xa0" in sent[wi]:
                        sent = sent[:wi] + sent[wi].split("\xa0") + sent[wi + 1:]
                        wi = 0
                context_s.append(sent)
            xi = context_s
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                while True:
                    try:
                        temp = interface.annotate(qa['question']).sentence[0]
                        break
                    except Exception as e:
                        time.sleep(0.2)
                # print(temp.token[0])
                # exit(-1)
                qi = [t_s.originalText for t_s in temp.token]
                # leftover debugging; left commented out because it would
                # abort preprocessing after the first question:
                # print(qi)
                # exit(-1)
                qi = process_tokens(qi)
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start,
                                             answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1] - 1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1] - 1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)
                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])
                if len(qa['answers']) == 0:
                    yi.append([(0, 0), (0, 1)])
                    cyi.append([0, 1])
                    na.append(True)
                else:
                    na.append(False)

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)
        if args.debug:
            break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q,
        'cq': cq,
        'y': y,
        '*x': rx,
        '*cx': rcx,
        'cy': cy,
        'idxs': idxs,
        'ids': ids,
        'answerss': answerss,
        '*p': rx,
        'na': na
    }
    shared = {
        'x': x,
        'cx': cx,
        'p': p,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
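
# Usage sketch for prepro_each (added for illustration, not part of the
# original snippet). Only the args fields read directly inside prepro_each
# are shown; get_word2vec and save almost certainly require additional fields
# (e.g. GloVe paths and a target directory), so treat this as a placeholder
# configuration.
def _demo_prepro_each():
    from argparse import Namespace
    args = Namespace(
        tokenizer='Stanford',
        split=False,
        source_dir='data/squad',  # hypothetical location of train-v1.1.json
        suffix='',
        debug=True)
    prepro_each(args, 'train', out_name='train')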
class Featurizer:

    def __init__(self, annotators=None, properties=None):
        with open(os.path.join(os.path.dirname(__file__), "assets",
                               "regexes.json")) as f:
            self.regexer = RegexFeaturizer(json.load(f))
        self.annotators = annotators or CORENLP_ANNOTATORS
        self.properties = properties or CORENLP_PROPERTIES
        self.client = CoreNLPClient(self.annotators,
                                    properties=self.properties,
                                    endpoint="http://localhost:9012")

    def __enter__(self):
        self.client.__enter__()
        return self

    def __exit__(self, *args):
        self.client.__exit__(*args)

    def _apply_features(self, obj, ann=None):
        """
        Adds features to a graph.
        """
        if "features" not in obj:
            obj["features"] = {}

        if ann:
            assert len(ann.sentence) == 1
            sentence = ann.sentence[0]
            assert len(sentence.token) == len(obj["tokens"])

            obj["features"]["lemma"] = [t.lemma for t in sentence.token]
            obj["features"]["pos"] = [t.pos for t in sentence.token]
            obj["features"]["ner"] = [t.ner for t in sentence.token]
            obj["features"]["depparse"] = _dep_to_list(
                sentence.enhancedPlusPlusDependencies)
            assert len({tail for _, tail, _ in obj["features"]["depparse"]
                        }) == len(sentence.token)

            (child_to_head, head_to_child, path_length, next_in_path,
             distance_to_next_token,
             distance_from_prev_token) = compute_dependency_paths(obj)
            obj["features"]["dep_child_to_head"] = child_to_head
            obj["features"]["dep_head_to_child"] = head_to_child
            obj["features"]["dep_path_lengths"] = path_length
            obj["features"]["dep_traceback"] = next_in_path
            obj["features"]["dep_dist_to_next"] = distance_to_next_token
            obj["features"]["dep_dist_from_prev"] = distance_from_prev_token

        if self.regexer:
            obj["features"]["regexes"] = self.regexer.featurize(obj["tokens"])
            obj["features"]["typed_values"] = \
                self.regexer.featurize_unit_spans(obj["tokens"])

    def featurize_graph(self, obj):
        ann = self.client.annotate(" ".join(obj["tokens"]))
        self._apply_features(obj, ann)
        return obj

    def featurize_text(self, text):
        ann = self.client.annotate(text)
        assert len(ann.sentence) == 1
        sentence = ann.sentence[0]
        obj = {
            "tokens": [t.word for t in sentence.token],
        }
        self._apply_features(obj, ann)
        return obj
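
# Usage sketch for Featurizer (added for illustration, not part of the
# original snippet). Assumes a CoreNLP server can be started for
# localhost:9012 and that assets/regexes.json ships next to this module;
# the sentence is an arbitrary example.
def _demo_featurizer():
    with Featurizer() as featurizer:
        obj = featurizer.featurize_text("The sample was heated to 80 degrees.")
        print(obj["features"]["pos"])
        print(obj["features"]["regexes"])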