import torch
import spacy
import benepar
import gensim
from spacy.symbols import ORTH
from sentence_transformers import SentenceTransformer

SPECIAL_TOKENS = {"pad": "<pad>", "oov": "<oov>", "sos": "<sos>", "eos": "<eos>"}
SPECIAL_TOKEN2ID = {"<pad>": 0, "<oov>": 1, "<sos>": 2, "<eos>": 3}

# spaCy
NLP = spacy.load("en")
# prevent the tokenizer from splitting the special tokens
for special_token in SPECIAL_TOKENS.values():
    NLP.tokenizer.add_special_case(special_token, [{ORTH: special_token}])

# benepar
PARSER = benepar.Parser("benepar_en2")

# GloVe (GLOVE_BIN_PATH points to the word2vec-format binary)
GLOVE = gensim.models.KeyedVectors.load_word2vec_format(GLOVE_BIN_PATH, binary=True)

# SBERT
SBERT_MODEL = SentenceTransformer('bert-base-nli-mean-tokens')

# device
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# env
EXP_PLATFORM = "others"  # set to "venus" or any other string; only used when running experiments on the Venus platform

print("Finished loading constants ...")
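# A minimal sketch of what the special cases above buy us: without add_special_case,
# spaCy's default English tokenizer would split "<sos>" into "<", "sos", ">".
#
#   [t.text for t in NLP("<sos> hello world <eos>")]
#   # -> ['<sos>', 'hello', 'world', '<eos>']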
import os
import nltk
import benepar


def extract_spans_recur(root, offset, chunks, labels):
    """Recursively compute the token width of `root`, appending (start, end) offsets
    of subtrees whose label is in `labels` to `chunks`."""
    width = 0
    for child in root:
        if isinstance(child, nltk.Tree):
            sub_width = extract_spans_recur(child, offset + width, chunks, labels)
            width += sub_width
        else:
            width += 1
    if root.label() in labels:
        assert width == len(root.leaves())
        chunks.append((offset, offset + width))
    return width


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

print('Loading model...')
parser = benepar.Parser("benepar_zh")

# pre-tokenized Chinese test sentence; roughly: "Andy Lau occasionally plays Honor of Kings,
# a game developed by Tencent. In everyday life he often chats with friends over WeChat."
line = "刘德华 偶尔 玩 下 由 腾讯 开发 的 王者荣耀 这 款 游戏 。 平时 , 经常 会 跟 小伙伴 通过 微信 聊聊天 。"
line = line.strip().split()
tree = parser.parse(line)
print(str(tree) + '\n========')

chunks = []
labels = ['NP', ]
line_len = extract_spans_recur(tree, 0, chunks, labels)
assert len(line) == line_len
for st, ed in chunks:
    print('\t' + ' '.join(line[st:ed]))
# tree_str = ' '.join(str(tree).split())
# print(tree_str)
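# A quick sanity check of extract_spans_recur on a tiny hand-built English tree
# (a minimal sketch; the bracketing below is made up, nltk.Tree.fromstring is standard NLTK):
#
#   toy = nltk.Tree.fromstring("(S (NP (PRP I)) (VP (VBP like) (NP (NN tea))))")
#   toy_chunks = []
#   assert extract_spans_recur(toy, 0, toy_chunks, ['NP']) == 3
#   assert toy_chunks == [(0, 1), (2, 3)]   # spans covering "I" and "tea"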
def find_sentences_of_interest(train):
    """
    train: the file name from sys.argv[1]
    Return: a list of sentences of interest, where each element has the structure
    (sentence tree (an nltk.tree.Tree), dict mapping entity text to NER tag, heuristic score).
    For example, the first element of the list returned for set1/a1.txt is
    (Tree('S', [Tree('NP', [Tree('DT', ['the']), Tree('NNP', ['old']), Tree('NNP', ['kingdom'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['period'])]), Tree('PP', [Tree('PP', [Tree('IN', ['in']), Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['third']), Tree('NN', ['millennium'])])]), Tree('PRN', [Tree('-LRB-', ['-LRB-']), Tree('NP', [Tree('.', ['c.']), Tree('CD', ['2686-2181']), Tree('NNP', ['bc'])]), Tree('-RRB-', ['-RRB-'])])]), Tree('VP', [Tree('ADVP', [Tree('RB', ['also'])]), Tree('VBN', ['known']), Tree('PP', [Tree('IN', ['as']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ["'age"])]), Tree('PP', [Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['the']), Tree('NNS', ['pyramids'])])]), Tree("''", ["'"])])]), Tree('CC', ['or']), Tree('NP', [Tree('NN', ["'age"])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['pyramid']), Tree('NNS', ['builders'])])]), Tree("''", ["'"])])]), Tree('SBAR', [Tree('IN', ['as']), Tree('S', [Tree('NP', [Tree('PRP', ['it'])]), Tree('VP', [Tree('VBZ', ['includes']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['great']), Tree('JJ', ['4th']), Tree('NNP', ['dynasty'])]), Tree('SBAR', [Tree('WHADVP', [Tree('WRB', ['when'])]), Tree('S', [Tree('S', [Tree('NP', [Tree('NNP', ['king']), Tree('NNP', ['sneferu'])]), Tree('VP', [Tree('VBD', ['perfected']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['art'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('NN', ['pyramid']), Tree('NN', ['building'])])])])])]), Tree('CC', ['and']), Tree('S', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NNS', ['pyramids'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('NNP', ['giza'])])])]), Tree('VP', [Tree('VBD', ['were']), Tree('VP', [Tree('VBN', ['constructed']), Tree('PP', [Tree('IN', ['under']), Tree('NP', [Tree('DT', ['the']), Tree('NNS', ['kings']), Tree('NNP', ['khufu']), Tree(',', [',']), Tree('NNP', ['khafre']), Tree(',', [',']), Tree('CC', ['and']), Tree('NNP', ['menkaure'])])])])])])])])])])])])])])]), Tree('.', ['.'])]), {'4th': 'ORDINAL', '2686-2181': 'DATE'}, 2)
    " ".join(sent.leaves()) gives the whole sentence string, where "sent" is the tree structure shown above.
    """
    # ----------------------------------------read docs----------------------------------------------------------------
    # d is a dict where the key is the title and the value is a list of sentences
    d = dict()
    s = ""
    with open(train, encoding='utf-8', mode='r') as _f:
        for i, line in enumerate(_f):
            if i == 0:
                title = line.strip().lower()
            # drop content after "see also" / "references"
            elif line.strip().lower() in set(["see also", 'references']):
                break
            else:
                s += line.strip().lower()
    d[title] = sent_tokenize(s)

    # ----------------------------------------parse tree----------------------------------------------------------------
    # parse trees: select sentences with an NP-VP structure
    candidate = []
    # parser = stanford.StanfordParser(model_path="C:/Users/geyiyang/OneDrive/CMU/2019 Spring/NLP/team project/QAProject/englishPCFG.ser.gz", encoding='utf8')
    parser = benepar.Parser("benepar_en2")
    # sentences = parser.raw_parse_sents(('Hello,My name is completely Melro.', 'Are you ok?'))
    # pdb.set_trace()
    for v in d.values():
        # sentences = parser.raw_parse_sents(v)
        sentences = parser.parse_sents(v)
        for sentence in sentences:
            if sentence.label() == "S":
                for i in range(len(sentence) - 1):
                    if sentence[i].label() == "NP" and sentence[i + 1].label() == "VP":
                        candidate.append(sentence)  # save this NP-VP sentence as a tree structure
                        break

    # ----------------------------------------TF-IDF----------------------------------------------------------------
    NOUN_TAGS = set(["NN", "NNS", "NNP", "NNPS"])  # noun POS tags to extract
    # freq_dict = []
    # t = []
    # for sent in candidate:  # sent is a tree
    #     for word, tag in sent.pos():  # POS
    #         if tag in NOUN_TAGS:
    #             t.append(word)
    # none_len = len(t)
    # freq_dict = Counter(t)
    # dev_data = ['set1', 'set2', 'set3', 'set4', 'set5']
    # return a tf_idf dict, word: score
    # pdb.set_trace()
    # tf_idf = computeTFIDF(none_len, freq_dict, dev_data)
    # js = json.dumps(tf_idf)
    # with open("tfidf.json", 'w') as f:
    #     f.write(js)
    with open("repo/QAProject/tfidf.json") as f:
        tf_idf = json.load(f)
    scores = []  # a list with one tf-idf score per sentence
    for sent in candidate:  # sent is a tree
        score = 0
        for word, tag in sent.pos():  # POS
            if tag in NOUN_TAGS:
                score += tf_idf[word]
        scores.append(score)

    # ----------------------------------------NER tag----------------------------------------------------------------
    # NER = set(["PERSON","NORP","FAC","ORG","GPE","LOC","PRODUCT","EVENT",
    #            "WORK_OF_ART","LAW","LANGUAGE","DATE","TIME","PERCENT","MONEY","QUANTITY","ORDINAL","CARDINAL"])
    # compute NER for all candidate sentences
    NER = {"PERSON", "ORG", "DATE", "GPE"}
    # pdb.set_trace()
    # heuristic weights for tf-idf and NER
    alpha, beta = 1, 1
    candidate2 = []
    selected = []
    nlp = spacy.load("en_core_web_sm")
    for i, sent in enumerate(candidate):
        sent_str = " ".join(sent.leaves())
        x = nlp(sent_str)
        # pprint([(X.text, X.label_) for X in x.ents])
        for X in x.ents:
            label = X.label_
            if label in NER:  # contains an NER tag that we want
                # each sentence is stored as a triplet (sentence tree, NER tag dict, score)
                candidate2.append([sent, dict([(X.text, X.label_) for X in x.ents]), len(x.ents)])
                selected.append(i)
                break
    ner_scores = np.array([c[2] for c in candidate2])
    ner_scores = np.exp(ner_scores) / sum(np.exp(ner_scores))
    tfidf_scores = [scores[i] for i in selected]
    tfidf_scores = np.exp(tfidf_scores) / sum(np.exp(tfidf_scores))
    for i in range(len(candidate2)):
        candidate2[i][2] = alpha * ner_scores[i] + beta * tfidf_scores[i]
    candidate2.sort(key=lambda x: x[2])
    return candidate2
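# A minimal usage sketch ("set1/a1.txt" mirrors the path mentioned in the docstring;
# any document in the same title-then-body format should work):
#
#   ranked = find_sentences_of_interest("set1/a1.txt")
#   sent_tree, ner_tags, score = ranked[-1]   # sort() above is ascending, so the highest-scoring triplet is last
#   sentence = " ".join(sent_tree.leaves())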
output_size = 1
dropout = 0
print_every = 1000


def extract_vp(parent):
    """Return the node itself (if labelled VP) or its first direct VP child as a
    space-joined string; return "-" if neither exists."""
    if parent.label() == "VP":
        return " ".join(parent.leaves())
    for node in parent:
        if isinstance(node, nltk.Tree):
            if node.label() == "VP":
                return " ".join(node.leaves())
    return "-"


parser = benepar.Parser("benepar_en2")
args = Arguments()
model = load_model(args)

engine = db_connect()
create_table(engine)
Session = sessionmaker(bind=engine)
session = Session()

methods = session.query(Method).all()
for method in methods:
    description = method.description
    sentences = nltk.sent_tokenize(description)
    if len(sentences) > 0:
def main():
    input_conll_file = sys.argv[1]
    benepar.download('benepar_en2')
    parser = benepar.Parser("benepar_en2")
    add_predconst(input_conll_file, parser)
import benepar
import os

fr_parser = benepar.Parser("benepar_fr")
en_parser = benepar.Parser("benepar_en2")


def getFredaoutput(tree):
    # flatten a parse tree into an unlabeled bracketing over its tokens (unary chains are collapsed)
    # print(tree.pretty_print())
    if type(tree) == str:
        return " " + tree + " "
    elif len(tree) == 1:
        return getFredaoutput(tree[0])
    res = ""
    for t in tree:
        res += getFredaoutput(t)
    res = "(" + res + ")"
    res = res.replace("(", " ( ")
    res = res.replace(")", " ) ")
    res = ' '.join(res.split())
    return res


def tellDiff(s1, s2):
    if len(s1) != len(s2):
        print("len!!!====")
        print(len(s1))
        print(len(s2))
        return False
    for i in range(len(s1)):
import sys
import benepar

# In Python 2, wrap sys.stdin and sys.stdout to work with unicode.
if sys.version_info[0] < 3:
    import codecs
    import locale
    encoding = locale.getpreferredencoding()
    sys.stdin = codecs.getreader(encoding)(sys.stdin)
    sys.stdout = codecs.getwriter(encoding)(sys.stdout)
if sys.version_info.major == 3:
    raw_input = input

model = sys.argv[1]  # maybe "benepar_en"
parser = benepar.Parser(model)


def parse(tokens, tags):
    # feed pre-tokenized, pre-tagged input through the parser's internal batching API
    sentence = list(zip(tokens, tags))
    parse_raw, tags_raw, sentence = next(parser._batched_parsed_raw([(tokens, sentence)]))
    tree = parser._make_nltk_tree(sentence, tags_raw, *parse_raw)
    return tree


while True:
    tokens = raw_input()
    tags = raw_input()
    tokens = tokens.split(' ')
    tags = tags.split(' ')
    tree = parse(tokens, tags)
def get_true_false_questions(text, num_questions):
    """
    Get true/false questions for the specified text
    Args:
        • text: text for which to create questions
        • num_questions: number of questions to create
    Output:
        • question_answers_list: list of questions, where each entry is the question + answers for that question
    """
    # load GPT-2 (for generating false sentences) and BERT (for scoring the similarity of each real
    # sentence against its fake sentences)
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")  # we'll use GPT-2 to generate sentences

    # load BERT model
    model_BERT = SentenceTransformer('bert-base-nli-mean-tokens')  # we'll use BERT to filter sentences based on similarity

    # load necessary NLP tools + parser
    nltk.download("punkt")
    nlp = spacy.load("en")
    benepar.download("benepar_en2")
    benepar_parser = benepar.Parser("benepar_en2")

    # clean + split text
    text = clean_text(text)
    cleaned_text = get_sentences(text)
    cleaned_text = [clean_text(x) for x in cleaned_text]

    # use parser to split sentences, remove last verb phrase or last noun phrase
    sentence_completion_dict = get_sentence_completions(cleaned_text)

    # get false sentences
    probability_true = 0.5      # probability that we'll add a True statement rather than a False statement
    num_fake_sentences = 3      # maximum number of fake sentences to create for each real partial sentence
    answer_choices = " (a) True (b) False"  # define our answer choices
    question_answers_list = []  # list to hold our questions and answers

    for key_sentence in sentence_completion_dict:
        # get our partial sentences
        partial_sentences_list = sentence_completion_dict[key_sentence]

        # start creating false sentences
        false_sentences = []
        print(f"The number of partial sentences that we have for the key sentence ({key_sentence}) is: {len(partial_sentences_list)}")

        # loop through list of partial sentences
        for sentence in partial_sentences_list:
            # create our false sentences
            false_sents = generate_sentences(sentence, key_sentence, num_fake_sentences)
            false_sentences.extend(false_sents)
        print(f"After the loop through the partial sentences, we have {len(false_sentences)} false sentences")

        for idx, false_sent in enumerate(false_sentences):
            # for each fake option, decide whether to use the real sentence or the fake one
            if np.random.uniform() <= probability_true:
                # return the actual (true) sentence
                question = f" (ANSWER: True) {key_sentence} : " + answer_choices + "\n"   # e.g., "(ANSWER: True) : 2 + 2 = 4"
            else:
                # return the false sentence
                question = f" (ANSWER: False) {false_sent} : " + answer_choices + "\n"    # e.g., "(ANSWER: False) : 2 + 2 = 5"

            # add question to question list
            question_answers_list.append(question)

        print(f"We have {len(question_answers_list)} questions in our list")
        if len(question_answers_list) >= num_questions:
            break

    # shuffle our questions
    random.shuffle(question_answers_list)

    # return the first "num_questions" values
    return question_answers_list[:num_questions]
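# A minimal usage sketch (the passage is made up; clean_text, get_sentences,
# get_sentence_completions and generate_sentences are assumed to be defined elsewhere in this module):
#
#   passage = "The Old Kingdom is the period of ancient Egypt in the third millennium BC."
#   for question in get_true_false_questions(passage, num_questions=3):
#       print(question)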
def benepar_setup():
    berkeley_parser = benepar.Parser("benepar_en2")
    return berkeley_parser
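# Usage sketch: the returned object is a plain benepar.Parser, so a pre-tokenized
# sentence can be parsed into an nltk Tree (the example sentence is made up):
#
#   parser = benepar_setup()
#   tree = parser.parse("The old kingdom built pyramids .".split())
#   print(tree)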
with open("{}.parsed.{}.{}".format(section, dep_type_ind, model_ind), "w") as f: for arc in dataset.arcs: f.write(' '.join(map(str, arc))) f.write('\n') print("finished {} {}".format(parser_ind, "heads")) with open("{}.parsed.{}.{}.labels".format(section, dep_type_ind, model_ind), "w") as f: for rel in dataset.rels: f.write(' '.join(map(str, rel))) f.write('\n') print("finished {} {}".format(parser_ind, "labels")) if parser_ind == 'benepar': sents = [line.split(' ') for line in lines] import benepar, nltk parser = benepar.Parser("benepar_en3") # nlp = spacy.load('en_core_web_md') # if spacy.__version__.startswith('2'): # nlp.add_pipe(benepar.BeneparComponent("benepar_en3")) # else: # nlp.add_pipe("benepar", config={"model": "benepar_en3"}) sents = [benepar.InputSentence(words=sent) for sent in sents] print(sents[0]) dts = parser.parse_sents(sents) results = [' '.join(str(item).split()) for item in dts] print(len(results)) # input = "{}.cdeps.{}".format(section, parser_ind) with open("{}.cdeps.{}".format(section, parser_ind), "w") as f: for item in results:
def __init__(self, model="benepar_en2"):
    self.parser = benepar.Parser(model)
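# Usage sketch, assuming this __init__ belongs to a thin wrapper class (the class name
# ParserWrapper below is hypothetical; only the __init__ is shown above):
#
#   wrapper = ParserWrapper()                      # defaults to the "benepar_en2" model
#   tree = wrapper.parser.parse("The cat sat on the mat .".split())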
def get_cloze_data(input_data, clause_extract=False, proc=None):
    if clause_extract:
        parser = benepar.Parser("benepar_en2")
    ner = spacy.load("en", disable=['parser', 'tagger'])
    tagger = spacy.load("en", disable=['parser', 'ner'])

    cloze_data = []
    q_count = 0
    c_count = 0
    for item in tqdm(input_data, desc="cloze"):
        entry = {}
        entry['title'] = item["document"][0]
        paragraph = {}
        paragraph["context"] = ' '.join(item["document"])
        qas = []
        for sent in item['summary']:
            sent_doc = ner(sent)
            if clause_extract:
                try:
                    clause = get_clause_v2(sent, parser)
                except Exception as e:
                    continue
            for ent in sent_doc.ents:
                answer = ent.text
                question = None
                if clause_extract:
                    for each in clause:
                        if each.find(answer) != -1:
                            question = each.replace(answer, entity_type_map[ent.label_], 1)
                            break
                else:
                    question = sent[:ent.start_char] + \
                        sent[ent.start_char:].replace(answer, entity_type_map[ent.label_], 1)
                if not question:
                    continue
                answer_start = get_answer_start(answer, question, item['document'], tagger)
                if answer_start == -1:
                    continue
                qas.append({
                    "question": question,
                    "id": "%s_%d" % (item['uid'], q_count),
                    "is_impossible": False,
                    "answers": [{
                        "answer_start": answer_start,
                        "text": answer,
                        "type": ent.label_
                    }],
                    "plausible_answers": []
                })
                q_count += 1
        paragraph['qas'] = qas
        entry['paragraphs'] = [paragraph]
        cloze_data.append(entry)
        # if q_count > 10:
        #     break
        c_count += 1
        if c_count % 2000 == 0:
            print(proc, 'processing %d/%d ...' % (c_count, len(input_data)))

    if proc is not None:
        json.dump(
            cloze_data,
            open(os.path.join(data_dir, 'tmp_store_%d.json' % proc), 'w', encoding='utf-8'))
    print('Questions Number', q_count)
    return {"version": "v2.0", 'data': cloze_data}
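# Shape of the expected input, inferred from the key accesses above (a minimal sketch;
# the uid and strings are made up):
#
#   example_input = [{
#       "uid": "doc-0001",
#       "document": ["Some Title", "First body sentence .", "Second body sentence ."],
#       "summary": ["A summary sentence mentioning Tencent ."],
#   }]
#   squad_style = get_cloze_data(example_input, clause_extract=False)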
import os, sys, json, time
import benepar

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

print('Loading model...')
parser = benepar.Parser("cn_roberta_aux")
# parser = benepar.Parser("/data2/lfsong/exp.parsing/servc.chinese/cn_roberta_aux")

print('Decoding...')
fout = open('test_pred.txt', 'w')
count = 0
st = time.time()
for line in open(sys.argv[1], 'r'):
    tree = parser.parse(line.strip().split())
    fout.write(str(tree) + '\n')
    count += 1
print('Decoding time for {} sentences: {}'.format(count, time.time() - st))
fout.close()
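# Usage sketch (the script name below is a placeholder): pass a file with one
# whitespace-tokenized sentence per line; predicted trees are written to test_pred.txt.
#
#   python decode_zh.py test.tok.txt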