from nltk.corpus import treebank
from nltk.parse.stanford import StanfordParser


def main(transform_func=None, n=10):
    parser = StanfordParser(
        path_to_jar="/cs/fs/home/hxiao/code/stanford-parser-full-2015-01-30/stanford-parser.jar",
        path_to_models_jar="/cs/fs/home/hxiao/code/stanford-parser-full-2015-01-30/stanford-parser-3.5.1-models.jar",
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

    test_sents = treebank.sents()[-n:]
    print("len(test_sents) = %d" % len(test_sents))

    if transform_func and callable(transform_func):
        print("transforming it using", transform_func)
        # apply the word-level transform to every token
        test_sents = [[transform_func(w) for w in s] for s in test_sents]
        print(test_sents[:10])

    print("predicting")
    pred_parses = parser.parse_sents(test_sents)
    # use the same slice as test_sents so gold and predicted trees line up
    gold_parses = treebank.parsed_sents()[-n:]

    print("evaluating")
    correct_n = gold_n = predicted_n = 0.0
    for gparse, pparses in zip(gold_parses, pred_parses):
        # parse_sents yields an iterator of candidate trees per sentence;
        # keep only the best-scoring parse
        pparse = next(iter(pparses))
        cn, gn, pn = precision_and_recall_stat(get_nodes_with_range(gparse),
                                               get_nodes_with_range(pparse))
        correct_n += cn
        gold_n += gn
        predicted_n += pn

    print("Precision: %f, Recall: %f" % (correct_n / predicted_n,
                                         correct_n / gold_n))
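# The two evaluation helpers used above are defined elsewhere in the original
# source. Below is a minimal sketch of what they plausibly compute, assuming a
# PARSEVAL-style labeled-constituent comparison (both function bodies are
# assumptions, not the original implementations):

from nltk.tree import Tree


def get_nodes_with_range(tree):
    """Collect (label, start, end) token spans for every constituent."""
    nodes = set()

    def walk(t, start):
        if isinstance(t, Tree):
            end = start
            for child in t:
                end = walk(child, end)
            nodes.add((t.label(), start, end))
            return end
        return start + 1  # a leaf covers exactly one token

    walk(tree, 0)
    return nodes


def precision_and_recall_stat(gold_nodes, pred_nodes):
    """Return (correct, gold, predicted) counts for labeled precision/recall."""
    return len(gold_nodes & pred_nodes), len(gold_nodes), len(pred_nodes)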
from nltk.parse.stanford import StanfordParser


def parseThisSents(sentences):
    # whitespace-tokenize each raw sentence
    tokenized = [s.split() for s in sentences]
    # my_path_to_models_jar3 / my_path_to_jar3 are jar paths defined at module level
    parser = StanfordParser(path_to_models_jar=my_path_to_models_jar3,
                            path_to_jar=my_path_to_jar3)
    parses = list(parser.parse_sents(tokenized))
    phrasesList = []
    for sentence_parses in parses:
        for tree in sentence_parses:
            # collect the surface text of every NP and VP constituent
            for s in tree.subtrees(lambda t: t.label() in ('NP', 'VP')):
                phrasesList.append(" ".join(s.leaves()))
    return phrasesList
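# Hypothetical usage of parseThisSents; the two jar paths are placeholders and
# must point at a local Stanford Parser download:
my_path_to_jar3 = "/opt/stanford-parser/stanford-parser.jar"
my_path_to_models_jar3 = "/opt/stanford-parser/stanford-parser-3.5.1-models.jar"

phrases = parseThisSents(["The quick brown fox jumps over the lazy dog ."])
print(phrases)  # NP/VP strings, e.g. "The quick brown fox", "jumps over the lazy dog"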
from nltk.parse.stanford import StanfordParser
from nltk.tree import Tree


class SentenceCompress:
    def __init__(self):
        self.parser = StanfordParser()

    def syntax_parse(self, sentences):
        """Parse a list of sentences, each given as a list of token strings."""
        self.parsed_sentences = self.parser.parse_sents(sentences[:10])  # only testing w/ first 10
        for sentence_parses in self.parsed_sentences:
            for t in sentence_parses:
                print(t)
                self.traverse_tree(t)

    def word_significance(self, w):
        # I_j(w_i) = tf_ij * idf_i          if w_i is a verb or common noun
        #            tf_ij * idf_i + omega  if w_i is a proper noun
        #            0                      otherwise
        pass

    def traverse_tree(self, tree):
        for node in tree:
            print(node)
            if isinstance(node, Tree):
                self.traverse_tree(node)
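# The word_significance stub above describes a score but never computes it.
# A minimal sketch of that formula, assuming tf/idf values and a Penn Treebank
# POS tag are supplied by the caller (the signature and the omega default are
# assumptions):

def word_significance(tf_ij, idf_i, pos_tag, omega=1.0):
    if pos_tag.startswith('NNP'):                            # proper noun (NNP, NNPS)
        return tf_ij * idf_i + omega
    if pos_tag.startswith('V') or pos_tag in ('NN', 'NNS'):  # verb or common noun
        return tf_ij * idf_i
    return 0.0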
import nltk
import numpy as np
import readability
from unidecode import unidecode
from nltk.parse.stanford import StanfordParser
from nltk.tag.stanford import StanfordPOSTagger


class FeatureMaker:
    _sentence_data = None
    _split_data = None
    _stf_pos_tagger = None
    _stf_parser = None
    _pos_list = []
    _neg_list = []

    def __init__(self, data):
        self._split_data = data
        self._sentence_data = [" ".join(line) for line in self._split_data]

    def _pos_tag_sent(self, sent):
        return nltk.pos_tag(sent)

    def _sf_pos_tag_sent(self, sent):
        return self._stf_pos_tagger.tag(sent)

    def prefix_suffix(self):
        prefix_2, prefix_3, suffix_2, suffix_3 = [], [], [], []
        for line in self._split_data:
            prefix_2.append([w[:2] for w in line])
            prefix_3.append([w[:3] for w in line])
            suffix_2.append([w[-2:] for w in line])
            suffix_3.append([w[-3:] for w in line])
        return [prefix_2, prefix_3, suffix_2, suffix_3]

    def fast_pos_tag(self):
        return [[token[1] for token in self._pos_tag_sent(line)]
                for line in self._split_data]

    def pos_tag(self):
        if self._stf_pos_tagger is None:
            self._stf_pos_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')
        # tag in chunks of 1000 sentences to keep each Java call manageable
        index = 0
        tag_result = []
        while index < len(self._split_data):
            tag_result.extend(self._stf_pos_tagger.tag_sents(self._split_data[index:index + 1000]))
            index += 1000
            print("pos:" + str(index), end=' ')
        return [[unidecode(p[1]) for p in line] for line in tag_result]

    def parser(self):
        if self._stf_parser is None:
            self._stf_parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
        result = self._stf_parser.parse_sents(self._split_data)
        # flatten the per-sentence iterators into a single list of trees
        result = sum([[parse for parse in dep_graphs] for dep_graphs in result], [])
        for i in result:
            print(i)

    def per_word_length(self):
        return [[len(w) for w in line] for line in self._split_data]

    def sentence_avg_word_length(self):
        return [np.mean(line) for line in self.per_word_length()]

    def sentence_length(self):
        return [len(line) for line in self._split_data]

    def sentence_length_mean_sd(self):
        return np.mean(self.sentence_length()), np.std(self.sentence_length())

    def load_sentiment_list(self):
        # lexicon files use ';' for comment lines
        if not self._pos_list:
            with open("./../pos_neg/positive-words.txt", mode='r') as f:
                for line in f.readlines():
                    line = line.strip()
                    if not line.startswith(";") and line:
                        self._pos_list.append(line)
        if not self._neg_list:
            with open("./../pos_neg/negative-words.txt", mode='r') as f:
                for line in f.readlines():
                    line = line.strip()
                    if not line.startswith(";") and line:
                        self._neg_list.append(line)
        return [self._pos_list, self._neg_list]

    def sentiment_sequence(self):
        # requires load_sentiment_list() to have been called first
        sentiment_data = []
        for line in self._split_data:
            sentiment_line = []
            for word in line:
                if word in self._pos_list:
                    sentiment_line.append("POS")
                elif word in self._neg_list:
                    sentiment_line.append("NEG")
                else:
                    sentiment_line.append("NON")
            sentiment_data.append(sentiment_line)
        return sentiment_data

    def get_read_measure(self):
        value_list = []
        for cat, data in readability.getmeasures(self._sentence_data, lang='en').items():
            print('%s:' % cat)
            for key, val in data.items():
                print((' %-20s %12.2f' % (key + ':', val)).rstrip('0 ').rstrip('.'))
                value_list.append(val)
        return value_list  # was `return val`, which dropped all but the last measure
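# Hypothetical usage of FeatureMaker with pre-tokenized sentences (the
# sentiment lexicon files must exist at the relative paths hard-coded above):
fm = FeatureMaker([["This", "movie", "was", "great"],
                   ["The", "plot", "was", "awful"]])
fm.load_sentiment_list()          # must run before sentiment_sequence()
print(fm.sentiment_sequence())    # e.g. [['NON', 'NON', 'NON', 'POS'], ...]
print(fm.sentence_avg_word_length())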
def parse_key_file(key_file):
    try:
        from nltk.parse.stanford import StanfordParser
        parser = StanfordParser(
            model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
            java_options='-Xmx8G')
        print("Starting to parse key_file!")
        print("This might take a while...")
        new_file = open(key_file + ".parsed", "w")
        with open(key_file) as f:
            tmp_sentence = [[]]  # a single-sentence batch for parse_sents
            tmp_conll_lines = []
            for line in f:
                if line.startswith("#begin"):
                    new_file.write(line)
                    continue
                elif len(line.strip()) == 0 or (line.startswith("#end") and len(tmp_conll_lines) > 0):
                    # end of sentence: parse the buffered tokens and emit CoNLL rows
                    parse = parser.parse_sents(tmp_sentence)
                    for tree in parse:
                        for tree_line in tree:  # tree_line is a Tree
                            parse_string = ' '.join(str(tree_line).split())
                            treecomp = parse_string.split()
                            currlowestindex = 0
                            token_index = 0
                            for idx, val in enumerate(treecomp):
                                if not val.startswith("("):
                                    # val is a leaf such as "tea)))": split it into the
                                    # POS tag (previous component) and the "parse bit"
                                    firstindexofbracket = val.index(")")
                                    lastindex = len(val) - 1
                                    tag_components = []
                                    pos_tag = treecomp[idx - 1].replace("(", "")
                                    if currlowestindex == idx - 1:
                                        if firstindexofbracket == lastindex:
                                            tag_components.append("*")
                                        else:
                                            tag_components.append("*" + val[firstindexofbracket:lastindex])
                                    else:
                                        # carry over the opening brackets seen since the last leaf
                                        for i in range(currlowestindex, idx - 1):
                                            tag_components.append(treecomp[i])
                                        if firstindexofbracket == lastindex:
                                            tag_components.append("*")
                                        else:
                                            tag_components.append("*" + val[firstindexofbracket:lastindex])
                                    currlowestindex = idx + 1
                                    new_file.write(
                                        '\t'.join(tmp_conll_lines[token_index].split()[0:4])
                                        + "\t" + pos_tag
                                        + "\t" + ''.join(tag_components)
                                        + '\t' + '\t'.join(tmp_conll_lines[token_index].split()[4:])
                                        + '\n')
                                    token_index += 1
                    tmp_sentence[0] = []
                    tmp_conll_lines = []
                    new_file.write("\n")
                elif not line.startswith("#"):
                    word = line.split()[3]  # column 4 holds the token
                    tmp_sentence[0].append(word)
                    tmp_conll_lines.append(line)
                if line.startswith("#end"):
                    new_file.write(line)
        new_file.close()
    except Exception:
        print("You need to set the CLASSPATH environment variable to point to the Stanford parser!")
        print("Example: export CLASSPATH=/path/to/stanford-parser-full-YYYY-MM-DD/stanford-parser.jar:/path/to/stanford-parser-full-YYYY-MM-DD/stanford-parser-X.X.X-models.jar")
        print("")
        raise
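# For reference, the inner loop above rewrites each bracketed parse into the
# per-token "parse bit" column of the CoNLL format. With a hypothetical
# sentence it produces:
#
#   parse:  (ROOT (S (NP (PRP I)) (VP (VBP like) (NP (NN tea))) (. .)))
#
#   token   POS   parse bit
#   I       PRP   (ROOT(S(NP*)
#   like    VBP   (VP*
#   tea     NN    (NP*))
#   .       .     *))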
import json
import os

import nltk
import torch
from torch.nn import Sigmoid
from nltk.tokenize import sent_tokenize
from nltk.tokenize.stanford import StanfordTokenizer
from nltk.parse.stanford import StanfordParser

# EXPERIMENT, TreePassage, Batch, variable_to_numpy, tokens_to_text and
# HAS_ANSWER_THRESHOLD are project-level definitions imported elsewhere.


def main(in_path, outpath):
    nltk.download('punkt')  # sent_tokenize needs the punkt sentence model
    span_extractor = torch.load(os.path.join(EXPERIMENT, 'best_span_extractor.tar'),
                                map_location='cpu')
    answer_verifier = torch.load(os.path.join(EXPERIMENT, 'best_answer_verifier.tar'),
                                 map_location='cpu')
    span_extractor.use_cuda = False
    answer_verifier.use_cuda = False

    # same tokenizer used by the lexical parser
    tokenizer = StanfordTokenizer(options={'ptb3Escaping': True})
    parser = StanfordParser(java_options='-mx5g')

    data = json.load(open(in_path, 'r'))['data']
    batches = []
    official_eval = {}
    official_eval_tokens = {}
    qaid_map = {}

    num_articles = len(data)
    for aidx in range(num_articles):
        article = data[aidx]
        print('\t- Article Count=%d/%d' % (aidx + 1, num_articles))
        for pidx, paragraph in enumerate(article['paragraphs']):
            passage, qas = paragraph['context'], paragraph['qas']
            passage = passage.replace(u'\xa0', ' ')  # strip non-breaking spaces
            sentences = sent_tokenize(passage)
            sentence_tokens = [tokenizer.tokenize(sentence) for sentence in sentences]
            # keep only the best-scoring parse for each sentence
            raw_trees = [list(s)[0]
                         for s in list(parser.parse_sents(sentence_tokens, verbose=True))]
            squad_tree = TreePassage(raw_trees)
            for qidx, qa in enumerate(qas):
                question_tokens = []
                for s in sent_tokenize(qa['question']):
                    question_tokens += tokenizer.tokenize(s)
                batches.append(Batch([{
                    'apid': 'apid',
                    'qa_id': qa['id'],
                    'context_squad_tree': squad_tree,
                    'question_tokens': question_tokens,
                    'answers': [],
                    'is_impossible': 0
                }], False))
                qaid_map[qa['id']] = paragraph['context']

    span_extractor.eval()
    answer_verifier.eval()
    for idx, batch in enumerate(batches):
        qa_id = batch.qa_id[0]
        node_scores, expected_f1s, global_answer_score = span_extractor(batch, eval_system=True)
        score_confidence, predicted_node_idxs = node_scores.max(dim=1)
        score_confidence = variable_to_numpy(score_confidence, False)
        predicted_node_idxs = variable_to_numpy(predicted_node_idxs, False)

        # answer score = predicted probability that the question has an answer
        answer_score = answer_verifier(batch, predicted_node_idxs=predicted_node_idxs,
                                       eval_system=True)
        answer_proba = variable_to_numpy(Sigmoid()(answer_score), False)
        global_answer_proba = variable_to_numpy(Sigmoid()(global_answer_score), False)
        # blend the three answerability signals with fixed weights
        has_answer_proba = (0.3 * score_confidence + 0.4 * global_answer_proba
                            + 0.3 * answer_proba)[0]

        predicted_span = batch.trees[0].span(predicted_node_idxs[0])
        predicted_has_answer = has_answer_proba >= HAS_ANSWER_THRESHOLD
        predicted_text = tokens_to_text(predicted_span, qaid_map[qa_id])
        official_eval[qa_id] = predicted_text if predicted_has_answer else ''
        official_eval_tokens[qa_id] = ' '.join(predicted_span) if predicted_has_answer else ''

    json.dump(official_eval, open(outpath, 'w'))
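# variable_to_numpy is a project helper not shown here. A minimal sketch of
# what it plausibly does, assuming it just moves a torch tensor off the GPU
# when needed and converts it to numpy (the body is an assumption, not the
# original implementation):

def variable_to_numpy(tensor, use_cuda):
    if use_cuda:
        tensor = tensor.cpu()
    return tensor.detach().numpy()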