def pcfg(train_idx=None, smoothing=None):
    """
    productions = []
    item = treebank._fileids[0]
    print("ITEM\n\n", item, "\n\n")
    for tree in treebank.parsed_sents(item)[:3]:
        # perform optional tree transformations, e.g.:
        tree.collapse_unary(collapsePOS=False)
        tree.chomsky_normal_form(horzMarkov=2)
        productions += tree.productions()
    """
    if train_idx is None:
        train_idx = (len(treebank.fileids()) * 3) // 4
    productions = []
    for item in treebank.fileids()[0:train_idx]:
        for tree in treebank.parsed_sents(item):
            tree.collapse_unary(collapsePOS=False)   # Remove unary production rules
            tree.chomsky_normal_form(horzMarkov=2)   # Convert to Chomsky normal form, i.e. A->(B,C,D) into A->(B,E), E->(C,D)
            productions += tree.productions()
    S = Nonterminal('S')
    if smoothing is None:
        grammar = learn_pcfg(S, productions)
    elif smoothing == 'L1':
        grammar = smoothing_pcfg(S, productions)
    with open('grammar.pkl', 'wb') as f:
        pickle.dump(grammar, f)
    return grammar
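# Hedged usage sketch (not part of the original code): once pcfg() above has been run
# and grammar.pkl written, the pickled grammar can be reloaded and handed to NLTK's
# ViterbiParser, assuming learn_pcfg() returns a standard NLTK PCFG. The helper name
# and the example sentence below are illustrative only.
import pickle
from nltk.parse import ViterbiParser


def _demo_parse_with_pickled_grammar():
    with open('grammar.pkl', 'rb') as f:
        grammar = pickle.load(f)
    parser = ViterbiParser(grammar)
    tokens = "Pierre Vinken will join the board".split()
    grammar.check_coverage(tokens)      # raises ValueError if a word is unknown to the grammar
    for tree in parser.parse(tokens):   # yields the most probable parse, if any
        print(tree.prob(), tree)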
def main():
    # print(nltk.corpus.treebank.parsed_sents('wsj_0001.mrg')[0])
    # nltk.corpus.treebank.parsed_sents('wsj_0001.mrg')[0].draw()
    # print("Induce PCFG grammar from treebank data:")
    productions = []
    print(len(treebank.fileids()))
    for item in treebank.fileids():  # Goes through all trees
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)   # Remove branches A-B-C into A-B+C
            tree.chomsky_normal_form(horzMarkov=2)   # Remove A->(B,C,D) into A->B,C+D->D
            productions += tree.productions()
    # print(type(productions[0]))
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    # print(grammar)  # This is a PCFG
    # pickle.dump(grammar, open("tbank-grammar.p", "wb"))
    # t = time.time()
    # grammar = pickle.load(open("tbank-grammar.p", "rb"))
    # textf = open("lexicon.txt", "w")
    # n = textf.write(str(reduce(lambda a, b: a + "\n" + b, list(filter(lambda x: "'" in x, str(grammar).split("\n"))))))
    # textf.close()
    # print(time.time()-t)
    parser = ViterbiParser(grammar)
    # pickle.dump(parser, open("cky-parser.p", "wb"))
    # parser = pickle.load(open("cky-parser.p", "rb"))
    parser.trace(0)
    sent = "John will join the board"
    tokens = sent.split()
    try:
        grammar.check_coverage(tokens)
        print("All words covered")
        parses = parser.parse_all(tokens)
        if parses:
            lp = len(parses)
            print(lp)
            print(parses[0].label())
            # parses[0].draw()
            p = reduce(lambda a, b: a + b.prob(),
                       list(filter(lambda x: x.label() == 'S', parses)), 0.0)
        else:
            p = 0
        print("Probability:", p)
    except ValueError:
        print("Some words not covered")
def sentences():
    for f in treebank.fileids():
        for t in treebank.parsed_sents(f):
            t.chomsky_normal_form(horzMarkov=1)
            t.collapse_unary(collapsePOS=True)
            yield (t, t.leaves())
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """
    import re
    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    for item in treebank.fileids()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = ' '.join(stemmed)
    results = re.sub(r"(.{,70})\s", r'\1\n', results + ' ').rstrip()

    # Convert the original to a string, and word wrap it.
    original = ' '.join(orig)
    original = re.sub(r"(.{,70})\s", r'\1\n', original + ' ').rstrip()

    # Print the results.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*' * 70)
def test_GrammarParser():
    import nltk
    from nltk.corpus import treebank

    grammar = r"""NP: {<DT>*(<NN>|<NNP>|<NNS>)+}  # Chunk everything
                      }<VBD|IN>+{                 # Chink sequences of VBD and IN
    """
    # tree = treebank.parsed_sents('wsj_0001.mrg')[0]
    # print(tree)
    grammar_VP = r"""VP: {<VBZ><VP>} """
    # tree = nltk.RegexpParser(grammar).parse(treebank.parsed_sents('wsj_0001.mrg')[0].pos())
    # print(tree)

    fileids = treebank.fileids()
    # for fileid in fileids:
    for i in range(len(fileids)):
        if i > 10:
            break
        trees = treebank.parsed_sents(fileids[i])
        for tree in trees:
            tree_Gram = nltk.RegexpParser(grammar).parse(tree)
            for subtree in tree_Gram.subtrees():
                if subtree.label() == "VP":
                    print(subtree)
def gen_corpus(path, threshold):
    """
    src: http://www.nltk.org/_modules/nltk/tree.html
    corpora from wsj_0001.mrg to wsj_0199.mrg
    e.g.: t = treebank.parsed_sents('wsj_0001.mrg')[0]
    to visualize a tree: t.draw()
    :param path: save to path
    :param threshold: minimum length of a sentence to keep
    :return: none
    """
    boundaries = []
    sentences = []
    for t in treebank.parsed_sents(treebank.fileids()):
        flat = _flatten_tree(t, threshold)
        if flat:
            boundaries.append(flat)
            sentence = ' '.join(t.leaves()).translate(PUNC_TRANS).lower()
            sentence = re.sub(r' +', ' ', sentence)
            # replace digit(s) with 'x'(s)
            sentences.append(re.sub(r'\d', 'x', sentence).strip())
    _check_length_match(boundaries, sentences)
    with open(path + "/boundaries.txt", 'w') as f:
        f.write('1'.join(boundaries))
    with open(path + "/sentences.txt", 'w') as f:
        f.write(' '.join(sentences))
def trial_run_with_treebank():
    files = treebank.fileids()
    files = files[:100]  # make shorter for setup
    gram = makeGrammarFromTreebank(files)
    myparser = cky_parser.ckyparser(gram, Nonterminal('S'))
    chart, mytrees = myparser.probabilistic_parse_from_sent("I saw John with my telescope")
    print(mytrees)
def load_data(self, percentage):
    print("Started Loading the Data")
    # Get the complete data
    data_set = treebank.fileids()
    # Partition the data into train and test data sets
    training_data_fileIds = [file for file in data_set if "wsj_00" in str(file)]
    testing_data_fileIds = [file for file in data_set if "wsj_01" in str(file)]
    # What percentage of the files should be used for training?
    index = int(percentage * len(training_data_fileIds))
    training_data_fileIds = training_data_fileIds[:index]

    tagged_training_data = treebank.tagged_sents(fileids=training_data_fileIds)
    tagged_testing_data = treebank.tagged_sents(fileids=testing_data_fileIds)
    tagged_training_words = treebank.tagged_words(fileids=training_data_fileIds)
    tagged_testing_words = treebank.tagged_words(fileids=testing_data_fileIds)
    # print(len(tagged_training_data1), len(tagged_testing_data1))

    # Untag the data for other uses
    untagged_training_data = [untag(item) for item in tagged_training_data]
    untagged_testing_data = [untag(item) for item in tagged_testing_data]

    print("Data Loaded Successfully. Stats are")
    print("Training Data Sentences: ", len(tagged_training_data))
    print("Testing Data Sentences: ", len(tagged_testing_data))
    return (tagged_training_data, tagged_testing_data, tagged_training_words,
            tagged_testing_words, untagged_training_data, untagged_testing_data)
def read_split_treebank():
    fileids = treebank.fileids()
    seed.shuffle(fileids)
    split = int(len(fileids) * 0.8)
    x_train = fileids[:split]
    x_test = fileids[split:]
    print('Train - Test: ', len(x_train), len(x_test))
    return x_train, x_test
def read_treebank(input_vocab_size=10000, output_vocab_size=10000, seq_len=10):
    all_sents = []
    for fname in treebank.fileids():
        sents = treebank.sents(fname)
        if sents:
            all_sents.extend(sents)
    return read_dataset(all_sents, input_vocab_size, output_vocab_size, seq_len)
def _induce_grammar(self):
    self.productions = []
    for tree in treebank.parsed_sents(treebank.fileids()):
        # perform optional tree transformations, e.g.:
        tree.collapse_unary(collapsePOS=False)   # Remove branches A-B-C into A-B+C
        tree.chomsky_normal_form(horzMarkov=2)   # Remove A->(B,C,D) into A->B,C+D->D
        self.productions += tree.productions()
def read_data():
    treebank_tagged_sents = list(
        chain(*[[tree.pos() for tree in treebank.parsed_sents(pf)]
                for pf in treebank.fileids()]))
    words_list = [[tag[0] for tag in sent] for sent in treebank_tagged_sents]
    labels = [[tag[1] for tag in sent] for sent in treebank_tagged_sents]

    words = []
    max_words = 0
    for sent in words_list:
        words.extend(sent)
        max_words = max(max_words, len(sent))
    print("Max. Words:", max_words)

    seq_length = 100
    print("Seq. Length:", seq_length)

    words = list(set(words))
    print("Number of Words:", len(words))

    unique_labels = []
    for sent in labels:
        unique_labels.extend(sent)
    unique_labels = list(set(unique_labels))
    print("Number of Unique Labels:", len(unique_labels))

    word2id = {word: i + 1 for i, word in enumerate(words)}
    id2word = {i + 1: word for i, word in enumerate(words)}

    X_data = []
    Y_data = []
    for i in range(len(treebank_tagged_sents)):
        for j in range(len(words_list[i])):
            _x = [0] * max_words
            for k in range(j + 1):
                _x[j - k] = word2id[words_list[i][k]]
            _x = _x[:seq_length]
            _x.reverse()
            X_data.append(_x)
            Y_data.append(one_hot(labels[i][j], unique_labels))

    X_data = np.array(X_data, dtype=np.int32)
    Y_data = np.array(Y_data, dtype=np.float32)
    print(X_data.shape)
    print(Y_data.shape)
    return X_data, Y_data, unique_labels, words, word2id, id2word
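# read_data() above calls a one_hot() helper that is defined elsewhere. This is a
# minimal sketch of what such a helper is assumed to do, matching how it is called
# (a label plus the list of unique labels, returning a 1-of-K vector); it is an
# assumption, not the original implementation.
def one_hot(label, unique_labels):
    vec = [0.0] * len(unique_labels)
    vec[unique_labels.index(label)] = 1.0
    return vec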
def get_training_and_test_split(config):
    files = treebank.fileids()
    random.shuffle(files)
    training = files[:int(0.9 * len(files))]
    test = files[int(0.9 * len(files)):]
    with open(config.train_set, 'w') as file:
        file.write(",".join(training))
    with open(config.test_set, 'w') as file:
        file.write(",".join(test))
def getGrammar():
    fileid = treebank.fileids()
    trainfiles = fileid[:160]
    # testfiles = fileid[0.8*len(fileid):]
    productions = []
    for item in trainfiles:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)   # Remove branches A-B-C into A-B+C
            tree.chomsky_normal_form(horzMarkov=2)   # Remove A->(B,C,D) into A->B,C+D->D
            productions += tree.productions()

    lhs_prod = [p.lhs() for p in productions]
    rhs_prod = [p.rhs() for p in productions]
    set_prod = set(productions)
    list_prod = list(set_prod)

    token_rule = []
    for ele in list_prod:
        if ele.is_lexical():
            token_rule.append(ele)

    set_token_rule = set(p.lhs() for p in token_rule)
    list_token_rule = list(set_token_rule)

    corr_list_token_rule = []
    for word in list_token_rule:
        if str(word).isalpha():
            corr_list_token_rule.append(word)
            continue
    # print(corr_list_token_rule)

    import nltk
    a = []
    for tok in corr_list_token_rule:
        # lhs = nltk.grammar.Nonterminal('UNK')
        lhs = 'UNK'
        rhs = [u'UNK']
        UNK_production = nltk.grammar.Production(lhs, rhs)
        lhs2 = nltk.grammar.Nonterminal(str(tok))
        a.append(nltk.grammar.Production(lhs2, [lhs]))
    token_rule.extend(a)
    list_prod.extend(a)

    S = Nonterminal('S')
    grammar = induce_pcfg(S, list_prod)
    return grammar
def ptb_wsj_set():
    trainsize = 3500
    testsize = 300
    ts = list(itertools.chain(*(treebank.parsed_sents(fid)
                                for fid in treebank.fileids())))[:trainsize + testsize]
    testsents = [" ".join(s.leaves()) for s in ts[trainsize:]]
    # with open("/tmp/trees.json", 'w') as f:
    #     lins = []
    #     for t in ts[:trainsize]:
    #         def lin(t):
    #             if isinstance(t, str):
    #                 label = t
    #                 cs = []
    #             else:
    #                 label = t.label()
    #                 cs = t
    #             if '"' in label:
    #                 raise Exception("»\"« occuring in data! :O")
    #             return "{\"label\": \"" + label + "\", \"children\": [" + ",".join([lin(c) for c in cs]) + "]}"
    #         lins.append(lin(t))
    #     print("[", ",\n".join(lins), "]", file=f)
    #
    # with open("/tmp/test.txt", 'w') as f:
    #     for s in testsents:
    #         print(s, file=f)
    ts = ts[:trainsize]

    def parse_treebank_grammar(treat_nt):
        def getrules(t):
            lhs = treat_nt(t.label())
            rhs = []
            for x in t:
                if isinstance(x, str):
                    rhs.append(x)
                else:
                    rhs.append(treat_nt(x.label()))
                    for r in getrules(x):
                        yield r
            yield (lhs, tuple(rhs))
        return normalize_rules(itertools.chain(*(getrules(t) for t in ts)))

    rules, ntdict = intify_prules(parse_treebank_grammar)
    # test = ["the old man", "something was .", "something was at the stock market today .",
    #         "Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 ."]
    return (rules, ntdict), testsents
def greedy_backward_selection():
    # set this to your own feature extraction function
    extractor = funtag_features.extract_features
    # funtag_features.availbale_features = []
    best_f_score = 0.0
    all_features = ['label', 'head_pos', 'head', 'yeild', 'alt_head', 'alt_pos', 'parent_labels']
    # , 'grandmother_label', 'sister_labels', 'sister_poss', 'sister_head']
    best_features = list(all_features)

    td_files, test_files = train_test_split(treebank.fileids(), train_size=0.8, random_state=0)
    train_files, dev_files = train_test_split(td_files, train_size=0.8, random_state=0)

    while True:
        f_score_feature = []
        for f in all_features:
            # funtag_features.availbale_features.append(f)
            if f in best_features:
                best_features.remove(f)
            X_train, Y_train = read_treebank_files(train_files, extractor, best_features)
            classifier = train_scikit_classifier(X_train, Y_train)
            X_eval, Y_eval = read_treebank_files(dev_files, extractor, best_features)
            Y_out = classifier.predict(X_eval)
            print('Done training and evaluating with features:', best_features)
            stats = {'*ALL*': PRF()}
            all_tags = sorted(set(t for ts in (Y_eval + list(Y_out)) for t in to_taglist(ts)))
            for tag in all_tags:
                stats[tag] = PRF()
            compute_statistics(Y_eval, Y_out, stats)
            f_score_feature.append((stats['*ALL*'].f1(), f))
            best_features.append(f)
        if best_f_score < max(f_score_feature)[0]:
            best_f_score = max(f_score_feature)[0]
            best_features.remove(max(f_score_feature)[1])
            all_features.remove(max(f_score_feature)[1])
        else:
            break
        if len(all_features) == 0:
            break

    print('Done with greedy_backward_selection..... ')
    print('Features are: ', best_features)
    print('Overall statistics:')
    print('      #corr #guess #true precision recall F-score')
    print('      {0: >4} {1: >4} {2: >4} {3:0.4f} {4:0.4f} {5:0.4f}'.format(*stats['*ALL*'].all()))
    print('Statistics for each function tag:')
    print('tag   #corr #guess #true precision recall F-score')
    for label in all_tags:
        print('{0} {1: >4} {2: >4} {3: >4} {4:0.4f} {5:0.4f} {6:0.4f}'.format(label, *stats[label].all()))
def get_trees_sentences():
    trees = []
    sentences = []
    for file in treebank.fileids():
        for tree in treebank.parsed_sents(file):
            tree_str = str(tree)
            trees.append(tree_str)
        for sentence in treebank.sents(file):
            s = " ".join(words for words in sentence)
            sentences.append(s)
    assert len(trees) == len(sentences)
    sentences = list(map(lambda x: x.lower(), sentences))
    return (trees, sentences)
def test_Phrase():
    import nltk
    from nltk.corpus import treebank

    fileids = treebank.fileids()
    grammar = r""" ADVP:{<RB>(<CC>*<RB>*|<JJ>*)}
    {}
    """
    for fileid in fileids:
        sents = treebank.tagged_sents(fileid)
        for sent in sents:
            tree_Gram = nltk.RegexpParser(grammar).parse(sent)
            for subtree in tree_Gram.subtrees():
                if subtree.label() == "ADVP":
                    print(subtree)
def train():
    files = tb.fileids()
    data = list(tb.parsed_sents(files))

    # 80:20 split
    split = int(len(data) * 0.8)
    train_data = data[:split]
    test_data = data[split:]

    P_grammar, P_non_terms, P_vocab, P_term_parents, P_parents_count = pcfg.pcfg(train_data)

    total_precision = 0
    total_recall = 0
    total_f1_score = 0
    i = 0
    for test in test_data:
        print('Test', i)
        i += 1
        try:
            words = test.leaves()
            scores, backs = cky_parsing(words, copy(P_grammar), copy(P_non_terms),
                                        copy(P_vocab), copy(P_term_parents),
                                        copy(P_parents_count))
            start = Tree(Nonterminal('S'), [])
            if scores[0][len(words)][Nonterminal('S')] == 0:
                start = get_start(scores, len(words))
            predicted_tree = build_tree(start, 0, len(words), backs, P_non_terms)
            clean_tree(predicted_tree)
            predicted_tree.un_chomsky_normal_form()
            precision, recall, f1_score = evaluate(words, predicted_tree, test)
            print(precision, recall, f1_score)
            total_precision += precision
            total_recall += recall
            total_f1_score += f1_score
        except:
            print('***************Failed', i - 1)
            continue

    total_precision /= len(test_data)
    total_recall /= len(test_data)
    total_f1_score /= len(test_data)
    print('Precision', total_precision)
    print('Recall', total_recall)
    print('F1_score', total_f1_score)
def getProductions(pos):
    from nltk.corpus import treebank
    fileids = treebank.fileids()
    pos_ProductionList = []
    for fileid in fileids:
        trees = treebank.parsed_sents(fileid)
        for tree in trees:
            productions = tree.productions()
            for production in productions:
                if str(production.lhs()) == pos:
                    pos_ProductionList.append(production)
    c_POS = Counter(pos_ProductionList)
    c_POS_sorted = sorted(c_POS.items(), key=lambda asd: asd[1], reverse=True)
    print(c_POS_sorted)
def PCFG_Section():
    toy_pcfg1 = PCFG.fromstring("""
        S -> NP VP [1.0]
        NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
        Det -> 'the' [0.8] | 'my' [0.2]
        N -> 'man' [0.5] | 'telescope' [0.5]
        VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
        V -> 'ate' [0.35] | 'saw' [0.65]
        PP -> P NP [1.0]
        P -> 'with' [0.61] | 'under' [0.39]
        """)

    pcfg_prods = toy_pcfg1.productions()
    pcfg_prod = pcfg_prods[2]
    print('A PCFG production:', pcfg_prod)
    print('pcfg_prod.lhs() =>', pcfg_prod.lhs())
    print('pcfg_prod.rhs() =>', pcfg_prod.rhs())
    print('pcfg_prod.prob() =>', pcfg_prod.prob())

    # extract productions from the first few treebank files and induce the PCFG
    print("Induce PCFG grammar from treebank data:")
    productions = []
    for item in treebank.fileids()[:2]:
        for tree in treebank.parsed_sents(item):
            # print(" ".join(tree.leaves()))
            # perform optional tree transformations, e.g.:
            # tree.collapse_unary(collapsePOS=False)   # Remove branches A-B-C into A-B+C
            # tree.chomsky_normal_form(horzMarkov=2)   # Remove A->(B,C,D) into A->B,C+D->D
            prods = tree.productions()
            # print(prods[0].prob())
            productions += prods

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    # print(grammar)  # This is a PCFG

    ### Parsing section below ###
    print("\nParse sentence using induced grammar:")
    parser = pchart.InsideChartParser(grammar)
    parser.trace(1)

    # A plain treebank Tree has no prob(); parse its leaves and print the parse probabilities instead.
    sent = treebank.parsed_sents('wsj_0001.mrg')[0].leaves()
    for parse in parser.parse(sent):
        print(parse.prob())
def test_VBPenn():
    from nltk.corpus import treebank
    fileids = treebank.fileids()
    VBContexts = []
    for i in range(len(fileids)):
        sentPOSList = treebank.tagged_sents(fileids[i])
        for sentPOS in sentPOSList:
            # print('sentPOS', sentPOS)
            VBContext = getVBContext(sentPOS)
            # print('VBContext', VBContext)
            if len(VBContext) != 0:
                POSList = ''
                for wordPOS in VBContext:
                    # print('wordPOS', wordPOS)
                    POSList += wordPOS[1] + '+'
                VBContexts.append((POSList, VBContext))
    print(VBContexts)
def get_words():
    """
    Returns a list of unique words from the NLTK treebank sample.
    """
    import nltk
    nltk.download("treebank")
    from nltk.corpus import treebank

    word_ls = []
    for item in treebank.fileids():
        for (word, tag) in treebank.tagged_words(item):
            # lower-case each word before collecting it
            word = word.lower()
            word_ls.append(word)
    word_ls = list(set(word_ls))
    return word_ls
def get_trees(fileids=None, verbose=False):
    """
    Get the CNF trees for the treebank fileids given, or for the entire treebank.
    """
    if not fileids:
        # Get the Penn Treebank corpus
        fileids = treebank.fileids()
    # Get the sentence-trees in each file
    tree_lists = [treebank.parsed_sents(file_id) for file_id in fileids]
    trees = [sent for sent_list in tree_lists for sent in sent_list]
    if verbose:
        print("obtained", len(trees), "trees from the corpus.")
    cnf_trees = [ctc.convert_tree(t) for t in trees]
    if verbose:
        print("converted", len(trees), "trees to cnf.")
    return cnf_trees
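# ctc.convert_tree() above is project-specific and not shown here. As a rough
# reference, NLTK's own Tree transformations can produce a comparable CNF
# conversion; this sketch is an assumption about what the converter does, not the
# project's actual implementation.
def to_cnf_sketch(tree):
    t = tree.copy(deep=True)              # leave the original treebank tree untouched
    t.collapse_unary(collapsePOS=False)   # remove unary chains A-B-C -> A-B+C
    t.chomsky_normal_form(horzMarkov=2)   # binarize A->(B,C,D) into A->(B,E), E->(C,D)
    return t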
def make_PCFG_grammar():
    '''
    Forms a PCFG grammar using the first 1964 files of the WSJ treebank.
    '''
    # Save a list of all produced PCFG rules given the training data
    PCFG_rules = []
    for item in treebank.fileids()[:1964]:
        # We first want to get rid of all non-binary branchings of the tree
        for tree in treebank.parsed_sents(item):
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)
            PCFG_rules += tree.productions()

    # Induce the PCFG grammar
    S = Nonterminal('S')
    PCFG_grammar = induce_pcfg(S, PCFG_rules)
    return PCFG_grammar
def parse(sent):
    files = tb.fileids()
    data = list(tb.parsed_sents(files))
    P_grammar, P_non_terms, P_vocab, P_term_parents, P_parents_count = pcfg.pcfg(data)

    words = casual_tokenize(str(sent))
    scores, backs = cky_parsing(words, copy(P_grammar), copy(P_non_terms),
                                copy(P_vocab), copy(P_term_parents),
                                copy(P_parents_count))
    start = Tree(Nonterminal('S'), [])
    if scores[0][len(words)][Nonterminal('S')] == 0:
        start = get_start(scores, len(words))
    predicted_tree = build_tree(start, 0, len(words), backs, P_non_terms)
    clean_tree(predicted_tree)
    predicted_tree.un_chomsky_normal_form()
    print('Parsed Tree')
    print(predicted_tree)
def buildpcfg():
    # call this function once then pickle the PCFG
    productions = []
    for item in treebank.fileids():
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)   # Remove branches A-B-C into A-B+C
            tree.chomsky_normal_form(horzMarkov=2)   # Remove A->(B,C,D) into A->B,C+D->D
            productions += tree.productions()
    '''
    this just overwrites everything to 'X' apparently
    for production in productions:
        # rewrite every production to use the universal tagset
        if isinstance(production.lhs(), Nonterminal) and production.lhs().symbol() != 'S':
            production._lhs = Nonterminal(tag.map_tag('en-ptb', 'universal', production.lhs().symbol()))
        for elem in enumerate(production.rhs()):
            if isinstance(elem, Nonterminal):
                production._rhs[elem] = Nonterminal(tag.map_tag('en-ptb', 'universal', production.rhs()[elem].symbol()))
    '''
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)  # induce PCFG from productions learned in treebank
    # print(grammar)
    return grammar
def test_PennCorpus():
    from nltk.corpus import treebank
    fileids = treebank.fileids()
    ADJP_ProductionList = []
    ADVP_ProductionList = []
    NP_ProductionList = []
    VP_ProductionList = []
    for fileid in fileids:
        trees = treebank.parsed_sents(fileid)
        for tree in trees:
            productions = tree.productions()
            for production in productions:
                if str(production.lhs()) == 'ADJP':
                    ADJP_ProductionList.append(production)
                if str(production.lhs()) == 'ADVP':
                    ADVP_ProductionList.append(production)
                if str(production.lhs()) == 'NP':
                    NP_ProductionList.append(production)
                if str(production.lhs()) == 'VP':
                    VP_ProductionList.append(production)
    print(len(ADJP_ProductionList), len(ADVP_ProductionList),
          len(NP_ProductionList), len(VP_ProductionList))
    c_ADJP = Counter(ADJP_ProductionList)
    c_ADVP = Counter(ADVP_ProductionList)
    c_NP = Counter(NP_ProductionList)
    c_VP = Counter(VP_ProductionList)
    c_ADJP_sorted = sorted(c_ADJP.items(), key=lambda asd: asd[1], reverse=True)
    c_ADVP_sorted = sorted(c_ADVP.items(), key=lambda asd: asd[1], reverse=True)
    c_NP_sorted = sorted(c_NP.items(), key=lambda asd: asd[1], reverse=True)
    c_VP_sorted = sorted(c_VP.items(), key=lambda asd: asd[1], reverse=True)
def train_pcfg():
    print('training grammar')
    productions = []
    # print(len(treebank.fileids()))
    trees = []
    # up to 199 files; fewer files give a shorter grammar for quicker training
    for fileid in treebank.fileids()[0:20]:
        for tree in treebank.parsed_sents(fileid):
            # perform optional tree transformations, e.g.:
            # Remove branches A->B->C into A->B+C so we can avoid infinite productions
            tree.collapse_unary(collapsePOS=False)
            # Remove A->(B,C,D) into A->B,C+D->D (binarization req'd by CKY parser)
            # horizontal and vertical Markovization: remember parents and siblings in tree.
            # This gives a performance boost, but makes the grammar HUGE.
            # If we use these we would need to implement a tag forgetting method.
            # tree.chomsky_normal_form(horzMarkov=0, vertMarkov=0)
            tree.chomsky_normal_form()
            productions += tree.productions()
    S = nltk.Nonterminal('S')
    grammar = nltk.induce_pcfg(S, productions)
    print("grammar trained!")
    return grammar
def main():
    # train = treebank.fileids()[:190]
    test = treebank.fileids()[190:]  # hold out the last files for testing

    # original grammar
    # pcfg = induce_grammar(train)
    # pickle.dump(pcfg, open("grammar.pcfg", 'wb'))

    # load grammar
    # pcfg: PCFG = pickle.load(open("grammar.pcfg", 'rb'))

    # fill in missing words
    # missing_words = get_missing_words(pcfg, test)
    # pcfg_unk = fill_missing_words(pcfg, missing_words)
    # pickle.dump(pcfg_unk, open("grammar_unk.pcfg", 'wb'))

    # load unk grammar
    pcfg_unk: PCFG = pickle.load(open("grammar_unk.pcfg", 'rb'))

    # use unk grammar on test sentences
    parser = ViterbiParser(pcfg_unk)
    parse_treebank(parser, test)
def lab1():
    # set this to your own feature extraction function
    extractor = funtag_features.extract_features

    # We reserve some treebank files for testing purposes.
    # This shouldn't be touched until you have optimized your
    # results on the development set.
    td_files, test_files = train_test_split(treebank.fileids(), train_size=0.8, random_state=0)

    # Split the rest into a training and a development set.
    train_files, dev_files = train_test_split(td_files, train_size=0.8, random_state=0)

    print('Reading training trees from treebank...')
    X_train, Y_train = read_treebank_files(train_files, extractor, [])

    print('Training classifier...')
    classifier = train_scikit_classifier(X_train, Y_train)
    print('Done training.')

    print('Reading evaluation trees from treebank...')
    X_eval, Y_eval = read_treebank_files(dev_files, extractor, [])

    # When you have optimized your system for the development set,
    # you can evaluate on the test set.
    # X_eval, Y_eval = read_treebank_files(test_files, extractor, [])

    print('Running classifier on evaluation data...')
    Y_out = classifier.predict(X_eval)

    print_stats(Y_eval, Y_out)
def perplexity():
    '''
    Given the PCFG and the parser used, run the parser on the rest of the
    treebank and calculate the perplexity of the model on the testing sentences.
    '''
    PCFG_grammar = make_PCFG_grammar()
    parser = ViterbiParser(PCFG_grammar)
    all_p = []
    skipped_sentence = 0
    for item in treebank.fileids()[1964:]:
        trees = treebank.parsed_sents(item)
        for tree in trees:
            tree = tree.leaves()
            try:
                PCFG_grammar.check_coverage(tree)
                for parse in parser.parse(tree):
                    parse_string = str(parse)
                    p = re.search(r"p=([^/]+)", parse_string).group(1)
                    p = p[:-1]
                    all_p.append(float(p))
            except:
                skipped_sentence += 1
                continue

    # perplexity = (product over parses of 1/p)^(1/N)
    perplexity = 1
    N = float(len(all_p))
    for p in all_p:
        perplexity = perplexity * (1 / p)
    perplexity = pow(perplexity, 1 / float(N))

    print("Perplexity:", perplexity)
    print("All parse probabilities:", all_p)
    print("Skipped sentences:", skipped_sentence)
    print("PCFG grammar:", PCFG_grammar)
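# The loop above computes perplexity as (prod 1/p_i)^(1/N), which can overflow or
# underflow once many sentences are involved. A numerically safer, mathematically
# equivalent log-space version is sketched below; it is an illustration, not part
# of the original code.
import math


def perplexity_from_probs(all_p):
    n = len(all_p)
    # (prod 1/p_i)^(1/n) == exp(-(1/n) * sum(log p_i))
    return math.exp(-sum(math.log(p) for p in all_p) / n)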
def main():
    # get all treebank files
    files = treebank.fileids()
    # grammar will be stored in a dict
    grammar = {}
    for file in files:
        trees = treebank.parsed_sents(file)
        for tree in trees:
            grammar = traverse_tree(tree, grammar)
    cfg = get_grammar_series(grammar)
    cfg.to_csv("cfg_instances.csv")
    cfg.to_pickle('cfg_pickle.pickle')
    pcfg = get_probabilities(grammar)
    pcfg_series = get_grammar_series(pcfg)
    pcfg_series.to_csv('problem_1_pcfg.csv')
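# traverse_tree(), get_grammar_series() and get_probabilities() above are defined
# elsewhere. As a hedged sketch of the normalization step only: assuming `grammar`
# maps each LHS symbol to a dict of RHS -> count, the PCFG probabilities would be
# relative frequencies per LHS. This is an assumption, not the original helper.
def get_probabilities_sketch(grammar):
    pcfg = {}
    for lhs, rhs_counts in grammar.items():
        total = sum(rhs_counts.values())
        pcfg[lhs] = {rhs: count / total for rhs, count in rhs_counts.items()}
    return pcfg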
files = []


def find_pronouns(tree):
    pronouns = []
    for child in tree:
        if isinstance(child, str) and child.lower() in PRONOUNS:
            pronouns.append((child.lower(), None))
        if isinstance(child, ParentedTree):
            pronouns = pronouns + find_pronouns(child)
    return pronouns


total = 0
for file in treebank.fileids():
    stats['name'] = file
    for tree in treebank.parsed_sents(file):
        tree = ParentedTree.convert(tree)
        for pronoun, np_node in find_pronouns(tree):
            if pronoun in gendered:
                stats['gendered'] += 1
            if pronoun in itits:
                stats['itits'] += 1
            stats['total'] += 1
            total += 1
    stats['pct_gendered'] = stats['gendered'] / float(stats['total'])
    print(file, total)
    files.append(stats.copy())
from nltk import ParentedTree, corpus
from nltk.corpus import treebank, names
import Queue
from collections import deque
from sys import argv

FILENAMES = [file for file in treebank.fileids()[:20]]

PRONOUNS = {'he': 'male', 'him': 'male', 'his': 'male', 'himself': 'male',
            'she': 'female', 'her': 'female', 'hers': 'female', 'herself': 'female',
            'they': 'plural', 'them': 'plural', 'their': 'plural', 'theirs': 'plural',
            'themselves': 'plural',
            'it': 'singular', 'its': 'singular', 'itself': 'singular'}

PRONOUN_RESULTS = {'male': 0, 'male_total': 0, 'male_pct': 0,
                   'female': 0, 'female_total': 0, 'female_pct': 0,
                   'neutral': 0, 'neutral_total': 0, 'neutral_pct': 0,
                   'they': 0, 'they_total': 0, 'they_pct': 0,
                   'reflexive': 0, 'reflexive_total': 0, 'reflexive_pct': 0}

NAMELIST = ([(name, "male") for name in names.words("male.txt")] +
            [(name, "female") for name in names.words("female.txt")])

NOMINALS = {'NN': 'singular', 'NNS': 'plural', 'NNP': 'singular',
            'NNPS': 'plural', 'PRP': 'singular'}


def update_pronoun_results(pronoun, correct):
    if pronoun in ['he', 'him', 'his', 'himself']:
        if correct:
            PRONOUN_RESULTS['male'] += 1
    tag."""
    return [(word, "NN") for word in sent]


# EVALUATION

import nltk
from nltk.corpus import treebank

training_size = 150
development_size = 25

training_data = list(treebank.tagged_sents(treebank.fileids()[:training_size]))
development_data = list(treebank.tagged_sents(
    treebank.fileids()[training_size:training_size + development_size]))


def flatten(lol):
    """Given a list of lists of values, produce a single flat list of values
    by concatenating the lists."""
    return [value for l in lol for value in l]


def filter_text(corpus):
    return [[(word, tag) for (word, tag) in sent if tag != '-NONE-']
            for sent in corpus]


def measure_accuracy():
    """Measure the accuracy of your approach by counting the proportion of
    words for which the correct tag was recovered by your algorithm."""
            if mytree:
                print("parsed!", mytree)
                original_trees += [tree]
                resulting_trees += [mytree]
            else:
                print("didn't parse")
                unsuccessful_parses += 1

        if original_trees:
            # If we got some successful parses, evaluate these:
            accuracy = evalb.evalb(resulting_trees, original_trees)
            count = len(original_trees)
            if verbose:
                print("In fold", i, "we parsed", count,
                      "sentences with an accuracy of", accuracy)
            # Update the count and averages
            avg_accuracy = (avg_accuracy * parsed_num + accuracy * count)
            parsed_num += count
            avg_accuracy /= parsed_num
        unparsed_num += unsuccessful_parses

    # Aggregate results.
    if verbose:
        print("We parsed", parsed_num, "sentences with an accuracy of", avg_accuracy)
        print("We could not parse", unparsed_num, "sentences")
    return avg_accuracy


print(cross_validate(fileids=treebank.fileids()[:25], verbose=True))
    else:
        for i in tree:
            topresent(i, tree)


def halflemmatize(word):
    if word in ['fell', 'fallen']:
        return 'fall'
    if word in ['is', 'was']:
        return 'is'
    elif word in ['are', 'were', '\'re']:
        return 'are'
    elif word in ['\'m', 'am']:
        return 'am'
    elif word in ['\'S', '\'s']:
        return 'is'
    out = wnl.lemmatize(word.lower(), 'v')
    return out


if __name__ == "__main__":
    count = 0
    for file in treebank.fileids():
        for i in treebank.parsed_sents(file):
            print(i.leaves())
            if i.leaves()[0] == 'These' and i.leaves()[1] == 'three':
                print(i)
                tofuture(i, Tree('None', []))
                print(i.leaves())
            count += 1