def create_files(d):
    reader = BracketParseCorpusReader(path, ".*parse")
    print >> sys.stderr, "Reader is created now"
    c = 0
    #filetypes = "pos clean-sent aw.tw sense".split()
    filetypes = "pos clean-sent sense".split()
    files = map(lambda x: gzip.open("ontonotes.%s.gz" % x, 'w'), filetypes)
    for parse_file, sentids in d.viewitems():
        parse_file = '/'.join(parse_file.split('/')[-4:])
        sentences = reader.parsed_sents(parse_file)
        for sentid, triple in sentids.viewitems():
            sentence = sentences[sentid]
            clean_sent_list = []
            clean_pos_list = []
            for word, p in sentence.pos():
                if p != '-NONE-':
                    if word in fix:
                        word = fix[word]
                    clean_sent_list.append(word)
                    clean_pos_list.append(p)
            for w, tid, senseid in triple:
                t = clean_sent_list[tid]
                p = clean_pos_list[tid]
                w = w.replace('-', '.')
                mm = "line-{}\t{}\t{}\t{}\t{}\t{}\t{}".format(c, t, c, tid, p, w, tid)
                ss = "line-{}\t{}\t{}\t{}".format(c, t, w, senseid)
                print mm
                write2file(files, [clean_pos_list, clean_sent_list, [ss]])
                c += 1
    map(lambda f: f.close(), files)
def _load(self):
    # load vocab file
    self.vocab = OrderedDict()
    with open(self.vocab_file, encoding='utf-8') as vf:
        for line in vf.readlines():
            line = line.strip()
            self.vocab[line] = len(self.vocab)
    # filter glove
    if self.pretrained_file != '' and os.path.exists(self.pretrained_file):
        glove_emb = {}
        with open(self.pretrained_file, 'r', encoding='utf-8') as pf:
            for line in pf.readlines():
                sp = line.split(' ')
                if sp[0].lower() in self.vocab:
                    glove_emb[sp[0].lower()] = np.array([float(x) for x in sp[1:]])
    files = ['{}.txt'.format(self.mode)]
    corpus = BracketParseCorpusReader('{}/sst'.format(self.dir), files)
    sents = corpus.parsed_sents(files[0])

    # initialize with glove
    pretrained_emb = []
    fail_cnt = 0
    for line in self.vocab.keys():
        if self.pretrained_file != '' and os.path.exists(self.pretrained_file):
            if not line.lower() in glove_emb:
                fail_cnt += 1
            pretrained_emb.append(glove_emb.get(line.lower(),
                                                np.random.uniform(-0.05, 0.05, 300)))

    if self.pretrained_file != '' and os.path.exists(self.pretrained_file):
        self.pretrained_emb = F.tensor(np.stack(pretrained_emb, 0))
        print('Miss word in GloVe {0:.4f}'.format(1.0 * fail_cnt / len(self.pretrained_emb)))
    # build trees
    for sent in sents:
        self.trees.append(self._build_tree(sent))
def make_dataset(self, corpus: str) -> Dataset:
    reader = BracketParseCorpusReader(*os.path.split(corpus),
                                      encoding=self.encoding,
                                      detect_blocks='sexpr')
    oracles = [DiscOracle.from_tree(t) for t in reader.parsed_sents()]
    examples = [make_example(x, self.fields) for x in oracles]
    return Dataset(examples, self.fields)
class Treebank:
    def __init__(self, corpus_file: str, lowercase: bool = True) -> None:
        self.corpus_file = corpus_file
        self.lowercase = lowercase
        self._reader = BracketParseCorpusReader(*os.path.split(corpus_file))

    def parsed_sentences(self) -> Iterable[Tree]:
        if self.lowercase:
            return (self.lowercase_leaves(parsed_sent)
                    for parsed_sent in self._reader.parsed_sents())
        else:
            return self._reader.parsed_sents()

    @classmethod
    def lowercase_leaves(cls, tree):
        if isinstance(tree, str):
            return tree.lower()
        return Tree(tree.label(), [cls.lowercase_leaves(child) for child in tree])
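A minimal usage sketch for the Treebank wrapper above; the file path is hypothetical and not from the source. With lowercase=True, parsed_sentences() yields nltk.tree.Tree objects whose leaves have been lowercased.

treebank = Treebank('data/train.mrg')  # hypothetical bracketed-parse file
for tree in treebank.parsed_sentences():
    # root label followed by the lowercased tokens of the sentence
    print(tree.label(), ' '.join(tree.leaves()))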
def process_parse_annotation():
    print >> sys.stderr, "Parsing started"
    reader = BracketParseCorpusReader(annotations_path, '.*parse')
    pos_set = set("NN VB RB JJ".split())  # word level pos tags for n, v, adv, adj.
    check_pos = lambda x: x in pos_set
    d = dd(lambda: count(0))
    for fileid in reader.fileids():
        #print fileid
        for sentence in reader.parsed_sents(fileid):
            for word, p in sentence.pos():
                pos = p[0:2]
                if p != '-NONE-' and check_pos(pos):
                    d[pos].next()
    print [(pos, c.next()) for pos, c in d.iteritems()]
def annotation_process():
    d = get_inventory_info()
    annotated_files = find_files(annotations_path, "*.sense")
    pos_file = gzip.open('on.pos.gz', 'w')
    inst_num_dict = dd(lambda: count(1))
    for num_processed, fn in enumerate(annotated_files):
        if num_processed % 1000 == 0:
            print >> sys.stderr, "{} files processed".format(num_processed)
        directory = os.path.dirname(fn)
        basename = os.path.basename(fn)
        reader = BracketParseCorpusReader(directory, basename.replace('.sense', '.parse'))
        fileid = reader.fileids()[0]
        sentences = dict()
        parsed_sents = reader.parsed_sents(fileid)
        for line in open(fn):
            line = line.split()
            tw = line[3]
            onto_sense = line[-1]
            sent_id, tok_id = int(line[1]), int(line[2])
            stuple = sentences.setdefault(sent_id, None)
            if stuple is None:
                sentence = parsed_sents[sent_id]
                clean_sent = []
                clean_pos = []
                for word, p in sentence.pos():
                    if p != '-NONE-':
                        if word in fix:
                            word = fix[word]
                        clean_sent.append(word)
                        clean_pos.append(p)
                sentences[sent_id] = (clean_sent, clean_pos)
            else:
                clean_sent, clean_pos = stuple
            lexicon_senses, version, ita = d[tw][onto_sense]
            w = tw.replace('-', '.')  # following the convention of SemEval
            m = "{}\t{}.on.{}\t{}-{}-{}\t{}-{}\t{}\t{}\t{}\t{}\t{}"
            print m.format(w, w, inst_num_dict[tw].next(), line[0], sent_id, tok_id,
                           w, onto_sense, lexicon_senses, version, ita, tok_id,
                           " ".join(clean_sent))
            pos_file.write("{}\n".format(clean_pos))
    print >> sys.stderr, "{} files processed".format(num_processed)
def text2DGL(source_file, vocab_file, embed_file, word_dim):
    # vocab(stoi): {word : index}
    vocab = OrderedDict()
    with open(vocab_file, encoding='utf-8') as vf:
        for line in vf.readlines():
            line = line.strip()
            vocab[line] = len(vocab)

    # enrich word embedding
    embedding = np.random.random((len(vocab), word_dim))
    with open(embed_file, 'r', encoding='utf-8') as pf:
        for line in pf.readlines():
            sp = line.split(' ')
            if sp[0].lower() in vocab:
                embedding[vocab[sp[0].lower()]] = np.array(
                    [float(x) for x in sp[1:]])

    # build dgl from file
    files = [source_file]
    corpus = BracketParseCorpusReader('{}'.format(""), files)
    sents = corpus.parsed_sents(files[0])
    trees = [build_tree(sent, vocab) for sent in sents]
    return trees, embedding, vocab
    tree = parser.generate_parse_tree(["fish", "people", "fish", "tanks"],
                                      root_tag="S", theta=args.theta)
    tree.draw()
else:
    if args.validate:
        validation_fn = os.path.join(args.data, "validation.txt")
        open(validation_fn, "a").write(("h: {0} v: {1} theta: {2} "
                                        "maxTrain: {3} maxValid: {4}\n")
                                       .format(args.horizontal, args.vertical,
                                               args.theta, args.maxTrain,
                                               args.maxValid))
        evaluator = hw5.Evaluator(["ROOT", "TOP"], ["''", "``", ".", ":", ","])
        for gold in out_of_domain.parsed_sents():
            sentence = map(unicode, gold.leaves())
            if len(sentence) > args.maxValid:
                continue
            guess = parser.generate_parse_tree(sentence, theta=args.theta)
            guess.un_chomsky_normal_form()
            evaluator(guess, gold)
        print("F1 = {0}".format(evaluator.get_f1()))
        open(validation_fn, "a").write("out-of-domain: {0}\n"
                                       .format(evaluator.get_f1()))
        evaluator = hw5.Evaluator(["ROOT", "TOP"], ["''", "``", ".", ":", ","])
        for gold in in_domain.parsed_sents():
            sentence = map(unicode, gold.leaves())
            if len(sentence) > args.maxValid:
                continue
class PtbParser(object):
    """Gold parser that gets annotations from the PTB.

    It uses an instantiated NLTK BracketParseCorpusReader
    for the PTB section relevant to the RST DT corpus.

    Note that the path you give to this will probably end with
    something like `parsed/mrg/wsj`
    """

    def __init__(self, corpus_dir):
        """ """
        self.reader = BracketParseCorpusReader(corpus_dir,
                                               r'../wsj_.*\.mrg',
                                               encoding='ascii')

    def tokenize(self, doc):
        """Tokenize the document text using the PTB gold annotation.

        Return a tokenized document.
        """
        # get tokens from PTB
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get doc text
        # here we cheat and get it from the RST-DT tree
        # was: rst_text = doc.orig_rsttree.text()
        rst_text = doc.text
        tagged_tokens = self.reader.tagged_words(ptb_name)
        # tweak tokens THEN filter empty nodes
        tweaked1, tweaked2 =\
            itertools.tee(_tweak_token(ptb_name)(i, tok)
                          for i, tok in enumerate(tagged_tokens)
                          if not is_empty_category(tok[1]))
        spans = generic_token_spans(rst_text, tweaked1,
                                    txtfn=lambda x: x.tweaked_word)
        result = [_mk_token(t, s) for t, s in izip(tweaked2, spans)]

        # store in doc
        doc.tkd_tokens.extend(result)

        return doc

    def parse(self, doc):
        """
        Given a document, return a list of educified PTB parse trees
        (one per sentence).

        These are almost the same as the trees that would be returned by the
        `parsed_sents` method, except that each leaf/node is associated with
        a span within the RST DT text.

        Note: does nothing if there is no associated PTB corpus entry.
        """
        # get PTB trees
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get tokens from tokenized document
        # FIXME alignment/reconstruction should never have to deal
        # with the left padding token in the first place
        doc_tokens = doc.tkd_tokens[1:]  # skip left padding token
        tokens_iter = iter(doc_tokens)

        trees = []
        lex_heads = []
        for tree in self.reader.parsed_sents(ptb_name):
            # apply standard cleaning to tree
            # strip function tags, remove empty nodes
            tree_no_empty = prune_tree(tree, is_non_empty)
            tree_no_empty_no_gf = transform_tree(tree_no_empty,
                                                 strip_subcategory)
            leaves = tree_no_empty_no_gf.leaves()
            tslice = itertools.islice(tokens_iter, len(leaves))
            clean_tree = ConstituencyTree.build(tree_no_empty_no_gf, tslice)
            trees.append(clean_tree)
            # lexicalize the PTB tree: find the head word of each constituent
            # constituents and their heads are designated by their Gorn address
            # ("tree position" in NLTK) in the tree
            lheads = find_lexical_heads(clean_tree)
            lex_heads.append(lheads)

        # store trees in doc
        doc.tkd_trees.extend(trees)
        # store lexical heads in doc
        # TODO move to DocumentPlus
        doc.lex_heads = []
        doc.lex_heads.append(None)
        # end TODO
        doc.lex_heads.extend(lex_heads)

        return doc
class PtbParser(object):
    """Gold parser that gets annotations from the PTB.

    It uses an instantiated NLTK BracketParseCorpusReader
    for the PTB section relevant to the RST DT corpus.

    Note that the path you give to this will probably end with
    something like `parsed/mrg/wsj`
    """

    def __init__(self, corpus_dir):
        """ """
        self.reader = BracketParseCorpusReader(corpus_dir,
                                               r'../wsj_.*\.mrg',
                                               encoding='ascii')

    def tokenize(self, doc):
        """Tokenize the document text using the PTB gold annotation.

        Parameters
        ----------
        doc: DocumentPlus
            Rich representation of the document.

        Returns
        -------
        doc: DocumentPlus
            Rich representation of the document, with tokenization.
        """
        # get tokens from PTB
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get doc text
        # here we cheat and get it from the RST-DT tree
        # was: rst_text = doc.orig_rsttree.text()
        rst_text = doc.text
        tagged_tokens = self.reader.tagged_words(ptb_name)
        # tweak tokens THEN filter empty nodes
        tweaked1, tweaked2 =\
            itertools.tee(_tweak_token(ptb_name)(i, tok)
                          for i, tok in enumerate(tagged_tokens)
                          if not is_empty_category(tok[1]))
        spans = generic_token_spans(rst_text, tweaked1,
                                    txtfn=lambda x: x.tweaked_word)
        result = [_mk_token(t, s) for t, s in izip(tweaked2, spans)]

        # store in doc
        doc.set_tokens(result)

        return doc

    def parse(self, doc):
        """Parse a document, using the gold PTB annotation.

        Given a document, return a list of educified PTB parse trees
        (one per sentence).

        These are almost the same as the trees that would be returned by the
        `parsed_sents` method, except that each leaf/node is associated with
        a span within the RST DT text.

        Note: does nothing if there is no associated PTB corpus entry.

        Parameters
        ----------
        doc: DocumentPlus
            Rich representation of the document.

        Returns
        -------
        doc: DocumentPlus
            Rich representation of the document, with syntactic
            constituency trees.
        """
        # get PTB trees
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get tokens from tokenized document
        # FIXME alignment/reconstruction should never have to deal
        # with the left padding token in the first place
        doc_tokens = doc.tkd_tokens[1:]  # skip left padding token
        tokens_iter = iter(doc_tokens)

        trees = []
        lex_heads = []
        for tree in self.reader.parsed_sents(ptb_name):
            # apply standard cleaning to tree
            # strip function tags, remove empty nodes
            tree_no_empty = prune_tree(tree, is_non_empty)
            tree_no_empty_no_gf = transform_tree(tree_no_empty,
                                                 strip_subcategory)
            leaves = tree_no_empty_no_gf.leaves()
            tslice = itertools.islice(tokens_iter, len(leaves))
            clean_tree = ConstituencyTree.build(tree_no_empty_no_gf, tslice)
            trees.append(clean_tree)
            # lexicalize the PTB tree: find the head word of each constituent
            # constituents and their heads are designated by their Gorn address
            # ("tree position" in NLTK) in the tree
            lheads = find_lexical_heads(clean_tree)
            lex_heads.append(lheads)

        # store trees in doc
        doc.set_syn_ctrees(trees, lex_heads=lex_heads)

        return doc
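For reference, the two reader calls this class relies on can be exercised directly with the standard NLTK API; a minimal sketch, assuming a hypothetical local PTB directory (the path and fileid layout are not from the source).

from nltk.corpus.reader import BracketParseCorpusReader

# Hypothetical local PTB root; adjust to your layout (e.g. .../parsed/mrg/wsj).
reader = BracketParseCorpusReader('parsed/mrg/wsj', r'../wsj_.*\.mrg', encoding='ascii')
fileid = reader.fileids()[0]
print(reader.tagged_words(fileid)[:5])   # (word, POS) pairs, still including empty categories
print(reader.parsed_sents(fileid)[0])    # nltk.tree.Tree for the first sentence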