Code Example #1
File: onto-testset-create.py  Project: ai-ku/uwsd
def create_files(d):
    reader = BracketParseCorpusReader(path, ".*parse")
    print >> sys.stderr, "Reader is created now"
    c = 0
    #filetypes = "pos clean-sent aw.tw sense".split()
    filetypes = "pos clean-sent sense".split()
    files = map(lambda x: gzip.open("ontonotes.%s.gz" % x, 'w'), filetypes)
    for parse_file, sentids in d.viewitems():
        parse_file = '/'.join(parse_file.split('/')[-4:])
        sentences = reader.parsed_sents(parse_file)
        for sentid, triple in sentids.viewitems():
            sentence = sentences[sentid]
            clean_sent_list = []
            clean_pos_list = []
            for word, p in sentence.pos():
                if p != '-NONE-':
                    if word in fix:
                        word = fix[word]
                    clean_sent_list.append(word)
                    clean_pos_list.append(p)
            for w, tid, senseid in triple:
                t = clean_sent_list[tid]
                p = clean_pos_list[tid]
                w = w.replace('-', '.')
                mm = "line-{}\t{}\t{}\t{}\t{}\t{}\t{}".format(c, t, c, tid, p, w, tid)
                ss = "line-{}\t{}\t{}\t{}".format(c, t, w, senseid)
                print mm
                write2file(files, [clean_pos_list, clean_sent_list, [ss]])
                c += 1
    map(lambda f: f.close(), files)
Code Example #2
    def _load(self):
        # load vocab file
        self.vocab = OrderedDict()
        with open(self.vocab_file, encoding='utf-8') as vf:
            for line in vf.readlines():
                line = line.strip()
                self.vocab[line] = len(self.vocab)

        # filter glove
        if self.pretrained_file != '' and os.path.exists(self.pretrained_file):
            glove_emb = {}
            with open(self.pretrained_file, 'r', encoding='utf-8') as pf:
                for line in pf.readlines():
                    sp = line.split(' ')
                    if sp[0].lower() in self.vocab:
                        glove_emb[sp[0].lower()] = np.array([float(x) for x in sp[1:]])
        files = ['{}.txt'.format(self.mode)]
        corpus = BracketParseCorpusReader('{}/sst'.format(self.dir), files)
        sents = corpus.parsed_sents(files[0])

        #initialize with glove
        pretrained_emb = []
        fail_cnt = 0
        for line in self.vocab.keys():
            if self.pretrained_file != '' and os.path.exists(self.pretrained_file):
                if not line.lower() in glove_emb:
                    fail_cnt += 1
                pretrained_emb.append(glove_emb.get(line.lower(), np.random.uniform(-0.05, 0.05, 300)))

        if self.pretrained_file != '' and os.path.exists(self.pretrained_file):
            self.pretrained_emb = F.tensor(np.stack(pretrained_emb, 0))
            print('Miss word in GloVe {0:.4f}'.format(1.0*fail_cnt/len(self.pretrained_emb)))
        # build trees
        for sent in sents:
            self.trees.append(self._build_tree(sent))
Code Example #3
File: trainer.py  Project: uMiss/pytorch-rnng
    def make_dataset(self, corpus: str) -> Dataset:
        reader = BracketParseCorpusReader(*os.path.split(corpus),
                                          encoding=self.encoding,
                                          detect_blocks='sexpr')
        oracles = [DiscOracle.from_tree(t) for t in reader.parsed_sents()]
        examples = [make_example(x, self.fields) for x in oracles]
        return Dataset(examples, self.fields)
Code Example #4
class Treebank:
    def __init__(self, corpus_file: str, lowercase: bool = True) -> None:
        self.corpus_file = corpus_file
        self.lowercase = lowercase
        self._reader = BracketParseCorpusReader(*os.path.split(corpus_file))

    def parsed_sentences(self) -> Iterable[Tree]:
        if self.lowercase:
            return (self.lowercase_leaves(parsed_sent)
                    for parsed_sent in self._reader.parsed_sents())
        else:
            return self._reader.parsed_sents()

    @classmethod
    def lowercase_leaves(cls, tree):
        if isinstance(tree, str):
            return tree.lower()
        return Tree(tree.label(), [cls.lowercase_leaves(child) for child in tree])
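A minimal usage sketch for the Treebank wrapper above. The corpus path below is a placeholder rather than a value from the project; parsed_sentences() yields NLTK Tree objects with lowercased leaves when lowercase=True.

# Hypothetical usage of the Treebank class defined above; the .mrg path is illustrative.
treebank = Treebank('data/train.mrg', lowercase=True)
for tree in treebank.parsed_sentences():
    print(' '.join(tree.leaves()))  # lowercased tokens of one sentence per line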
Code Example #5
File: ontonotes-stats.py  Project: ai-ku/uwsd
def process_parse_annotation():
    print >> sys.stderr, "Parsing started"
    reader = BracketParseCorpusReader(annotations_path, '.*parse')
    pos_set = set("NN VB RB JJ".split()) # word level pos tags for n, v, adv, adj.
    check_pos = lambda x: x in pos_set
    d = dd(lambda: count(0))
    for fileid in reader.fileids():
        #print fileid
        for sentence in reader.parsed_sents(fileid):
            for word, p in sentence.pos():
                pos = p[0:2]
                if p != '-NONE-' and check_pos(pos):
                    d[pos].next()
    print [(pos, c.next()) for pos, c in d.iteritems()]
Code Example #6
File: ontonotes-preprocess.py  Project: ai-ku/uwsd
def annotation_process():
    d = get_inventory_info()
    annotated_files = find_files(annotations_path, "*.sense")
    pos_file = gzip.open('on.pos.gz', 'w')
    inst_num_dict = dd(lambda: count(1))
    for num_processed, fn in enumerate(annotated_files):
        if num_processed % 1000 == 0:
            print >> sys.stderr, "{} files processed".format(num_processed)
        directory = os.path.dirname(fn)
        basename = os.path.basename(fn)
        reader = BracketParseCorpusReader(directory, basename.replace('.sense', '.parse'))
        fileid = reader.fileids()[0]
        sentences = dict()
        parsed_sents = reader.parsed_sents(fileid)
        for line in open(fn):
            line = line.split()
            tw = line[3]
            onto_sense = line[-1]
            sent_id, tok_id = int(line[1]), int(line[2])
            stuple = sentences.setdefault(sent_id, None)
            if stuple is None:
                sentence = parsed_sents[sent_id]
                clean_sent = []
                clean_pos = []
                for word, p in sentence.pos():
                    if p != '-NONE-':
                        if word in fix:
                            word = fix[word]
                        clean_sent.append(word)
                        clean_pos.append(p)
                sentences[sent_id] = (clean_sent, clean_pos)
            else:
                clean_sent, clean_pos = stuple
            lexicon_senses, version, ita = d[tw][onto_sense]
            w = tw.replace('-', '.') # following the convention of SemEval
            m = "{}\t{}.on.{}\t{}-{}-{}\t{}-{}\t{}\t{}\t{}\t{}\t{}"
            print m.format(w, w, inst_num_dict[tw].next(), line[0], sent_id, tok_id,
                w, onto_sense, lexicon_senses, version, ita, tok_id, " ".join(clean_sent))
            pos_file.write("{}\n".format(clean_pos))
    print >> sys.stderr, "{} files processed".format(num_processed)
Code Example #7
File: Utils.py  Project: Shengjie-Liu/Tree_LSTM_CS584
def text2DGL(source_file, vocab_file, embed_file, word_dim):

    # vocab(stoi): {word : index}
    vocab = OrderedDict()
    with open(vocab_file, encoding='utf-8') as vf:
        for line in vf.readlines():
            line = line.strip()
            vocab[line] = len(vocab)

    # enrich word embedding
    embedding = np.random.random((len(vocab), word_dim))
    with open(embed_file, 'r', encoding='utf-8') as pf:
        for line in pf.readlines():
            sp = line.split(' ')
            if sp[0].lower() in vocab:
                embedding[vocab[sp[0].lower()]] = np.array(
                    [float(x) for x in sp[1:]])

    # build dgl from file
    files = [source_file]
    corpus = BracketParseCorpusReader('{}'.format(""), files)
    sents = corpus.parsed_sents(files[0])
    trees = [build_tree(sent, vocab) for sent in sents]
    return trees, embedding, vocab
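A hedged usage sketch for text2DGL. The file names and the 300-dimension setting below are placeholders, not values from the project; the function expects a bracketed-parse file that BracketParseCorpusReader can read, a vocabulary file with one token per line, and a GloVe-style embedding file.

# Hypothetical call; all paths and word_dim are illustrative assumptions.
trees, embedding, vocab = text2DGL(
    source_file='train.txt',         # bracketed parse trees
    vocab_file='vocab.txt',          # one token per line
    embed_file='glove.6B.300d.txt',  # whitespace-separated word vectors
    word_dim=300)
print(len(trees), embedding.shape, len(vocab))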
Code Example #8
File: hw5_init.py  Project: pombredanne/nlp
    tree = parser.generate_parse_tree(["fish", "people", "fish", "tanks"],
                                      root_tag="S", theta=args.theta)
    tree.draw()

else:
    if args.validate:
        validation_fn = os.path.join(args.data, "validation.txt")
        open(validation_fn, "a").write(("h: {0} v: {1} theta: {2} "
                                        "maxTrain: {3} maxValid: {4}\n")
                                       .format(args.horizontal,
                                               args.vertical,
                                               args.theta,
                                               args.maxTrain, args.maxValid))

        evaluator = hw5.Evaluator(["ROOT", "TOP"], ["''", "``", ".", ":", ","])
        for gold in out_of_domain.parsed_sents():
            sentence = map(unicode, gold.leaves())
            if len(sentence) > args.maxValid:
                continue
            guess = parser.generate_parse_tree(sentence, theta=args.theta)
            guess.un_chomsky_normal_form()
            evaluator(guess, gold)
            print("F1 = {0}".format(evaluator.get_f1()))
        open(validation_fn, "a").write("out-of-domain: {0}\n"
                                       .format(evaluator.get_f1()))

        evaluator = hw5.Evaluator(["ROOT", "TOP"], ["''", "``", ".", ":", ","])
        for gold in in_domain.parsed_sents():
            sentence = map(unicode, gold.leaves())
            if len(sentence) > args.maxValid:
                continue
Code Example #9
File: ptb.py  Project: fbuijs/educe
class PtbParser(object):
    """Gold parser that gets annotations from the PTB.

    It uses an instantiated NLTK BracketParseCorpusReader
    for the PTB section relevant to the RST DT corpus.

    Note that the path you give to this will probably end with
    something like `parsed/mrg/wsj`
    """

    def __init__(self, corpus_dir):
        """ """
        self.reader = BracketParseCorpusReader(corpus_dir,
                                               r'../wsj_.*\.mrg',
                                               encoding='ascii')

    def tokenize(self, doc):
        """Tokenize the document text using the PTB gold annotation.

        Return a tokenized document.
        """
        # get tokens from PTB
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get doc text
        # here we cheat and get it from the RST-DT tree
        # was: rst_text = doc.orig_rsttree.text()
        rst_text = doc.text
        tagged_tokens = self.reader.tagged_words(ptb_name)
        # tweak tokens THEN filter empty nodes
        tweaked1, tweaked2 =\
            itertools.tee(_tweak_token(ptb_name)(i, tok) for i, tok in
                          enumerate(tagged_tokens)
                          if not is_empty_category(tok[1]))
        spans = generic_token_spans(rst_text, tweaked1,
                                    txtfn=lambda x: x.tweaked_word)
        result = [_mk_token(t, s) for t, s in izip(tweaked2, spans)]

        # store in doc
        doc.tkd_tokens.extend(result)

        return doc

    def parse(self, doc):
        """
        Given a document, return a list of educified PTB parse trees
        (one per sentence).

        These are almost the same as the trees that would be returned by the
        `parsed_sents` method, except that each leaf/node is
        associated with a span within the RST DT text.

        Note: does nothing if there is no associated PTB corpus entry.
        """
        # get PTB trees
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get tokens from tokenized document
        # FIXME alignment/reconstruction should never have to deal
        # with the left padding token in the first place
        doc_tokens = doc.tkd_tokens[1:]  # skip left padding token
        tokens_iter = iter(doc_tokens)

        trees = []
        lex_heads = []
        for tree in self.reader.parsed_sents(ptb_name):
            # apply standard cleaning to tree
            # strip function tags, remove empty nodes
            tree_no_empty = prune_tree(tree, is_non_empty)
            tree_no_empty_no_gf = transform_tree(tree_no_empty,
                                                 strip_subcategory)
            #
            leaves = tree_no_empty_no_gf.leaves()
            tslice = itertools.islice(tokens_iter, len(leaves))
            clean_tree = ConstituencyTree.build(tree_no_empty_no_gf,
                                                tslice)
            trees.append(clean_tree)

            # lexicalize the PTB tree: find the head word of each constituent
            # constituents and their heads are designated by their Gorn address
            # ("tree position" in NLTK) in the tree
            lheads = find_lexical_heads(clean_tree)
            lex_heads.append(lheads)

        # store trees in doc
        doc.tkd_trees.extend(trees)
        # store lexical heads in doc
        # TODO move to DocumentPlus
        doc.lex_heads = []
        doc.lex_heads.append(None)
        # end TODO
        doc.lex_heads.extend(lex_heads)

        return doc
Code Example #10
class PtbParser(object):
    """Gold parser that gets annotations from the PTB.

    It uses an instantiated NLTK BracketParseCorpusReader
    for the PTB section relevant to the RST DT corpus.

    Note that the path you give to this will probably end with
    something like `parsed/mrg/wsj`
    """
    def __init__(self, corpus_dir):
        """ """
        self.reader = BracketParseCorpusReader(corpus_dir,
                                               r'../wsj_.*\.mrg',
                                               encoding='ascii')

    def tokenize(self, doc):
        """Tokenize the document text using the PTB gold annotation.

        Parameters
        ----------
        doc: DocumentPlus
            Rich representation of the document.

        Returns
        -------
        doc: DocumentPlus
            Rich representation of the document, with tokenization.
        """
        # get tokens from PTB
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get doc text
        # here we cheat and get it from the RST-DT tree
        # was: rst_text = doc.orig_rsttree.text()
        rst_text = doc.text
        tagged_tokens = self.reader.tagged_words(ptb_name)
        # tweak tokens THEN filter empty nodes
        tweaked1, tweaked2 =\
            itertools.tee(_tweak_token(ptb_name)(i, tok) for i, tok in
                          enumerate(tagged_tokens)
                          if not is_empty_category(tok[1]))
        spans = generic_token_spans(rst_text,
                                    tweaked1,
                                    txtfn=lambda x: x.tweaked_word)
        result = [_mk_token(t, s) for t, s in izip(tweaked2, spans)]

        # store in doc
        doc.set_tokens(result)

        return doc

    def parse(self, doc):
        """Parse a document, using the gold PTB annotation.

        Given a document, return a list of educified PTB parse trees
        (one per sentence).

        These are almost the same as the trees that would be returned by the
        `parsed_sents` method, except that each leaf/node is
        associated with a span within the RST DT text.

        Note: does nothing if there is no associated PTB corpus entry.

        Parameters
        ----------
        doc: DocumentPlus
            Rich representation of the document.

        Returns
        -------
        doc: DocumentPlus
            Rich representation of the document, with syntactic
            constituency trees.
        """
        # get PTB trees
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get tokens from tokenized document
        # FIXME alignment/reconstruction should never have to deal
        # with the left padding token in the first place
        doc_tokens = doc.tkd_tokens[1:]  # skip left padding token
        tokens_iter = iter(doc_tokens)

        trees = []
        lex_heads = []
        for tree in self.reader.parsed_sents(ptb_name):
            # apply standard cleaning to tree
            # strip function tags, remove empty nodes
            tree_no_empty = prune_tree(tree, is_non_empty)
            tree_no_empty_no_gf = transform_tree(tree_no_empty,
                                                 strip_subcategory)
            #
            leaves = tree_no_empty_no_gf.leaves()
            tslice = itertools.islice(tokens_iter, len(leaves))
            clean_tree = ConstituencyTree.build(tree_no_empty_no_gf, tslice)
            trees.append(clean_tree)

            # lexicalize the PTB tree: find the head word of each constituent
            # constituents and their heads are designated by their Gorn address
            # ("tree position" in NLTK) in the tree
            lheads = find_lexical_heads(clean_tree)
            lex_heads.append(lheads)

        # store trees in doc
        doc.set_syn_ctrees(trees, lex_heads=lex_heads)

        return doc
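Based on the docstring above (the corpus path should end in something like parsed/mrg/wsj), here is a hedged sketch of how PtbParser might be driven. The PTB location, the helper name, and the doc argument (a DocumentPlus produced elsewhere in the educe RST-DT pipeline) are assumptions, not shown in the source.

def annotate_with_ptb(doc, ptb_dir='/corpora/ptb3/parsed/mrg/wsj'):
    # Hypothetical helper: `doc` is a DocumentPlus built elsewhere in educe,
    # and ptb_dir is an assumed PTB location matching the docstring's hint.
    parser = PtbParser(ptb_dir)
    doc = parser.tokenize(doc)  # attach gold PTB tokens
    doc = parser.parse(doc)     # attach constituency trees and lexical heads
    return doc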