Example #1
    def _load(self):
        # load vocab file
        self.vocab = OrderedDict()
        with open(self.vocab_file, encoding='utf-8') as vf:
            for line in vf.readlines():
                line = line.strip()
                self.vocab[line] = len(self.vocab)

        # keep only the GloVe vectors for words that appear in the vocabulary
        if self.pretrained_file != '' and os.path.exists(self.pretrained_file):
            glove_emb = {}
            with open(self.pretrained_file, 'r', encoding='utf-8') as pf:
                for line in pf.readlines():
                    sp = line.split(' ')
                    if sp[0].lower() in self.vocab:
                        glove_emb[sp[0].lower()] = np.array([float(x) for x in sp[1:]])
        files = ['{}.txt'.format(self.mode)]
        corpus = BracketParseCorpusReader('{}/sst'.format(self.dir), files)
        sents = corpus.parsed_sents(files[0])

        # initialize embeddings with GloVe; words missing from GloVe get a random vector
        pretrained_emb = []
        fail_cnt = 0
        for line in self.vocab.keys():
            if self.pretrained_file != '' and os.path.exists(self.pretrained_file):
                if not line.lower() in glove_emb:
                    fail_cnt += 1
                pretrained_emb.append(glove_emb.get(line.lower(), np.random.uniform(-0.05, 0.05, 300)))

        if self.pretrained_file != '' and os.path.exists(self.pretrained_file):
            self.pretrained_emb = F.tensor(np.stack(pretrained_emb, 0))
            print('Fraction of vocab missing from GloVe: {0:.4f}'.format(1.0 * fail_cnt / len(self.pretrained_emb)))
        # build trees
        for sent in sents:
            self.trees.append(self._build_tree(sent))
Example #2
    def make_dataset(self, corpus: str) -> Dataset:
        reader = BracketParseCorpusReader(*os.path.split(corpus),
                                          encoding=self.encoding,
                                          detect_blocks='sexpr')
        oracles = [DiscOracle.from_tree(t) for t in reader.parsed_sents()]
        examples = [make_example(x, self.fields) for x in oracles]
        return Dataset(examples, self.fields)
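
Examples #2, #6, and #15 all build the reader with *os.path.split(path), unpacking a single file path into the (root, fileid) pair that BracketParseCorpusReader expects. A minimal, self-contained sketch of that idiom (the file path is hypothetical):

import os
from nltk.corpus.reader import BracketParseCorpusReader

corpus_path = 'data/train.mrg'            # hypothetical treebank file of bracketed parses
root, name = os.path.split(corpus_path)
reader = BracketParseCorpusReader(root, [name])
for tree in reader.parsed_sents():
    print(' '.join(tree.leaves()))
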
Example #3
def create_files(d):
    reader = BracketParseCorpusReader(path, ".*parse")
    print >> sys.stderr, "Reader is created now"
    c = 0
    #filetypes = "pos clean-sent aw.tw sense".split()
    filetypes = "pos clean-sent sense".split()
    files = map(lambda x: gzip.open("ontonotes.%s.gz" % x, 'w'), filetypes)
    for parse_file, sentids in d.viewitems():
        parse_file = '/'.join(parse_file.split('/')[-4:])
        sentences = reader.parsed_sents(parse_file)
        for sentid, triple in sentids.viewitems():
            sentence = sentences[sentid]
            clean_sent_list = []
            clean_pos_list = []
            for word, p in sentence.pos():
                if p != '-NONE-':
                    if word in fix:
                        word = fix[word]
                    clean_sent_list.append(word)
                    clean_pos_list.append(p)
            for w, tid, senseid in triple:
                t = clean_sent_list[tid]
                p = clean_pos_list[tid]
                w = w.replace('-', '.')
                mm = "line-{}\t{}\t{}\t{}\t{}\t{}\t{}".format(c, t, c, tid, p, w, tid)
                ss = "line-{}\t{}\t{}\t{}".format(c, t, w, senseid)
                print mm
                write2file(files, [clean_pos_list, clean_sent_list, [ss]])
                c += 1
    map(lambda f: f.close(), files)
Example #4
def get_stats_from_snli_dataset(files, tagset=("NN", "NNS"), use_lemmas=False):

    lemmatizer = None
    if use_lemmas:
        lemmatizer = WordNetLemmatizer()

    stats = dd(int)
    num_of_token = 0

    for filename in files:
        # collect the bracketed parses into a temp file the corpus reader can open
        f = NamedTemporaryFile(mode='w', dir='/tmp')
        fields_to_read = {"sentence1_parse", "sentence2_parse"}
        for sent in open(filename):
            sent = ujson.loads(sent)
            for field in fields_to_read:
                f.write("%s\n" % sent[field])
        f.flush()  # ensure the parses are on disk before the reader opens the file

        reader = BracketParseCorpusReader("/tmp", os.path.basename(f.name))
        for word, tag in reader.tagged_words():
            if tagset is None or tag in tagset:
                if use_lemmas:
                    word = lemmatizer.lemmatize(word, pos=tag.lower()[0])
                stats[word] += 1
                num_of_token += 1

    return stats, num_of_token
Example #5
def process_parse_annotation():
    print >> sys.stderr, "Parsing started"
    reader = BracketParseCorpusReader(annotations_path, '.*parse')
    pos_set = set("NN VB RB JJ".split()) # word level pos tags for n, v, adv, adj.
    check_pos = lambda x: x in pos_set
    d = dd(lambda: count(0))
    for fileid in reader.fileids():
        #print fileid
        for sentence in reader.parsed_sents(fileid):
            for word, p in sentence.pos():
                pos = p[0:2]
                if p != '-NONE-' and check_pos(pos):
                    d[pos].next()
    print [(pos, c.next()) for pos, c in d.iteritems()]
Example #6
class Treebank:
    def __init__(self, corpus_file: str, lowercase: bool = True) -> None:
        self.corpus_file = corpus_file
        self.lowercase = lowercase
        self._reader = BracketParseCorpusReader(*os.path.split(corpus_file))

    def parsed_sentences(self) -> Iterable[Tree]:
        if self.lowercase:
            return (self.lowercase_leaves(parsed_sent)
                    for parsed_sent in self._reader.parsed_sents())
        else:
            return self._reader.parsed_sents()

    @classmethod
    def lowercase_leaves(cls, tree):
        if isinstance(tree, str):
            return tree.lower()
        return Tree(tree.label(), [cls.lowercase_leaves(child) for child in tree])
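
A hypothetical usage of the Treebank wrapper above (the corpus file name is an assumption):

treebank = Treebank('data/dev.mrg', lowercase=True)
for tree in treebank.parsed_sentences():
    print(tree.label(), ' '.join(tree.leaves()))   # leaves come back lowercased
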
Example #7
File: ptb.py Project: tjane/educe
def reader(corpus_dir):
    """
    An instantiated NLTK BracketParseCorpusReader for the PTB
    section relevant to the PDTB corpus.

    Note that the path you give to this will probably end with
    something like `parsed/mrg/wsj`
    """
    return BracketParseCorpusReader(corpus_dir,
                                    r'../wsj_.*\.mrg',
                                    encoding='ascii')
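
The fileid pattern r'../wsj_.*\.mrg' relies on the standard PTB layout: the two dots match the two-digit section directories under parsed/mrg/wsj. A sketch of how the returned reader would typically be used (the corpus path and the printed fileids are illustrative, not actual output):

ptb = reader('/corpora/PTBIII/parsed/mrg/wsj')
print(ptb.fileids()[:2])                  # e.g. ['00/wsj_0001.mrg', '00/wsj_0002.mrg']
for tree in ptb.parsed_sents('00/wsj_0001.mrg'):
    print(tree)
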
Example #8
def annotation_process():
    d = get_inventory_info()
    annotated_files = find_files(annotations_path, "*.sense")
    pos_file = gzip.open('on.pos.gz', 'w')
    inst_num_dict = dd(lambda: count(1))
    for num_processed, fn in enumerate(annotated_files):
        if num_processed % 1000 == 0:
            print >> sys.stderr, "{} files processed".format(num_processed)
        directory = os.path.dirname(fn)
        basename = os.path.basename(fn)
        reader = BracketParseCorpusReader(directory, basename.replace('.sense', '.parse'))
        fileid = reader.fileids()[0]
        sentences = dict()
        parsed_sents = reader.parsed_sents(fileid)
        for line in open(fn):
            line = line.split()
            tw = line[3]
            onto_sense = line[-1]
            sent_id, tok_id = int(line[1]), int(line[2])
            stuple = sentences.setdefault(sent_id, None)
            if stuple is None:
                sentence = parsed_sents[sent_id]
                clean_sent = []
                clean_pos = []
                for word, p in sentence.pos():
                    if p != '-NONE-':
                        if word in fix:
                            word = fix[word]
                        clean_sent.append(word)
                        clean_pos.append(p)
                sentences[sent_id] = (clean_sent, clean_pos)
            else:
                clean_sent, clean_pos = stuple
            lexicon_senses, version, ita = d[tw][onto_sense]
            w = tw.replace('-', '.') # following the convention of SemEval
            m = "{}\t{}.on.{}\t{}-{}-{}\t{}-{}\t{}\t{}\t{}\t{}\t{}"
            print m.format(w, w, inst_num_dict[tw].next(), line[0], sent_id, tok_id,
                w, onto_sense, lexicon_senses, version, ita, tok_id, " ".join(clean_sent))
            pos_file.write("{}\n".format(clean_pos))
    print >> sys.stderr, "{} files processed".format(num_processed + 1)
Example #9
def text2DGL(source_file, vocab_file, embed_file, word_dim):

    # vocab(stoi): {word : index}
    vocab = OrderedDict()
    with open(vocab_file, encoding='utf-8') as vf:
        for line in vf.readlines():
            line = line.strip()
            vocab[line] = len(vocab)

    # enrich word embedding
    embedding = np.random.random((len(vocab), word_dim))
    with open(embed_file, 'r', encoding='utf-8') as pf:
        for line in pf.readlines():
            sp = line.split(' ')
            if sp[0].lower() in vocab:
                embedding[vocab[sp[0].lower()]] = np.array(
                    [float(x) for x in sp[1:]])

    # build DGL trees from the bracketed parse file
    files = [source_file]
    corpus = BracketParseCorpusReader('', files)  # empty root: source_file is used as given
    sents = corpus.parsed_sents(files[0])
    trees = [build_tree(sent, vocab) for sent in sents]
    return trees, embedding, vocab
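
A hypothetical call to text2DGL above (file names and embedding dimensionality are assumptions):

trees, embedding, vocab = text2DGL('trees/train.txt', 'vocab.txt',
                                   'glove.6B.300d.txt', word_dim=300)
print(len(trees), embedding.shape)        # one tree per sentence, |vocab| x 300 embedding matrix
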
Example #10
# WIP
# dirty, almost copies from educe.rst_dt.ptb.PtbParser...
# TODO go and fix educe.rst_dt.{ptb, corenlp}
PTB_DIR = os.path.join(
    os.path.dirname(__file__),
    '..',
    '..',
    'data',  # alt: '..', '..', 'corpora',
    'PTBIII',
    'parsed',
    'mrg',
    'wsj')
# FIXME this fails when PTB_DIR does not exist ;
# I need to find a clean way to address this
PTB_READER = BracketParseCorpusReader(PTB_DIR,
                                      r'../wsj_.*\.mrg',
                                      encoding='ascii')


def tokenize_doc_ptb(doc_id, doc_text):
    """Dirty PTB tokenizer"""
    ptb_name = _guess_ptb_name(doc_id)
    if ptb_name is None:
        return None

    # get doc text
    # here we cheat and get it from the RST-DT tree
    # was: rst_text = doc.orig_rsttree.text()
    rst_text = doc_text
    tagged_tokens = PTB_READER.tagged_words(ptb_name)
    # tweak tokens THEN filter empty nodes
Example #11
        ("PP", "P NP", 1.0),
    ])
    lexicon = hw5.MiniLexicon([
        ("N", "people", 0.5),
        ("N", "fish", 0.2),
        ("N", "tanks", 0.2),
        ("N", "rods", 0.1),
        ("V", "people", 0.1),
        ("V", "fish", 0.6),
        ("V", "tanks", 0.3),
        ("P", "with", 1.0),
    ])

else:
    print("Loading training trees")
    train_corpus = BracketParseCorpusReader(args.data,
                                            "en-wsj-train.1.mrg")

    if args.validate:
        print("Loading in-domain validation trees")
        in_domain = BracketParseCorpusReader(args.data,
                                             "en-wsj-dev.2.mrg")

        print("Loading out-of-domain validation trees")
        out_of_domain = BracketParseCorpusReader(args.data,
                                                 "en-web-dev.3.mrg")

    if args.test:
        print("Loading test sentences")
        test_sentences = [line.strip().split()
                          for line in open(os.path.join(args.data,
                                                        "en-web-weblogs-test"
Example #12
#usage: hw4_topcfg.sh <treebank_filename> <output_PCFG_file>
if __name__ == "__main__":
    PATH_TRAIN = sys.argv[1]
    out = sys.argv[2]

    match = re.search("(?s:.*)/", PATH_TRAIN)
    if match:
        DIR_TRAIN = match.group(0)
    else:
        DIR_TRAIN = os.getcwd()

    # read in the parsed corpus (raw text for node counts, trees for rule counts)
    with open(PATH_TRAIN) as f:
        data = f.read()
    parsed_data = BracketParseCorpusReader(DIR_TRAIN,
                                           os.path.basename(PATH_TRAIN)).parsed_sents()

    # get counts of all non-terminals
    counts_nodes = Counter(re.findall(r"\(([A-Z_]+) ", data))

    # get counts of all rules
    list_counts_rules = []
    for sent in parsed_data:
        traverse_tree(sent, list_counts_rules)
    counts_rules = Counter(list_counts_rules)

    prob_rules = dict()
    for rule in counts_rules:
        node = re.findall("([A-Z_]+)", rule)[0]
        prob_rules[rule] = counts_rules[rule] / counts_nodes[node]
Example #13
File: ptb.py Project: fbuijs/educe
    def __init__(self, corpus_dir):
        """ """
        self.reader = BracketParseCorpusReader(corpus_dir,
                                               r'../wsj_.*\.mrg',
                                               encoding='ascii')
Example #14
File: ptb.py Project: fbuijs/educe
class PtbParser(object):
    """Gold parser that gets annotations from the PTB.

    It uses an instantiated NLTK BracketParseCorpusReader
    for the PTB section relevant to the RST DT corpus.

    Note that the path you give to this will probably end with
    something like `parsed/mrg/wsj`
    """

    def __init__(self, corpus_dir):
        """ """
        self.reader = BracketParseCorpusReader(corpus_dir,
                                               r'../wsj_.*\.mrg',
                                               encoding='ascii')

    def tokenize(self, doc):
        """Tokenize the document text using the PTB gold annotation.

        Return a tokenized document.
        """
        # get tokens from PTB
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get doc text
        # here we cheat and get it from the RST-DT tree
        # was: rst_text = doc.orig_rsttree.text()
        rst_text = doc.text
        tagged_tokens = self.reader.tagged_words(ptb_name)
        # tweak tokens THEN filter empty nodes
        tweaked1, tweaked2 =\
            itertools.tee(_tweak_token(ptb_name)(i, tok) for i, tok in
                          enumerate(tagged_tokens)
                          if not is_empty_category(tok[1]))
        spans = generic_token_spans(rst_text, tweaked1,
                                    txtfn=lambda x: x.tweaked_word)
        result = [_mk_token(t, s) for t, s in izip(tweaked2, spans)]

        # store in doc
        doc.tkd_tokens.extend(result)

        return doc

    def parse(self, doc):
        """
        Given a document, return a list of educified PTB parse trees
        (one per sentence).

        These are almost the same as the trees that would be returned by the
        `parsed_sents` method, except that each leaf/node is
        associated with a span within the RST DT text.

        Note: does nothing if there is no associated PTB corpus entry.
        """
        # get PTB trees
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get tokens from tokenized document
        # FIXME alignment/reconstruction should never have to deal
        # with the left padding token in the first place
        doc_tokens = doc.tkd_tokens[1:]  # skip left padding token
        tokens_iter = iter(doc_tokens)

        trees = []
        lex_heads = []
        for tree in self.reader.parsed_sents(ptb_name):
            # apply standard cleaning to tree
            # strip function tags, remove empty nodes
            tree_no_empty = prune_tree(tree, is_non_empty)
            tree_no_empty_no_gf = transform_tree(tree_no_empty,
                                                 strip_subcategory)
            #
            leaves = tree_no_empty_no_gf.leaves()
            tslice = itertools.islice(tokens_iter, len(leaves))
            clean_tree = ConstituencyTree.build(tree_no_empty_no_gf,
                                                tslice)
            trees.append(clean_tree)

            # lexicalize the PTB tree: find the head word of each constituent
            # constituents and their heads are designated by their Gorn address
            # ("tree position" in NLTK) in the tree
            lheads = find_lexical_heads(clean_tree)
            lex_heads.append(lheads)

        # store trees in doc
        doc.tkd_trees.extend(trees)
        # store lexical heads in doc
        # TODO move to DocumentPlus
        doc.lex_heads = []
        doc.lex_heads.append(None)
        # end TODO
        doc.lex_heads.extend(lex_heads)

        return doc
Example #15
    def __init__(self, corpus_file: str, lowercase: bool = True) -> None:
        self.corpus_file = corpus_file
        self.lowercase = lowercase
        self._reader = BracketParseCorpusReader(*os.path.split(corpus_file))
Example #16
    def __init__(self, corpus_dir):
        """ """
        self.reader = BracketParseCorpusReader(corpus_dir,
                                               r'../wsj_.*\.mrg',
                                               encoding='ascii')
Example #17
class PtbParser(object):
    """Gold parser that gets annotations from the PTB.

    It uses an instantiated NLTK BracketParseCorpusReader
    for the PTB section relevant to the RST DT corpus.

    Note that the path you give to this will probably end with
    something like `parsed/mrg/wsj`
    """
    def __init__(self, corpus_dir):
        """ """
        self.reader = BracketParseCorpusReader(corpus_dir,
                                               r'../wsj_.*\.mrg',
                                               encoding='ascii')

    def tokenize(self, doc):
        """Tokenize the document text using the PTB gold annotation.

        Parameters
        ----------
        doc: DocumentPlus
            Rich representation of the document.

        Returns
        -------
        doc: DocumentPlus
            Rich representation of the document, with tokenization.
        """
        # get tokens from PTB
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get doc text
        # here we cheat and get it from the RST-DT tree
        # was: rst_text = doc.orig_rsttree.text()
        rst_text = doc.text
        tagged_tokens = self.reader.tagged_words(ptb_name)
        # tweak tokens THEN filter empty nodes
        tweaked1, tweaked2 =\
            itertools.tee(_tweak_token(ptb_name)(i, tok) for i, tok in
                          enumerate(tagged_tokens)
                          if not is_empty_category(tok[1]))
        spans = generic_token_spans(rst_text,
                                    tweaked1,
                                    txtfn=lambda x: x.tweaked_word)
        result = [_mk_token(t, s) for t, s in izip(tweaked2, spans)]

        # store in doc
        doc.set_tokens(result)

        return doc

    def parse(self, doc):
        """Parse a document, using the gold PTB annotation.

        Given a document, return a list of educified PTB parse trees
        (one per sentence).

        These are almost the same as the trees that would be returned by the
        `parsed_sents` method, except that each leaf/node is
        associated with a span within the RST DT text.

        Note: does nothing if there is no associated PTB corpus entry.

        Parameters
        ----------
        doc: DocumentPlus
            Rich representation of the document.

        Returns
        -------
        doc: DocumentPlus
            Rich representation of the document, with syntactic
            constituency trees.
        """
        # get PTB trees
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get tokens from tokenized document
        # FIXME alignment/reconstruction should never have to deal
        # with the left padding token in the first place
        doc_tokens = doc.tkd_tokens[1:]  # skip left padding token
        tokens_iter = iter(doc_tokens)

        trees = []
        lex_heads = []
        for tree in self.reader.parsed_sents(ptb_name):
            # apply standard cleaning to tree
            # strip function tags, remove empty nodes
            tree_no_empty = prune_tree(tree, is_non_empty)
            tree_no_empty_no_gf = transform_tree(tree_no_empty,
                                                 strip_subcategory)
            #
            leaves = tree_no_empty_no_gf.leaves()
            tslice = itertools.islice(tokens_iter, len(leaves))
            clean_tree = ConstituencyTree.build(tree_no_empty_no_gf, tslice)
            trees.append(clean_tree)

            # lexicalize the PTB tree: find the head word of each constituent
            # constituents and their heads are designated by their Gorn address
            # ("tree position" in NLTK) in the tree
            lheads = find_lexical_heads(clean_tree)
            lex_heads.append(lheads)

        # store trees in doc
        doc.set_syn_ctrees(trees, lex_heads=lex_heads)

        return doc