Example 1
    def _load(self):
        # load the vocabulary file: one token per line, mapped to its row index
        self.vocab = OrderedDict()
        with open(self.vocab_file, encoding='utf-8') as vf:
            for line in vf:
                self.vocab[line.strip()] = len(self.vocab)

        # keep only the GloVe vectors for words that appear in the vocabulary
        use_glove = self.pretrained_file != '' and os.path.exists(self.pretrained_file)
        if use_glove:
            glove_emb = {}
            with open(self.pretrained_file, 'r', encoding='utf-8') as pf:
                for line in pf:
                    sp = line.split(' ')
                    if sp[0].lower() in self.vocab:
                        glove_emb[sp[0].lower()] = np.array([float(x) for x in sp[1:]])

        # read the bracketed SST parses for the current split (train/dev/test)
        files = ['{}.txt'.format(self.mode)]
        corpus = BracketParseCorpusReader('{}/sst'.format(self.dir), files)
        sents = corpus.parsed_sents(files[0])

        # initialize the embedding matrix with GloVe; vocabulary words missing
        # from GloVe fall back to random uniform 300-dim vectors
        if use_glove:
            pretrained_emb = []
            fail_cnt = 0
            for word in self.vocab:
                if word.lower() not in glove_emb:
                    fail_cnt += 1
                pretrained_emb.append(glove_emb.get(word.lower(), np.random.uniform(-0.05, 0.05, 300)))
            self.pretrained_emb = F.tensor(np.stack(pretrained_emb, 0))
            print('Fraction of words missing from GloVe: {0:.4f}'.format(1.0 * fail_cnt / len(self.pretrained_emb)))

        # build a tree structure for every parsed sentence
        for sent in sents:
            self.trees.append(self._build_tree(sent))
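The stacked matrix in self.pretrained_emb is typically used to seed an embedding layer. A minimal sketch, assuming DGL's PyTorch backend (so F.tensor returns a torch.Tensor) and a hypothetical dataset instance of this class:

import torch.nn as nn

# `dataset` is a hypothetical instance whose _load() has already run
emb_layer = nn.Embedding.from_pretrained(dataset.pretrained_emb, freeze=False)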
Example 2
    def make_dataset(self, corpus: str) -> Dataset:
        # split the corpus path into (root, fileid) for the reader;
        # 'sexpr' block detection reads one s-expression (tree) at a time
        reader = BracketParseCorpusReader(*os.path.split(corpus),
                                          encoding=self.encoding,
                                          detect_blocks='sexpr')
        # turn each parse tree into a transition oracle, then into examples
        oracles = [DiscOracle.from_tree(t) for t in reader.parsed_sents()]
        examples = [make_example(x, self.fields) for x in oracles]
        return Dataset(examples, self.fields)
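Hypothetical usage; TreebankLoader, FIELDS, and the path are placeholders for whatever class and fields surround this method in the original project:

loader = TreebankLoader(encoding='utf-8', fields=FIELDS)  # hypothetical names
train_data = loader.make_dataset('data/train.mrg')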
Example 3
import os
from collections import defaultdict as dd
from tempfile import NamedTemporaryFile

import ujson
from nltk.corpus.reader import BracketParseCorpusReader
from nltk.stem import WordNetLemmatizer


def get_stats_from_snli_dataset(files, tagset=("NN", "NNS"), use_lemmas=False):

    lemmatizer = None
    if use_lemmas:
        lemmatizer = WordNetLemmatizer()

    stats = dd(int)
    num_of_token = 0

    for filename in files:
        # dump the bracketed parses into a temp file for the corpus reader;
        # text mode is required for str writes, and the buffer must be
        # flushed before the reader opens the file
        f = NamedTemporaryFile(mode="w", encoding="utf-8")
        fields_to_read = {"sentence1_parse", "sentence2_parse"}
        with open(filename) as lines:
            for sent in lines:
                sent = ujson.loads(sent)
                for field in fields_to_read:
                    f.write("%s\n" % sent[field])
        f.flush()

        reader = BracketParseCorpusReader(*os.path.split(f.name))
        for word, tag in reader.tagged_words():
            if tagset is None or tag in tagset:
                if use_lemmas:
                    # map the PTB tag to a WordNet POS ('n' for NN/NNS)
                    word = lemmatizer.lemmatize(word, pos=tag.lower()[0])
                stats[word] += 1
                num_of_token += 1

    return stats, num_of_token
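A hypothetical call; SNLI is distributed as JSONL, one object per line with sentence1_parse and sentence2_parse fields (the path is a placeholder):

stats, total = get_stats_from_snli_dataset(["snli_1.0_dev.jsonl"])
print(total, sorted(stats.items(), key=lambda kv: -kv[1])[:10])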
Example 4
File: ptb.py Project: tjane/educe
def reader(corpus_dir):
    """
    Return an instantiated NLTK BracketParseCorpusReader for the PTB
    section relevant to the PDTB corpus.

    Note that the path you give to this will probably end with
    something like `parsed/mrg/wsj`.
    """
    return BracketParseCorpusReader(corpus_dir,
                                    r'../wsj_.*\.mrg',
                                    encoding='ascii')
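Hypothetical usage, with a placeholder corpus path:

ptb = reader('corpora/PTBIII/parsed/mrg/wsj')
tree = next(iter(ptb.parsed_sents()))  # first parse as an nltk.tree.Tree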
Example 5
from collections import OrderedDict

import numpy as np
from nltk.corpus.reader import BracketParseCorpusReader


def text2DGL(source_file, vocab_file, embed_file, word_dim):

    # vocab (stoi): {word: index}, one word per line, in file order
    vocab = OrderedDict()
    with open(vocab_file, encoding='utf-8') as vf:
        for line in vf:
            vocab[line.strip()] = len(vocab)

    # start from random vectors, then overwrite the rows of words that
    # appear in the pretrained embedding file
    embedding = np.random.random((len(vocab), word_dim))
    with open(embed_file, 'r', encoding='utf-8') as pf:
        for line in pf:
            sp = line.split(' ')
            if sp[0].lower() in vocab:
                embedding[vocab[sp[0].lower()]] = np.array(
                    [float(x) for x in sp[1:]])

    # build DGL trees from the bracketed parse file; with an empty root the
    # fileid is resolved relative to the working directory
    # (build_tree is assumed to be defined elsewhere in the module)
    files = [source_file]
    corpus = BracketParseCorpusReader('', files)
    sents = corpus.parsed_sents(files[0])
    trees = [build_tree(sent, vocab) for sent in sents]
    return trees, embedding, vocab
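A hypothetical call; the file names and the 300-dim embedding file are placeholders:

trees, embedding, vocab = text2DGL('sst/dev.txt', 'vocab.txt',
                                   'glove.840B.300d.txt', 300)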
Example 6
# usage: hw4_topcfg.sh <treebank_filename> <output_PCFG_file>
if __name__ == "__main__":
    PATH_TRAIN = sys.argv[1]
    out = sys.argv[2]

    # directory and filename of the training treebank
    DIR_TRAIN = os.path.dirname(PATH_TRAIN) or os.getcwd()
    FILE_TRAIN = os.path.basename(PATH_TRAIN)

    # read in the parsed corpus, both as raw text and as parse trees
    with open(PATH_TRAIN) as f:
        data = f.read()
    parsed_data = BracketParseCorpusReader(DIR_TRAIN,
                                           FILE_TRAIN).parsed_sents()

    # get counts of all non-terminals
    counts_nodes = Counter(re.findall(r"\(([A-Z_]+) ", data))

    # get counts of all rules
    list_counts_rules = []
    for sent in parsed_data:
        traverse_tree(sent, list_counts_rules)
    counts_rules = Counter(list_counts_rules)

    # MLE estimate: P(rule) = count(rule) / count(left-hand-side node)
    prob_rules = dict()
    for rule in counts_rules:
        node = re.findall(r"([A-Z_]+)", rule)[0]
        prob_rules[rule] = counts_rules[rule] / counts_nodes[node]
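The snippet stops before writing anything to out; a minimal completion, assuming one rule per line with its probability (the assignment's exact output format is not shown here):

    # hypothetical output format: "<rule>\t<probability>" per line
    with open(out, "w") as fout:
        for rule, prob in sorted(prob_rules.items()):
            fout.write("{}\t{}\n".format(rule, prob))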
Example 7
# WIP
# dirty; mostly copied from educe.rst_dt.ptb.PtbParser...
# TODO go and fix educe.rst_dt.{ptb, corenlp}
PTB_DIR = os.path.join(
    os.path.dirname(__file__),
    '..',
    '..',
    'data',  # alt: '..', '..', 'corpora',
    'PTBIII',
    'parsed',
    'mrg',
    'wsj')
# FIXME: this fails when PTB_DIR does not exist;
# I need to find a clean way to address this
PTB_READER = BracketParseCorpusReader(PTB_DIR,
                                      r'../wsj_.*\.mrg',
                                      encoding='ascii')
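
One possible way to address the FIXME above (a sketch, not the project's actual fix) is to only build the reader when the directory exists:

PTB_READER = (BracketParseCorpusReader(PTB_DIR,
                                       r'../wsj_.*\.mrg',
                                       encoding='ascii')
              if os.path.isdir(PTB_DIR)
              else None)  # callers must then handle the None case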


def tokenize_doc_ptb(doc_id, doc_text):
    """Dirty PTB tokenizer"""
    ptb_name = _guess_ptb_name(doc_id)
    if ptb_name is None:
        return None

    # get doc text
    # here we cheat and get it from the RST-DT tree
    # was: rst_text = doc.orig_rsttree.text()
    rst_text = doc_text
    tagged_tokens = PTB_READER.tagged_words(ptb_name)
    # tweak tokens THEN filter empty nodes
Example 8
    def __init__(self, corpus_dir):
        """Wrap a BracketParseCorpusReader over the PTB `.mrg` files."""
        self.reader = BracketParseCorpusReader(corpus_dir,
                                               r'../wsj_.*\.mrg',
                                               encoding='ascii')
Example 9
    def __init__(self, corpus_file: str, lowercase: bool = True) -> None:
        self.corpus_file = corpus_file
        self.lowercase = lowercase
        # split the path into (root, fileid), as the reader expects
        self._reader = BracketParseCorpusReader(*os.path.split(corpus_file))
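The class body is truncated here; a hypothetical helper showing how the stored lowercase flag might be applied when iterating the corpus (not part of the original source):

    def sents(self):
        for sent in self._reader.sents():
            yield [w.lower() for w in sent] if self.lowercase else sent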