Example #1
    def get_train_and_dev(self, train_file_path, grammar_file,
                          primitive_types):
        src_freq = 3
        code_freq = 3
        grammar = ASDLGrammar.grammar_from_text(
            open(grammar_file).read(), primitive_types)
        transition_system = TransitionSystem(grammar)
        train_examples = self.preprocess_dataset(train_file_path,
                                                 transition_system)

        full_train_examples = train_examples[:]
        np.random.shuffle(train_examples)
        dev_examples = train_examples[:200]
        train_examples = train_examples[200:]

        src_vocab = VocabEntry.from_corpus(
            [e.sentence for e in train_examples],
            size=5000,
            freq_cutoff=src_freq)
        primitive_tokens = [
            [a.action.token for a in e.tgt_actions
             if isinstance(a.action, GenTokenAction)]
            for e in train_examples
        ]
        primitive_vocab = VocabEntry.from_corpus(primitive_tokens,
                                                 size=5000,
                                                 freq_cutoff=code_freq)

        # generate vocabulary for the code tokens!
        code_tokens = [
            transition_system.tokenize_code(e.code, mode='decoder')
            for e in train_examples
        ]
        code_vocab = VocabEntry.from_corpus(code_tokens,
                                            size=5000,
                                            freq_cutoff=code_freq)

        vocab = Vocab(source=src_vocab,
                      primitive=primitive_vocab,
                      code=code_vocab)

        return train_examples, dev_examples, vocab
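Example #1 (and the examples that follow) builds each vocabulary by calling VocabEntry.from_corpus on a list of token lists, with a size cap and a frequency cutoff. The VocabEntry class itself does not appear on this page, so the stand-in below is only a minimal sketch of that interface, assuming the usual reserved special tokens and most-frequent-first ordering:

from collections import Counter
from itertools import chain


class TinyVocab:
    """Minimal stand-in for VocabEntry, written only to illustrate the call pattern."""

    def __init__(self):
        # ids 0-3 reserved for special tokens, mirroring the usual setup
        self.word2id = {'<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3}

    def __len__(self):
        return len(self.word2id)

    def __getitem__(self, word):
        return self.word2id.get(word, self.word2id['<unk>'])

    def add(self, word):
        if word not in self.word2id:
            self.word2id[word] = len(self.word2id)
        return self.word2id[word]

    @classmethod
    def from_corpus(cls, corpus, size, freq_cutoff=2):
        # corpus: iterable of token lists; keep at most `size` words that
        # occur at least `freq_cutoff` times, most frequent first
        vocab = cls()
        freq = Counter(chain.from_iterable(corpus))
        kept = [w for w, c in freq.items() if c >= freq_cutoff]
        for word in sorted(kept, key=lambda w: -freq[w])[:size]:
            vocab.add(word)
        return vocab


corpus = [['to', 'be', 'or', 'not', 'to', 'be'], ['to', 'be', 'sure']]
vocab = TinyVocab.from_corpus(corpus, size=5000, freq_cutoff=2)
print(len(vocab), vocab['to'], vocab['never_seen'])  # unseen words map to <unk>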
Example #2
    def _build_vocab(self, vocab_file, method, cased):
        corpus = []

        for dataset_type in ["train", "test"]:
            data_file = "qna_data/vi_{}.json".format(dataset_type)

            json_samples = read_json_data(data_file)

            for json_sample in json_samples:
                for key in ["question", "text"]:
                    pre_key = "{}_{}_{}".format(
                        method, cased, key
                    )
                    pre_text = json_sample[pre_key]

                    corpus.append(pre_text.split())

        self.vocab = VocabEntry.from_corpus(corpus, freq_cutoff=3)
        self.vocab.save_json(vocab_file)
        print ("Save vocab to file {}".format(vocab_file))
Example #3
def extract_BNC():
    stpwd = stopwords.words('english')
    stpwd = [w.upper() for w in stpwd]
    print(stpwd)
    vocab_path = paths.vocab_bnc
    source_path = paths.bnc_folder
    target_path = paths.bnc_extracted
    ndocs = 0
    dataset = []
    for (dirpath, dirnames, filenames) in os.walk(source_path):
        for fn in filenames:
            if fn.endswith(".xml"):
                print(dirpath, fn)
                fp = dirpath+"/"+fn
                tree = ET.parse(fp).getroot()
                sentences = find_rec(tree, "s", [])
                print(len(sentences))
                for sent in sentences:
                    st = []
                    for word in sent.findall("w"):
                        w = word.attrib["hw"]
                        pos = word.attrib["pos"]
                        if w.upper() not in stpwd:
                            st.append(w+"_"+pos)
                    st = ["<s>"] * cconfig.context_size + st + ["<s>"] * cconfig.context_size
                    dataset.append(st)
                ndocs += 1
        if ndocs > 1000:
            break

    vocab = VocabEntry.from_corpus(
        dataset, size=cconfig.vocab_size, freq_cutoff=cconfig.freq_cutoff)
    pickle.dump(vocab, open(vocab_path, 'wb'))
    processed_dataset = []
    for sent in dataset:
        np_sent = np.array([vocab[w] for w in sent if vocab[w] != 3])
        processed_dataset.append(np_sent)
    processed_dataset = np.array(processed_dataset)
    np.save(target_path, processed_dataset)
    print(len(processed_dataset))
    print(sum([len(s) for s in processed_dataset]))
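Example #3 ends by saving a list of variable-length arrays with np.save. On NumPy 1.24 and later that step needs an explicit object dtype, and reloading needs allow_pickle=True; a small sketch of the round trip (the file name is illustrative):

import numpy as np

sentences = [np.array([1, 2, 3]), np.array([4, 5])]  # variable-length rows
ragged = np.array(sentences, dtype=object)  # explicit dtype avoids the ragged-array error on NumPy >= 1.24
np.save("bnc_extracted.npy", ragged)        # object arrays are pickled inside the .npy file
reloaded = np.load("bnc_extracted.npy", allow_pickle=True)
print(sum(len(s) for s in reloaded))        # 5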
Example #4
    def load_vocab(self):
        # Load the vocabulary, or build and save it if the file does not exist
        if self.args.vocab is not None:
            if not os.path.isfile(self.args.vocab):
                print('create new vocab and save to %s' % self.args.vocab)
                corpus = []
                for story in self.trn[0]:
                    for sent in story:
                        corpus.append(sent)
                if self.args.rebuild_vocab:
                    self.vocab = VocabEntry.from_corpus(
                        corpus,
                        50000,
                        remove_singleton=not self.args.include_singleton)
                else:
                    self.vocab = VocabEntry.from_dict(self.w2id)
                torch.save(self.vocab, self.args.vocab)
            else:
                self.vocab = torch.load(self.args.vocab)
        else:
            print('vocab file is required')
            exit(0)
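Example #4 builds the vocabulary once and caches it with torch.save, reloading the cached copy on later runs. A minimal sketch of that cache-or-build pattern, with a plain dict standing in for the real VocabEntry (the path is illustrative):

import os
import torch

vocab_path = "vocab.pt"               # illustrative cache path
if not os.path.isfile(vocab_path):
    vocab = {"<pad>": 0, "<unk>": 1}  # stand-in for the real vocabulary build
    torch.save(vocab, vocab_path)     # torch.save pickles arbitrary Python objects
else:
    vocab = torch.load(vocab_path)
print(len(vocab))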
Example #5
    def build_data(self):
        corpus = self.train_paragraph_texts + \
                 self.train_question_texts + \
                 self.test_question_texts + \
                 self.test_paragraph_texts

        vocab = VocabEntry.from_corpus(corpus, freq_cutoff=1)

        vocab_file = "qna_data/{}_vocab.json".format(self.method)
        vocab.save_json(vocab_file)

        self.train_questions = vocab.padd_sents(self.train_question_texts,
                                                start_end=False)
        self.train_paragraphs = vocab.padd_sents(self.train_paragraph_texts,
                                                 start_end=False)
        self.test_questions = vocab.padd_sents(self.test_question_texts,
                                               start_end=False)
        self.test_paragraphs = vocab.padd_sents(self.test_paragraph_texts,
                                                start_end=False)

        save_data = {
            "train_questions": self.train_questions,
            "train_paragraphs": self.train_paragraphs,
            "test_questions": self.test_questions,
            "test_paragraphs": self.test_paragraphs,
        }

        save_file = "qna_data/{}_dataset.json".format(self.method)
        write_json_data(save_file, save_data)

        self.vocab = vocab

        self._to_numpy()

        print("corpus len: ", len(corpus))
        print(corpus[0])
        print("max length: ", vocab.max_sent_len)
Example #6
def main(argv):
    EPOCHS = int(argv[0])
    BATCH_SIZE = int(argv[1])
    LEARNING_RATE = float(argv[2])

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    words = []
    definitions = []
    sub_fasttext_dict = {}
    with open("../data/words_defs_dict.train", "br") as f:
        words, definitions, sub_fasttext_dict = pickle.load(f)
    print("number of words:", len(words))

    eval = Evaluator()
    vocab = VocabEntry.from_corpus(definitions, 1000000, 0)
    for w in words:
        vocab.add(w)
    print("vocab length:", len(vocab))

    assert (len(words) == len(definitions))
    training_data = [(definitions[i], words[i]) for i in range(len(words))]

    model = LSTMModel(100, 100, vocab, sub_fasttext_dict, device)
    model.to(device)
    loss_function = nn.CosineEmbeddingLoss(margin=0.0, reduction='mean')
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    dt = str(datetime.fromtimestamp(time.time()))[:-7]
    print(dt)
    dt = dt.replace(' ', '_')
    dt = dt.replace(':', '-')
    start = timeit.default_timer()
    losses = []

    best_loss = float('inf')
    for epoch in range(EPOCHS):
        epoch_losses = []
        count = 0
        for src_sents, tgt_word in batch_iter(training_data, BATCH_SIZE,
                                              False):
            model.zero_grad()
            x_lengths = [len(sent) for sent in src_sents]
            x = vocab.to_input_tensor(src_sents, device)
            init_hidden = model.initHidden(len(src_sents), device)
            tag_scores = model.forward(x, init_hidden, x_lengths)

            y_indices = vocab.words2indices(tgt_word)
            y_array = model.embedding.source[0](torch.tensor(
                y_indices, device=device)).double()
            y_pred = tag_scores[0].squeeze(dim=1).double().to(device)
            y_match = torch.ones(y_pred.shape[0], device=device)

            loss = loss_function(y_pred, y_array, y_match)

            loss.backward()
            optimizer.step()
            count += 1
            epoch_losses.append(loss)
            if count % 200 == 0:
                print("Time elapsed",
                      timeit.default_timer() - start, "Epoch", epoch, " Count",
                      count, ": Loss", loss)
                losses.append(loss)
        eloss = sum(epoch_losses) / len(epoch_losses)
        if eloss < best_loss:
            best_loss = eloss
            title = 'ft_model' + dt + '.pt'
            torch.save(model.state_dict(), title)
            print("model saves as:", title, "with epoch loss of ", eloss)

    stop = timeit.default_timer()

    print('Time: ', stop - start)

    import matplotlib.pyplot as plt
    print(plt.plot([l.double() for l in losses]))
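Example #6 appends the raw loss tensors to epoch_losses and losses, which keeps each tensor's autograd graph alive until the lists are freed. A common alternative is to store plain floats via .item(), as in this small self-contained sketch (the toy model below is not the LSTMModel used above):

import torch

w = torch.nn.Parameter(torch.zeros(3))
optimizer = torch.optim.SGD([w], lr=0.1)

losses = []
for step in range(5):
    loss = ((w - 1.0) ** 2).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append(loss.item())    # .item() detaches the value to a Python float
print(sum(losses) / len(losses))  # plain float arithmetic, safe to log or plot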
Example #7
datetm = datetime.fromtimestamp(ts)
dt = str(datetm)[:-7]
print(dt)
dt = dt.replace(' ', '_')
dt = dt.replace(':', '-')
print(dt)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('use device: %s' % device)

# In[12]:

words, defs, ft_dict = pickle.load(
    open("../data/words_defs_dict_1M.train", "rb"))

vocab = VocabEntry.from_corpus(defs, 1000000, 0)
for w in ft_dict:
    vocab.add(w)

# In[13]:


def create_emb_layer(weights_matrix, src_pad_token_idx, non_trainable=True):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim, src_pad_token_idx)
    emb_layer.weight.data.copy_(
        torch.from_numpy(weights_matrix))  # copy the pretrained vectors into the embedding weights
    if non_trainable:
        emb_layer.weight.requires_grad = False
    return emb_layer, num_embeddings, embedding_dim
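create_emb_layer in Example #7 copies a precomputed weight matrix into an nn.Embedding and optionally freezes it. A short usage sketch with random weights standing in for the fastText vectors (the shapes are illustrative):

import numpy as np
import torch
import torch.nn as nn

vocab_size, emb_dim, pad_idx = 10, 4, 0
weights_matrix = np.random.rand(vocab_size, emb_dim).astype(np.float32)

emb_layer, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, pad_idx)
ids = torch.tensor([[1, 2, 0]])        # a padded batch of token ids
print(emb_layer(ids).shape)            # torch.Size([1, 3, 4])
print(emb_layer.weight.requires_grad)  # False: frozen by default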