Code example #1
import torch.nn as nn
import torch.optim as optim

# conf is assumed to be a ConfigParser instance loaded elsewhere in the project
class QuestionClassifier(nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        # single linear layer mapping a sentence vector of size
        # word_embedding_dim to label logits; a 256-unit hidden layer
        # was tried and left disabled:
        # self.f2 = nn.Linear(n_hidden, num_labels)
        self.f1 = nn.Linear(int(conf.get("param", "word_embedding_dim")),
                            num_labels)

        # cast parameters to float64 to match the numpy sentence vectors
        self.double()
        # loss
        self.loss_function = nn.CrossEntropyLoss()
        # optimizer
        self.optimizer = optim.Adam(self.parameters(),
                                    lr=float(conf.get("param", "lr_param")))
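The excerpts below call model.train_model(...) and model.test_model(...), whose bodies are not part of this excerpt. A minimal hypothetical sketch of what they might look like, written as a subclass so the class above stays intact; it assumes one gradient step per sentence vector and a test_model that returns accuracy plus the predicted indices (the return conventions vary slightly between excerpts):

import torch

class QuestionClassifierSketch(QuestionClassifier):
    # Hypothetical sketch only: the real method bodies are not shown here.
    def forward(self, x):
        return self.f1(x)

    def train_model(self, vectors, labels):
        # one gradient step per (sentence vector, label) pair
        for vec, label in zip(vectors, labels):
            self.optimizer.zero_grad()
            loss = self.loss_function(self(vec), torch.tensor([label]))
            loss.backward()
            self.optimizer.step()

    def test_model(self, vectors, labels):
        # accuracy plus the predicted label indices
        with torch.no_grad():
            preds = [self(vec).argmax(dim=1).item() for vec in vectors]
        acc = sum(p == y for p, y in zip(preds, labels)) / len(labels)
        return acc, preds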
Code example #2
def train():
    # build bag-of-words sentence vectors for all three splits
    train_sentence_vectors, train_labels, dev_sentence_vectors, dev_labels, test_sentence_vectors, test_labels = sentence_vector.bag_of_word_sentences(
        type=conf.get("param", "word_embedding_type"), freeze=True)

    output_size = len(set(train_labels))
    model = QuestionClassifier(output_size)
    # stash the test data on the model so test() can recover it after torch.load
    model.test_vecs = test_sentence_vectors
    model.test_label = test_labels
    model.label_to_ix = gv.label_to_ix
    for epoch in range(int(conf.get("param", "epoch"))):
        model.train_model(train_sentence_vectors, train_labels)
        # validate the model after each epoch
        acc = model.test_model(dev_sentence_vectors, dev_labels)
        print('epoch:', epoch, 'dev_acc:', acc)
    torch.save(model, conf.get("param", "path_model"))
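Note that torch.save(model, ...) pickles the entire module, including the test data attached above, which is why test() can read model.test_vecs back after torch.load. A state_dict-based alternative (not what this code does) would persist only the weights:

# Alternative sketch: save weights only; the test data would then need to be
# rebuilt inside test() instead of being unpickled together with the model.
torch.save(model.state_dict(), conf.get("param", "path_model"))

model = QuestionClassifier(output_size)
model.load_state_dict(torch.load(conf.get("param", "path_model")))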
Code example #3
def test():
    # rebuild the label index so predicted indices can be mapped back to names
    _, _, _, label2idx = process_train_set('../data/train.txt')
    idx2label = {v: k for k, v in label2idx.items()}

    model = torch.load(conf.get('param', 'path_model'))
    model.to('cpu')

    acc, pre_label = model.test_model(model.test_vecs, model.test_label)
    print('test_acc:', acc)

    # re-read the raw test file to recover the original questions and labels
    with open('../data/test.txt', 'r') as f:
        labels = []
        sentences = []
        for line in f:
            label, sentence = line.split(' ', maxsplit=1)
            labels.append(label)
            sentences.append(sentence.rstrip('\n'))
    with open(gv.conf.get('param', 'path_eval_result'), "w") as f:
        lines = [
            'Question                  Correct Label               Predict Label\n'
        ]
        for i in range(len(sentences)):
            line = [sentences[i], labels[i], idx2label[int(pre_label[i])]]
            lines.append('       '.join(line) + '\n')
        f.writelines(lines)
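The data files appear to follow the TREC question-classification format (train_5500.txt is its training set): one labelled question per line, label first, which is what the split(' ', maxsplit=1) above relies on. A worked example with an illustrative line:

line = 'DESC:manner How did serfdom develop in and then leave Russia ?\n'
label, question = line.split(' ', maxsplit=1)
print(label)               # DESC:manner
print(question.rstrip())   # How did serfdom develop in and then leave Russia ?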
Code example #4
def train():
    # train the BiLSTM first, then read the sentence vectors it produced
    bilstm_test.train_Bilstm()
    train_sentence_vectors, train_labels = readFile("../data/train_.txt")
    dev_sentence_vectors, dev_labels = readFile("../data/dev_.txt")
    test_sentence_vectors, test_labels = readFile("../data/test_.txt")

    output_size = len(set(train_labels))
    model = QuestionClassifier(output_size)

    # stash the test data on the model so test() can recover it after torch.load
    model.test_vecs = test_sentence_vectors
    model.test_label = test_labels

    for epoch in range(int(conf.get("param", "epoch"))):
        model.train_model(train_sentence_vectors, train_labels)
        acc, labels = model.test_model(dev_sentence_vectors, dev_labels)
        print('epoch:', epoch, 'dev_acc:', acc)
    torch.save(model, conf.get("param", "path_model"))
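readFile is not shown in these excerpts; the surrounding code only requires that it returns a list of sentence vectors and a parallel list of integer labels. One hypothetical sketch, under the assumption that each line of train_.txt stores a label followed by the vector components (the real format written by vector_file() is not shown):

import torch

def readFile(path):
    # Hypothetical sketch: label first, then the vector components per line.
    vectors, labels = [], []
    with open(path) as f:
        for line in f:
            parts = line.split()
            labels.append(int(parts[0]))
            vec = torch.tensor([float(v) for v in parts[1:]],
                               dtype=torch.float64)
            vectors.append(vec.view(1, -1))
    return vectors, labels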
Code example #5
def test():
    model = torch.load(conf.get('param', 'path_model'))
    model.to('cpu')

    # test the model
    acc = model.test_model(model.test_vecs,
                           model.test_label,
                           output_predict=True)
    print('test_acc: ', acc)
Code example #6
def make_bow_vector(tokens, wordToIdx, wordVec):
    # average the embeddings of all in-vocabulary tokens (bag of words)
    vec = np.zeros(int(conf.get("param", "word_embedding_dim")))
    count = 0
    for word in tokens:
        if word in wordToIdx:
            vec += wordVec[wordToIdx[word]]
            count += 1
    if count > 0:  # guard against sentences with no in-vocabulary token
        vec = vec / count
    vec = torch.from_numpy(vec)
    return vec.view(1, -1)
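conf is assumed throughout these excerpts to be a ConfigParser loaded elsewhere in the project (conf.get("param", ...) matches that API). A minimal stand-in plus a toy call, assuming word_embedding_dim is 4 for the demonstration:

from configparser import ConfigParser
import numpy as np

conf = ConfigParser()
conf['param'] = {'word_embedding_dim': '4'}

wordToIdx = {'UNK': 0, 'what': 1, 'capital': 2}
wordVec = np.random.random((3, 4))      # one 4-dim vector per vocabulary word
vec = make_bow_vector(['what', 'capital', 'of'], wordToIdx, wordVec)
print(vec.shape)                        # torch.Size([1, 4]); 'of' is skipped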
Code example #7
def to_dataloader(feature, labels):
    # wrap the feature matrix and labels as tensors in a TensorDataset
    labels = np.array(labels)
    train_data = TensorDataset(torch.from_numpy(feature),
                               torch.from_numpy(labels))
    batch_size = int(conf.get('param', 'batch_size'))

    # make sure to SHUFFLE the training data
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

    return train_loader
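A toy call, assuming batch_size in conf is 4; feature must already be a numpy array (only the labels are converted inside):

import numpy as np

features = np.random.randint(0, 100, size=(8, 10))   # 8 padded index sequences
loader = to_dataloader(features, [0, 1, 0, 1, 0, 1, 0, 1])
x, y = next(iter(loader))
print(x.shape, y.shape)   # torch.Size([4, 10]) torch.Size([4]) with batch_size=4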
Code example #8
def get_pre_train_vector():
    print('Please wait, pre-train...')
    sentences = word2vec.preprocessing.get_preprocessed_sentences()
    sorted_words = word2vec.preprocessing.make_vocabulary(sentences)
    word_idx, idx_word = word2vec.create_dict(sorted_words)

    # replace each word with its vocabulary index, then train the embeddings
    sentences_in_idx = word2vec.replace_words_with_idx(sentences, word_idx)
    word_to_vec = word2vec.train(len(sorted_words),
                                 int(conf.get('param', 'word_embedding_dim')),
                                 sentences_in_idx, idx_word)

    return word_to_vec, word_idx
Code example #9
def get_train_dev():
    # read the raw data file
    path = os.path.join(os.getcwd(), "..", "data", "train_5500.txt")
    with open(path) as f:
        lines = f.readlines()

    # split it 9:1 into train and dev
    train, dev = random_split(lines, shuffle=True, ratio=0.9)

    # write the train split to path_train
    with open(conf.get('param', 'path_train'), 'w') as file:
        file.writelines(train)

    # write the dev split to path_dev
    with open(conf.get('param', 'path_dev'), 'w') as file:
        file.writelines(dev)
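random_split here is not torch.utils.data.random_split (the signature differs); it is presumably a project helper. A hypothetical sketch consistent with how it is called above:

import random

def random_split(lines, shuffle=True, ratio=0.9):
    # Hypothetical sketch: shuffle, then cut the list at the given ratio.
    lines = list(lines)
    if shuffle:
        random.shuffle(lines)
    cut = int(len(lines) * ratio)
    return lines[:cut], lines[cut:]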
Code example #10
def train_Bilstm():
    train_int_word, train_int_label, word2idx, label2idx = process_train_set(
        conf.get('param', 'path_train'))
    test_int_word, test_int_label = process_new_dataset(
        word2idx, label2idx, conf.get('param', 'path_dev'))

    # pad/truncate every index sequence to a fixed length
    seq_length = 10
    feature_matrix = padding_feature(train_int_word, seq_length)
    feature_matrix_dev = padding_feature(test_int_word, seq_length)

    train_loader = to_dataloader(feature_matrix, train_int_label)
    test_loader = to_dataloader(feature_matrix_dev, test_int_label)

    rnn_ = BiLSTMTagger(len(word2idx),
                        int(conf.get("param", "word_embedding_dim")), 100)

    # conf.get returns a string and bool('False') is True,
    # so use getboolean for flag options
    if conf.getboolean('param', 'pre_train'):
        word2vec = w2v.read_word2vec(conf.get("param", "path_pre_emb"))
        pretrained_embedding = torch.tensor(word2vec)
        print('pretrained_embedding:', pretrained_embedding.shape)
        # from_pretrained is a classmethod that returns a new layer; assign it
        # back, otherwise the pretrained weights are silently discarded
        rnn_.embedding = nn.Embedding.from_pretrained(
            pretrained_embedding,
            freeze=conf.getboolean('param', 'freeze'))

    device = 'cpu'
    optimizer = optim.Adam(rnn_.parameters(),
                           lr=float(conf.get("param", "lr_param")))
    criterion = nn.CrossEntropyLoss().to(device)
    rnn_.to(device)

    for epoch in range(int(conf.get("param", "epoch"))):
        cells = train_another_new(rnn_, train_loader, optimizer, criterion)
        eval(rnn_,
             test_loader,
             criterion,
             patience=int(conf.get('param', 'early_stopping')))
    torch.save(rnn_, '../data/word2vec_Bilstm_3.pkl')
    classifier.vector_file()
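padding_feature is also not shown; batching the BiLSTM inputs only needs every index sequence brought to a common length. A hypothetical sketch that left-pads with zeros and truncates to seq_length:

import numpy as np

def padding_feature(int_sentences, seq_length):
    # Hypothetical sketch: zero-pad on the left, truncate on the right.
    features = np.zeros((len(int_sentences), seq_length), dtype=np.int64)
    for i, sent in enumerate(int_sentences):
        trimmed = sent[:seq_length]
        if trimmed:
            features[i, -len(trimmed):] = trimmed
    return features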
Code example #11
def randomly_initialised_vectors(tokens=None, threshold=None):
    # count token frequencies
    wordCountDict = dict(zip(*np.unique(tokens, return_counts=True)))
    # iterate over a copy of the keys so entries can be deleted in the loop
    for k in list(wordCountDict.keys()):
        if wordCountDict[k] < threshold:
            del wordCountDict[k]

    # index 0 is reserved for unknown words
    wordToIx = {'UNK': 0}
    for i, key in enumerate(wordCountDict.keys(), start=1):
        wordToIx[key] = i

    # one random vector per vocabulary entry (including UNK)
    dim = int(conf.get("param", "word_embedding_dim"))
    word_vectors = np.array([np.random.random(dim) for _ in wordToIx])
    return word_vectors, wordToIx
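A toy call showing the frequency thresholding (reusing the ConfigParser stand-in from earlier with word_embedding_dim = 4):

tokens = ['what', 'is', 'is', 'the', 'the', 'the']
wordVec, wordToIx = randomly_initialised_vectors(tokens, threshold=2)
print(wordToIx)        # {'UNK': 0, 'is': 1, 'the': 2} -- 'what' fell below 2
print(wordVec.shape)   # (3, 4): one random vector per entry, UNK included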
Code example #12
def bag_of_word_sentences(type='randomly', freeze=True):
    if type not in ['randomly', 'pre_train']:
        return
    train_labels, train_sentences = sentence_processing(conf.get('param', 'path_train'))
    dev_labels, dev_sentences = sentence_processing(conf.get('param', 'path_dev'))
    test_labels, test_sentences = sentence_processing(conf.get('param', 'path_test'))

    train_sentences = lower_first_letter(train_sentences, conf.get('param', 'lowercase'))
    test_sentences = lower_first_letter(test_sentences, conf.get('param', 'lowercase'))
    dev_sentences = lower_first_letter(dev_sentences, conf.get('param', 'lowercase'))

    read_stop = read_stoplist()

    train_tokens, train_token_of_sentences = tokenization(train_sentences, read_stop)
    dev_tokens, dev_token_of_sentences = tokenization(dev_sentences, read_stop)
    test_tokens, test_token_of_sentences = tokenization(test_sentences, read_stop)
    # the vocabulary and embeddings are built from the training tokens only
    wordVec, wordToIdx = get_word_embedding(tokens=train_tokens, type=type, freeze=freeze,
                                            path='../to_be_merged/train_1000.txt')

    train_sentence_vectors = multi_sentences_to_vectors(train_token_of_sentences, wordToIdx, wordVec)
    test_sentence_vectors = multi_sentences_to_vectors(test_token_of_sentences, wordToIdx, wordVec)
    dev_sentence_vectors = multi_sentences_to_vectors(dev_token_of_sentences, wordToIdx, wordVec)

    # convert label strings to shared integer indices
    train_labels, dev_labels, test_labels = get_label_number_to_idx(train_labels, dev_labels, test_labels)

    return train_sentence_vectors, train_labels, dev_sentence_vectors, dev_labels, test_sentence_vectors, test_labels
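get_label_number_to_idx is not shown; its callers need label strings converted to shared integer indices. A hypothetical sketch, assuming every dev/test label also occurs in the training set:

def get_label_number_to_idx(train_labels, dev_labels, test_labels):
    # Hypothetical sketch: one label->index map built from the training split.
    label_to_ix = {lab: i for i, lab in enumerate(sorted(set(train_labels)))}
    def as_idx(labels):
        return [label_to_ix[lab] for lab in labels]
    return as_idx(train_labels), as_idx(dev_labels), as_idx(test_labels)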
Code example #13
'''
Generate vectors for tokens from the pre-trained model.

return:
word_vectors: the vectors of the words
wordToIx: a map whose key is the word and whose value is its index.
'''
def get_pre_train_vector():
    print('Please wait, pre-train...')
    sentences = word2vec.preprocessing.get_preprocessed_sentences()
    sorted_words = word2vec.preprocessing.make_vocabulary(sentences)
    word_idx, idx_word = word2vec.create_dict(sorted_words)

    # replace each word with its vocabulary index, then train the embeddings
    sentences_in_idx = word2vec.replace_words_with_idx(sentences, word_idx)
    word_to_vec = word2vec.train(len(sorted_words),
                                 int(conf.get('param', 'word_embedding_dim')),
                                 sentences_in_idx, idx_word)

    return word_to_vec, word_idx

if __name__ == '__main__':
    labels, sentences = sentence_processing(conf.get('param', 'path_train'))

    sentences = lower_first_letter(sentences, conf.get('param', 'lowercase'))

    # use a distinct name to avoid rebinding the read_stoplist function
    stoplist = read_stoplist()

    tokens, token_of_sentences = tokenization(sentences, stoplist)
    wordVec, wordToIdx = randomly_initialised_vectors(tokens, threshold=0)
Code example #14
    torch.save(model, conf.get("param", "path_model"))  # tail of train(), body elided in this excerpt


def test():
    model = torch.load(conf.get('param', 'path_model'))
    model.to('cpu')

    # test the model
    acc = model.test_model(model.test_vecs,
                           model.test_label,
                           output_predict=True)
    print('test_acc:', acc)


if __name__ == '__main__':
    # choose 'randomly' or 'pre_train' embeddings here
    train_sentence_vectors, train_labels, dev_sentence_vectors, dev_labels, test_sentence_vectors, test_labels = sentence_vector.bag_of_word_sentences(
        type='pre_train', freeze=True)

    output_size = len(set(train_labels))
    model = QuestionClassifier(output_size)

    for epoch in range(int(conf.get("param", "epoch"))):
        model.train_model(train_sentence_vectors, train_labels)
        # accuracy on the validation split
        acc = model.test_model(dev_sentence_vectors, dev_labels)
        print('epoch:', epoch, 'dev_acc:', acc)
    # accuracy on the test split
    acc = model.test_model(test_sentence_vectors, test_labels)
    print('test_acc:', acc)