Example no. 1
def feat_most_common_words(input_data_file):

    # For each article, rank its words by frequency over the shared vocabulary,
    # keep the `top` most frequent non-stopword words, and emit two features:
    # their average word length and their average count normalized by the
    # number of distinct words in the article.
    articles = importArticles(input_data_file)
    top = 10

    mcw_feature = np.zeros(shape=(len(articles), 2))
    article_words = []

    for article in articles:
        for sent in article:
            for word in sent.split(" "):
                if word not in article_words:
                    article_words.append(word)

    Vb = util.Vocab()
    # Build the stopword set once instead of querying nltk on every iteration.
    stop_set = set(stopwords.words("english") + ["<unk>"])

    for i, article in enumerate(articles):
        vocab = Vb.from_corpus(article, article_words)
        termVector = vocab.termVector
        idxs = np.argsort(termVector)[::-1]
        top_words = []
        for idx in idxs:
            if len(top_words) == top:
                break
            if article_words[idx].lower() not in stop_set:
                top_words.append(article_words[idx])
        mcw_feature[i][0] = sum([len(word) for word in top_words]) / float(top)
        mcw_feature[i][1] = sum(
            [termVector[article_words.index(word)]
             for word in top_words]) / float(top * np.count_nonzero(termVector))

    return mcw_feature
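
A hedged usage sketch of how this 2-column feature could be stacked with the other feat_* matrices from these examples; the build_feature_matrix wrapper and the conversion to plain arrays are assumptions, not part of the original code:

# Hypothetical helper that column-stacks several of the feature matrices shown
# in these examples; np.asarray flattens the np.matrix return types so that
# np.hstack produces one (n_articles, n_features) array.
import numpy as np

def build_feature_matrix(input_data_file):
    mcw = feat_most_common_words(input_data_file)      # (n_articles, 2) array
    avg_len = feat_avg_sent_len(input_data_file)       # (n_articles, 1) matrix
    ttr = feat_type_token_ratio(input_data_file)       # (1, n_articles) matrix
    return np.hstack([np.asarray(mcw),
                      np.asarray(avg_len),
                      np.asarray(ttr).T])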
Example no. 2
def load_datasets_and_dataloaders():
    train_dataset, train_dataloader = util.initialize_dataset_and_dataloader(
        util.config["train_data_file_path"], util.config["train_batch_size"], shuffle=True)

    validate_dataset, validate_dataloader = util.initialize_dataset_and_dataloader(
        util.config["validate_data_file_path"], util.config["validate_batch_size"])
    test_dataset, test_dataloader = util.initialize_dataset_and_dataloader(
        util.config["test_data_file_path"], util.config["validate_batch_size"])

    # build the vocabularies from the training split and share them across all splits
    text_frequencies, label_frequencies = util.frequencies(train_dataset.instances)
    text_vocab = util.Vocab(text_frequencies)
    label_vocab = util.Vocab(label_frequencies)

    train_dataset.text_vocab = validate_dataset.text_vocab = test_dataset.text_vocab = text_vocab
    train_dataset.label_vocab = validate_dataset.label_vocab = test_dataset.label_vocab = label_vocab

    return train_dataset, train_dataloader, validate_dataset, validate_dataloader, test_dataset, test_dataloader
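
A minimal call-site sketch, assuming util.config already contains the file paths and batch sizes used above; the vocabulary-sharing assertions simply restate what the function does:

# Hypothetical call site; the unpacking mirrors the function's return statement.
(train_dataset, train_dataloader,
 validate_dataset, validate_dataloader,
 test_dataset, test_dataloader) = load_datasets_and_dataloaders()

# All three splits share the vocabularies built from the training instances.
assert train_dataset.text_vocab is test_dataset.text_vocab
assert train_dataset.label_vocab is validate_dataset.label_vocab

for batch in train_dataloader:
    # The batch structure depends on util.initialize_dataset_and_dataloader;
    # this only peeks at one batch and stops.
    print(type(batch))
    break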
Example no. 3
def feat_lda(input_data_file):

    # RETRIEVE ALL WORDS IN GOOD ARTICLES
    goodArticles = importScores('goodArticles.txt')

    good_article_words = []
    for article in goodArticles:
        for sent in article:
            for word in sent.split(" "):
                if word not in good_article_words:
                    good_article_words.append(word)

    # LOAD GOOD LDA MODEL
    ldaModel = loadModel("lda-goodArticles.model")
    topic_word = ldaModel.topic_word_
    n_top_words = 30
    n_topics = 50
    topic_words = []

    # collect the n_top_words highest-probability words of each topic
    for topic_dist in topic_word:
        topic_words.append(
            np.array(good_article_words)[np.argsort(topic_dist)]
            [:-(n_top_words + 1):-1])

    # TEST ARTICLE
    articles = importArticles(input_data_file)
    topic_coverage = np.zeros((len(articles), n_topics * 2))
    article_words = []

    for article in articles:
        for sent in article:
            for word in sent.split(" "):
                if word not in article_words:
                    article_words.append(word)

    Vb = util.Vocab()
    docTerm = np.zeros((len(articles), len(article_words)))

    print " ".join(topic_words[30])
    # TOPIC COVERAGE CALCULATION
    for i, article in enumerate(articles):
        vocab = Vb.from_corpus(article, article_words)
        docTerm[i] = vocab.termVector

        for j, topic_list in enumerate(topic_words):
            for k, word in enumerate(topic_list):
                if word in vocab.w2i:
                    idx = vocab.w2i[word]
                    topic_coverage[i, j] += docTerm[i][idx] * (n_top_words - k)

    doc_topic_test = ldaModel.transform(docTerm.astype('int64'))
    topic_coverage[:, n_topics:] = doc_topic_test

    return topic_coverage
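
The (n_top_words - k) factor in the inner loop rewards words that sit high in a topic's ranking; a toy, self-contained illustration with made-up counts:

# Toy illustration of the rank-weighted coverage score: a word at rank k in a
# topic list contributes count * (n_top_words - k) to that topic's score.
n_top_words = 3
topic_list = ["economy", "market", "growth"]   # made-up topic words
counts = {"economy": 2, "growth": 1}           # made-up per-article word counts

coverage = sum(counts.get(word, 0) * (n_top_words - k)
               for k, word in enumerate(topic_list))
print(coverage)  # 2*3 + 0*2 + 1*1 = 7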
Example no. 4
def main():
    if 'train' not in args.mode:
        args.rnn_keep_rate = 1.0
        args.fcn_keep_rate = 1.0
        args.batch_size = 1

    args.data_path = ('./data/nli/snli_1.0/snli_1.0_train.bin'
                      if 'esim' in args.mode else './data/main/train_binary.bin')

    args.model_path = os.path.join(args.model_path,
                                   args.exp_name).format(args.model)
    print(args.model_path)
    if not os.path.exists(args.model_path):
        if 'train' not in args.mode:
            raise ValueError("model path does not exist: {}".format(args.model_path))
        os.makedirs(args.model_path)

    if 'esim' in args.mode:
        args.batch_size = args.esim_batch_size
        assert 'esim' in args.model
    else:
        args.rnn_keep_rate = 1.0
        args.fcn_keep_rate = 1.0
        args.batch_size = args.main_batch_size

    print("Default model path: {}".format(args.model_path))

    print('code start / {} mode / {} model'.format(args.mode, args.model))
    util.assign_specific_gpu(args.gpu_nums)

    vocab = util.Vocab()

    vardicts = util.get_pretrain_weights(
        args.pretrain_ckpt_path) if args.mode == 'train' else None

    if args.model == 'main':
        model = Model(vocab, args)
    elif args.model == 'esim':
        model = ESIM(vocab, args)
    else:
        raise ValueError("unknown model: {}".format(args.model))
    print("model build end.")

    if args.mode in ['train', 'esim_train']:
        train(model, vocab, vardicts)
    elif args.mode in ['eval', 'esim_eval']:
        eval(model, vocab)
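
main() reads everything from a module-level args object; a minimal argparse sketch covering only the attributes referenced above (the flag names mirror those attributes, but every default value here is an assumption):

# Hypothetical argument definitions; defaults are illustrative only.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--mode', default='train')        # 'train', 'eval', 'esim_train', 'esim_eval'
parser.add_argument('--model', default='main')        # 'main' or 'esim'
parser.add_argument('--exp_name', default='{}_exp')   # formatted with args.model in main()
parser.add_argument('--model_path', default='./models')
parser.add_argument('--pretrain_ckpt_path', default=None)
parser.add_argument('--gpu_nums', default='0')
parser.add_argument('--main_batch_size', type=int, default=32)
parser.add_argument('--esim_batch_size', type=int, default=32)
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--rnn_keep_rate', type=float, default=0.8)
parser.add_argument('--fcn_keep_rate', type=float, default=0.8)
args = parser.parse_args()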
Example no. 5
def feat_avg_sent_len(input_data_file):

    articles = importArticles(input_data_file)

    article_words = []
    for article in articles:
        for sent in article:
            for word in sent.split(" "):
                if word not in article_words:
                    article_words.append(word)

    avg_sent_len_articles = []

    Vb = util.Vocab()
    for i, article in enumerate(articles):
        vocab = Vb.from_corpus(article, article_words)
        avg_sent_len_articles.append(vocab.avg_sent_len)

    # return as a column vector so it can be stacked with the other feature matrices
    return np.transpose(np.matrix(avg_sent_len_articles))
Example no. 6
def feat_type_token_ratio(input_data_file):
    articles = importArticles(input_data_file)
    article_words = []
    for article in articles:
        for sent in article:
            for word in sent.split(" "):
                if word not in article_words:
                    article_words.append(word)

    Vb = util.Vocab()
    type_token_ratios = []

    for article in articles:
        vocab = Vb.from_corpus(article, article_words)
        # type-token ratio: number of distinct words over the total word count
        type_token_ratios.append(
            np.count_nonzero(vocab.termVector) / float(np.sum(vocab.termVector)))

    return np.matrix(type_token_ratios)
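
A quick worked check of the ratio on a made-up term vector: 4 distinct word types over 7 tokens.

# Toy check of the type-token ratio computed above.
import numpy as np

term_vector = np.array([3., 2., 1., 1., 0., 0.])
ttr = np.count_nonzero(term_vector) / np.sum(term_vector)
print(ttr)  # 4 / 7 = 0.571...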
Example no. 7
def feat_lda(input_data_file):

    # RETRIEVE ALL WORDS IN GOOD ARTICLES
    goodArticles = importScores('goodArticles.txt')

    good_article_words = []
    for article in goodArticles:
        for sent in article:
            for word in sent.split(" "):
                if word not in good_article_words:
                    good_article_words.append(word)

    # LOAD GOOD LDA MODEL
    ldaModel = loadModel("lda-goodArticles.model")
    topic_word = ldaModel.topic_word_
    n_top_words = 20
    n_topics = 50
    topic_words = []

    # collect the n_top_words highest-probability words of each topic
    for topic_dist in topic_word:
        topic_words.append(
            np.array(good_article_words)[np.argsort(topic_dist)]
            [:-(n_top_words + 1):-1])

    # TEST ARTICLE
    articles = importArticles(input_data_file)
    topic_coverage = np.zeros((len(articles), n_topics))
    article_words = []

    for article in articles:
        for sent in article:
            for word in sent.split(" "):
                if word not in article_words:
                    article_words.append(word)

    Vb = util.Vocab()
    docTerm = np.zeros((len(articles), len(article_words)))

    # DOCUMENT-TERM MATRIX
    # Without this loop docTerm stays all zeros and ldaModel.transform below
    # would see empty documents, so the term vectors are filled in here.
    for i, article in enumerate(articles):
        vocab = Vb.from_corpus(article, article_words)
        docTerm[i] = vocab.termVector

        # the rank-weighted topic-coverage accumulation is left disabled in this variant
        # for j, topic_list in enumerate(topic_words):
        #     for k, word in enumerate(topic_list):
        #         if word in vocab.w2i:
        #             idx = vocab.w2i[word]
        #             topic_coverage[i, j] += docTerm[i][idx] * (n_top_words - k)

    doc_topic_test = ldaModel.transform(docTerm.astype('int64'))

    # L1-normalize each row of the topic distribution; all-zero rows are kept as zeros
    normed_doc_topic_test = np.empty((len(articles), n_topics))
    for i in range(doc_topic_test.shape[0]):
        rsum = np.sum(doc_topic_test[i])
        if rsum == 0:
            normed_doc_topic_test[i] = doc_topic_test[i]
            continue
        normed_doc_topic_test[i] = doc_topic_test[i] / rsum
    # normed_matrix = normalize(doc_topic_test, axis=1, norm='l1')

    # topic_coverage[:,n_topics:] = doc_topic_test

    return doc_topic_test
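
For nonnegative topic distributions, the manual row normalization above matches the scikit-learn call hinted at in the commented-out line; a small sketch, assuming scikit-learn is available:

# Equivalent L1 row normalization with scikit-learn; all-zero rows stay zero,
# matching the manual loop above. The input values are made up.
import numpy as np
from sklearn.preprocessing import normalize

doc_topic = np.array([[0.2, 0.6, 0.2],
                      [0.0, 0.0, 0.0]])
print(normalize(doc_topic, axis=1, norm='l1'))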
Example no. 8
        for sent in article:
            for word in sent.split(" "):
                if word not in good_article_words:
                    good_article_words.append(word)

    bad_article_words = []
    for article in badArticles:
        for sent in article:
            for word in sent.split(" "):
                if word not in bad_article_words:
                    bad_article_words.append(word)

    docTerm_good = np.zeros((len(goodArticles), len(good_article_words)))
    docTerm_bad = np.zeros((len(badArticles), len(bad_article_words)))

    Vb = util.Vocab()

    good_sent_len_sum = 0
    bad_sent_len_sum = 0

    type_token_ratios_good = []
    type_token_ratios_bad = []

    top = 20
    termVector_good = np.zeros(len(good_article_words))
    termVector_bad = np.zeros(len(bad_article_words))

    i = 0
    for good_article, bad_article in izip(goodArticles, badArticles):

        vocab_good = Vb.from_corpus(good_article, good_article_words)