def feat_most_common_words(input_data_file):
    articles = importArticles(input_data_file)
    top = 10
    mcw_feature = np.zeros(shape=(len(articles), 2))

    # Build the global word list across all articles.
    article_words = []
    for article in articles:
        for sent in article:
            for word in sent.split(" "):
                if word not in article_words:
                    article_words.append(word)

    Vb = util.Vocab()
    # Hoisted out of the loop: stopwords plus the unknown-word token.
    stop_words = set(stopwords.words("english")) | {"<unk>"}
    for i, article in enumerate(articles):
        vocab = Vb.from_corpus(article, article_words)
        termVector = vocab.termVector
        idxs = np.argsort(termVector)[::-1]

        # Keep the `top` most frequent words that are not stopwords or "<unk>".
        top_words = []
        for idx in idxs:
            if len(top_words) == top:
                break
            if article_words[idx].lower() not in stop_words:
                top_words.append(article_words[idx])

        # Feature 1: average character length of the top words.
        mcw_feature[i][0] = sum(len(word) for word in top_words) / top
        # Feature 2: combined frequency of the top words, normalized by the
        # number of distinct words the article uses.
        mcw_feature[i][1] = sum(
            termVector[article_words.index(word)] for word in top_words
        ) / (top * np.count_nonzero(termVector))
    return mcw_feature
def load_datasets_and_dataloaders():
    train_dataset, train_dataloader = util.initialize_dataset_and_dataloader(
        util.config["train_data_file_path"], config["train_batch_size"], shuffle=True)
    validate_dataset, validate_dataloader = util.initialize_dataset_and_dataloader(
        util.config["validate_data_file_path"], config["validate_batch_size"])
    test_dataset, test_dataloader = util.initialize_dataset_and_dataloader(
        util.config["test_data_file_path"], config["validate_batch_size"])

    # create vocabs from the training split and assign them to all splits
    text_frequencies, label_frequencies = util.frequencies(train_dataset.instances)
    text_vocab = util.Vocab(text_frequencies)
    label_vocab = util.Vocab(label_frequencies)
    train_dataset.text_vocab = validate_dataset.text_vocab = test_dataset.text_vocab = text_vocab
    train_dataset.label_vocab = validate_dataset.label_vocab = test_dataset.label_vocab = label_vocab

    return (train_dataset, train_dataloader,
            validate_dataset, validate_dataloader,
            test_dataset, test_dataloader)
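# Minimal usage sketch (not from the original source): the unpacking order
# simply mirrors the return statement above.
# (train_dataset, train_dataloader,
#  validate_dataset, validate_dataloader,
#  test_dataset, test_dataloader) = load_datasets_and_dataloaders()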
def feat_lda(input_data_file):
    # RETRIEVE ALL WORDS IN GOOD ARTICLES
    goodArticles = importScores('goodArticles.txt')
    good_article_words = []
    for article in goodArticles:
        for sent in article:
            for word in sent.split(" "):
                if word not in good_article_words:
                    good_article_words.append(word)

    # LOAD GOOD LDA MODEL
    ldaModel = loadModel("lda-goodArticles.model")
    topic_word = ldaModel.topic_word_
    n_top_words = 30
    n_topics = 50
    topic_words = []
    for topic_dist in topic_word:
        topic_words.append(
            np.array(good_article_words)[np.argsort(topic_dist)][:-n_top_words:-1])

    # TEST ARTICLE
    articles = importArticles(input_data_file)
    topic_coverage = np.zeros((len(articles), n_topics * 2))
    article_words = []
    for article in articles:
        for sent in article:
            for word in sent.split(" "):
                if word not in article_words:
                    article_words.append(word)

    Vb = util.Vocab()
    docTerm = np.zeros((len(articles), len(article_words)))
    # Debug: show the words of one topic.
    print(" ".join(topic_words[30]))

    # TOPIC COVERAGE CALCULATION
    # Each topic word's count is weighted by its rank within the topic.
    for i, article in enumerate(articles):
        vocab = Vb.from_corpus(article, article_words)
        docTerm[i] = vocab.termVector
        for j, topic_list in enumerate(topic_words):
            for k, word in enumerate(topic_list):
                if word in vocab.w2i:
                    idx = vocab.w2i[word]
                    topic_coverage[i, j] += docTerm[i][idx] * (n_top_words - k)

    # Second half of the feature: per-document topic distribution from the LDA model.
    doc_topic_test = ldaModel.transform(docTerm.astype('int64'))
    topic_coverage[:, n_topics:] = doc_topic_test
    return topic_coverage
def main():
    if 'train' not in args.mode:
        # Disable dropout and run one example at a time when evaluating.
        args.rnn_keep_rate = 1.0
        args.fcn_keep_rate = 1.0
        args.batch_size = 1
    args.data_path = './data/nli/snli_1.0/snli_1.0_train.bin' if 'esim' in args.mode else './data/main/train_binary.bin'

    args.model_path = os.path.join(args.model_path, args.exp_name).format(args.model)
    print(args.model_path)
    if not os.path.exists(args.model_path):
        # Non-training modes require an existing model directory.
        if 'train' not in args.mode:
            print(args.model_path)
            raise ValueError
        os.makedirs(args.model_path)

    if 'esim' in args.mode:
        args.batch_size = args.esim_batch_size
        assert 'esim' in args.model
    else:
        args.rnn_keep_rate = 1.0
        args.fcn_keep_rate = 1.0
        args.batch_size = args.main_batch_size

    print("Default model path: {}".format(args.model_path))
    print('code start/ {} mode / {} model'.format(args.mode, args.model))
    util.assign_specific_gpu(args.gpu_nums)

    vocab = util.Vocab()
    vardicts = util.get_pretrain_weights(
        args.pretrain_ckpt_path) if args.mode == 'train' else None

    if args.model == 'main':
        model = Model(vocab, args)
    elif args.model == 'esim':
        model = ESIM(vocab, args)
    else:
        raise ValueError
    print("model build end.")

    if args.mode in ['train', 'esim_train']:
        train(model, vocab, vardicts)
    elif args.mode in ['eval', 'esim_eval']:
        eval(model, vocab)
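# Hedged sketch of a module entry point; it assumes `args` is produced by an
# argparse setup elsewhere in the original file (not shown in this excerpt).
if __name__ == '__main__':
    main()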
def feat_avg_sent_len(input_data_file):
    articles = importArticles(input_data_file)
    article_words = []
    for article in articles:
        for sent in article:
            for word in sent.split(" "):
                if word not in article_words:
                    article_words.append(word)

    avg_sent_len_articles = []
    Vb = util.Vocab()
    for article in articles:
        vocab = Vb.from_corpus(article, article_words)
        avg_sent_len_articles.append(vocab.avg_sent_len)
    # Column vector: one average sentence length per article.
    return np.transpose(np.matrix(avg_sent_len_articles))
def feat_type_token_ratio(input_data_file):
    articles = importArticles(input_data_file)
    article_words = []
    for article in articles:
        for sent in article:
            for word in sent.split(" "):
                if word not in article_words:
                    article_words.append(word)

    Vb = util.Vocab()
    type_token_ratios = []
    for article in articles:
        vocab = Vb.from_corpus(article, article_words)
        # Type-token ratio: distinct words divided by total word count.
        type_token_ratios.append(
            np.count_nonzero(vocab.termVector) / np.sum(vocab.termVector))
    # Returns a (1, n_articles) row matrix.
    return np.matrix(type_token_ratios)
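# Illustrative sketch only (the file name is a placeholder, not from the source):
# combining the per-article features into a single design matrix. Note that
# feat_type_token_ratio returns a (1, n_articles) row, so it is transposed first.
# X = np.hstack([
#     feat_most_common_words("articles.txt"),
#     feat_avg_sent_len("articles.txt"),
#     np.transpose(feat_type_token_ratio("articles.txt")),
# ])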
def feat_lda(input_data_file):
    # RETRIEVE ALL WORDS IN GOOD ARTICLES
    goodArticles = importScores('goodArticles.txt')
    good_article_words = []
    for article in goodArticles:
        for sent in article:
            for word in sent.split(" "):
                if word not in good_article_words:
                    good_article_words.append(word)

    # LOAD GOOD LDA MODEL
    ldaModel = loadModel("lda-goodArticles.model")
    topic_word = ldaModel.topic_word_
    n_top_words = 20
    n_topics = 50
    topic_words = []
    for topic_dist in topic_word:
        topic_words.append(
            np.array(good_article_words)[np.argsort(topic_dist)][:-n_top_words:-1])

    # TEST ARTICLE
    articles = importArticles(input_data_file)
    topic_coverage = np.zeros((len(articles), n_topics))
    article_words = []
    for article in articles:
        for sent in article:
            for word in sent.split(" "):
                if word not in article_words:
                    article_words.append(word)

    Vb = util.Vocab()
    docTerm = np.zeros((len(articles), len(article_words)))

    # TOPIC COVERAGE CALCULATION (disabled in this version)
    # for i, article in enumerate(articles):
    #     vocab = Vb.from_corpus(article, article_words)
    #     docTerm[i] = vocab.termVector
    #     for j, topic_list in enumerate(topic_words):
    #         for k, word in enumerate(topic_list):
    #             if word in vocab.w2i:
    #                 idx = vocab.w2i[word]
    #                 topic_coverage[i, j] += docTerm[i][idx] * (n_top_words - k)

    doc_topic_test = ldaModel.transform(docTerm.astype('int64'))

    # Row-normalized topic distribution (computed but unused; the raw
    # distribution is returned below).
    normed_doc_topic_test = np.empty((len(articles), n_topics))
    for i in range(doc_topic_test.shape[0]):
        rsum = np.sum(doc_topic_test[i])
        if rsum == 0:
            normed_doc_topic_test[i] = doc_topic_test[i]
            continue
        normed_doc_topic_test[i] = np.array([e / rsum for e in doc_topic_test[i]])
    # normed_matrix = normalize(doc_topic_test, axis=1, norm='l1')
    # topic_coverage[:, n_topics:] = doc_topic_test
    return doc_topic_test
    # (Fragment: continues from inside an enclosing loop over goodArticles.)
    for sent in article:
        for word in sent.split(" "):
            if word not in good_article_words:
                good_article_words.append(word)

bad_article_words = []
for article in badArticles:
    for sent in article:
        for word in sent.split(" "):
            if word not in bad_article_words:
                bad_article_words.append(word)

docTerm_good = np.zeros((len(goodArticles), len(good_article_words)))
docTerm_bad = np.zeros((len(badArticles), len(bad_article_words)))
Vb = util.Vocab()
good_sent_len_sum = 0
bad_sent_len_sum = 0
type_token_ratios_good = []
type_token_ratios_bad = []
top = 20
termVector_good = np.zeros(len(good_article_words))
termVector_bad = np.zeros(len(bad_article_words))

i = 0
# zip replaces Python 2's itertools.izip used in the original.
for good_article, bad_article in zip(goodArticles, badArticles):
    vocab_good = Vb.from_corpus(good_article, good_article_words)