def reuters_dataset():
    nltk.download('reuters')
    nltk.download('stopwords')
    stop_words = stopwords.words("english")
    documents = reuters.fileids()
    train_docs_id = [doc for doc in documents if doc.startswith("train")]
    test_docs_id = [doc for doc in documents if doc.startswith("test")]
    train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
    test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]
    print(len(train_docs), len(test_docs))
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    vectorised_train_documents = vectorizer.fit_transform(train_docs)
    vectorised_test_documents = vectorizer.transform(test_docs)
    # print([reuters.categories(doc_id) for doc_id in test_docs_id])
    mlb = MultiLabelBinarizer()
    train_labels = mlb.fit_transform(
        [reuters.categories(doc_id) for doc_id in train_docs_id])
    test_labels = mlb.transform(
        [reuters.categories(doc_id) for doc_id in test_docs_id])
    return (vectorised_train_documents.toarray(),
            vectorised_test_documents.toarray(), train_labels, test_labels)
def load_data(config={}):
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words, binary=True)
    mlb = MultiLabelBinarizer()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]

    xs = {'train': [], 'test': []}
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()

    ys = {'train': [], 'test': []}
    ys['train'] = mlb.fit_transform(
        [reuters.categories(doc_id) for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id) for doc_id in test])

    data = {
        'x_train': xs['train'],
        'y_train': ys['train'],
        'x_test': xs['test'],
        'y_test': ys['test'],
        'labels': reuters.categories()
    }
    print(data['x_train'])
    print(data['y_train'])
    return data, vectorizer.vocabulary_
def collection_stats():
    # List of documents
    documents = reuters.fileids()
    print(str(len(documents)) + " documents")
    train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
    print(str(len(train_docs)) + " total train documents")
    test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
    print(str(len(test_docs)) + " total test documents")

    # List of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")

    # Documents in a category
    category_docs = reuters.fileids("acq")

    # Words for a document
    document_id = category_docs[0]
    document_words = reuters.words(category_docs[0])
    print(document_words)

    # Raw document
    print(reuters.raw(document_id))

    tokens = []
    for docid in train_docs:
        t = tokenize(reuters.raw(docid))
        tokens.extend(t)
    print(tokens[0])
    v = set(tokens)
    print("number of terms=", len(tokens))
    print("voc size=", len(v))
def load_data():
    """
    Load the Reuters dataset.

    Returns
    -------
    train_docs, train_labels, test_docs, test_labels, class names.
    """
    documents = reuters.fileids()

    train = [d for d in documents if d.startswith('training/')]
    train_docs = [reuters.raw(doc_id) for doc_id in train]
    train_docs = [text_prepare(x) for x in train_docs]
    train_labels = [reuters.categories(doc_id) for doc_id in train]

    test = [d for d in documents if d.startswith('test/')]
    test_docs = [reuters.raw(doc_id) for doc_id in test]
    test_docs = [text_prepare(x) for x in test_docs]
    test_labels = [reuters.categories(doc_id) for doc_id in test]

    print("len(train_docs)={}, len(train_labels)={}".format(
        len(train_docs), len(train_labels)))
    print("len(test_docs)={}, len(test_labels)={}".format(
        len(test_docs), len(test_labels)))

    # `labels` is assumed to be a module-level list of all category names.
    mlb = MultiLabelBinarizer(classes=sorted(labels))
    train_labels = mlb.fit_transform(train_labels)
    # Use transform (not fit_transform) so test labels share the train encoding.
    test_labels = mlb.transform(test_labels)
    print("y_train.shape={}, y_test.shape={}".format(train_labels.shape,
                                                     test_labels.shape))

    return (train_docs, train_labels, test_docs, test_labels, mlb.classes_)
def get_reuters():
    # Documents in a category
    category_trade = reuters.fileids("trade")
    category_money = reuters.fileids("money-fx") + reuters.fileids(
        "money-supply")
    category_interest = reuters.fileids("interest")

    text_trade = [reuters.raw(doc_id) for doc_id in category_trade]
    text_money = [reuters.raw(doc_id) for doc_id in category_money]
    # Iterate over the category ids, not the (initially empty) text list.
    text_interest = [reuters.raw(doc_id) for doc_id in category_interest]

    trade_data = pd.DataFrame(data={'text': text_trade})
    trade_data['category'] = 'TRADE'
    money_data = pd.DataFrame(data={'text': text_money})
    money_data['category'] = 'MONEY'
    interest_data = pd.DataFrame(data={'text': text_interest})
    interest_data['category'] = 'INTEREST'

    # picking only relevant columns
    selected_columns = ['text', 'category']
    # DataFrame.append was removed in pandas 2.0; concat the frames instead.
    df = pd.concat([trade_data, money_data, interest_data], ignore_index=True)
    return df
def get_word2idx():
    import cli
    config = cli.config
    docs, label_seqs, decode_inp, seq_len = load_hclf_reuters(config, "train")
    docs_train = [tokenize(reuters.raw(doc_id)) for doc_id in docs]
    docs, label_seqs, decode_inp, seq_len = load_hclf_reuters(config, "test")
    docs_test = [tokenize(reuters.raw(doc_id)) for doc_id in docs]
    docs = docs_train + docs_test

    max_docs_length = 0
    word2idx = Counter()
    word2idx["UNK"] = 0
    word2idx["NULL"] = 1  # for pad
    idx2word = []
    idx2word += ["UNK", "NULL"]
    for doc in docs:
        max_docs_length = len(
            doc) if len(doc) > max_docs_length else max_docs_length
        for token in doc:
            if token not in word2idx:
                word2idx[token] = len(word2idx)
                idx2word += [token]
    print(len(word2idx))
    #for i in range(len(idx2word)):
    #    print(idx2word[i], word2idx[idx2word[i]])
    shared = {"word2idx": word2idx, "idx2word": idx2word}
    json.dump(shared, open("data/word2idx_new.json", "w"))
def load_data(config={}):
    """
    Load the Reuters dataset.

    Returns
    -------
    data : dict with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    mlb = MultiLabelBinarizer()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]

    xs = {'train': [], 'test': []}
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()

    ys = {'train': [], 'test': []}
    ys['train'] = mlb.fit_transform(
        [reuters.categories(doc_id) for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id) for doc_id in test])

    data = {
        'x_train': xs['train'],
        'y_train': ys['train'],
        'x_test': xs['test'],
        'y_test': ys['test'],
        'labels': globals()["labels"]
    }
    return data
def prepareArticles(documentsIds=reuters.fileids()):
    """
    Organizes articles in a dictionary structure
    @params documentsIds: a list of document ids - by default get reuter's documents ids
    @rtype {List} of (id, title, text) tuples for suitably sized documents
    """
    allDocs = []
    for id in documentsIds:
        raw = reuters.raw(fileids=id)
        title = getDocTitle(raw)
        doc = {
            'title': title,
            'size': len(raw),
            'text': removeTitleFromText(raw.replace('\n', ''), title),
            'id': id
        }
        allDocs.append(doc)
    sortedDocs = sorted(allDocs, key=lambda x: x['size'])
    suitableDocs = [
        doc for doc in sortedDocs
        if doc['size'] >= 1000 and doc['size'] <= 2000
    ]
    suitableDocsIds = [
        at(doc, 'id', 'title', 'text') for doc in suitableDocs
        if len(nltk.sent_tokenize(doc['text'])) > 3
    ]
    return suitableDocsIds
def extract_subset_data(seed=1337):
    train_data = {}
    test_data = {}
    random.seed(seed)  # np.random.choice()
    for (label, train_amount, test_amount) in LABELS:
        train_category_id = list(
            filter(lambda x_train: x_train.startswith('train'),
                   reuters.fileids(label)))  # list of ids in train category
        random.shuffle(train_category_id)
        train_data[label] = [
            preprocessing(reuters.raw(train))
            for train in train_category_id[:train_amount]
        ]  # processed subset
        test_category_id = list(
            filter(lambda x_test: x_test.startswith('test'),
                   reuters.fileids(label)))  # list of ids in test category
        random.shuffle(test_category_id)
        test_data[label] = [
            preprocessing(reuters.raw(test))
            for test in test_category_id[:test_amount]
        ]  # processed subset
    return train_data, test_data
def getDocIDs_top10():
    # Top 10 Categories
    documents = [
        f for f in reuters.fileids() if len(reuters.categories(fileids=f)) == 1
    ]
    train_docs_id = list(
        filter(
            lambda doc: doc.startswith("train") and len(reuters.raw(doc)) > 51,
            documents))
    test_docs_id = list(
        filter(
            lambda doc: doc.startswith("test") and len(reuters.raw(doc)) > 51,
            documents))
    new_train_docs_id = []
    new_test_docs_id = []
    for cat in reuters.categories():
        li = [f for f in reuters.fileids(categories=cat) if f in train_docs_id]
        li_te = [
            f for f in reuters.fileids(categories=cat) if f in test_docs_id
        ]
        if len(li) > 20 and len(li_te) > 20:
            new_train_docs_id.extend(li)
            new_test_docs_id.extend(li_te)
    train_docs_id = new_train_docs_id
    test_docs_id = new_test_docs_id
    return (train_docs_id, test_docs_id)
def get_raw_data():
    nltk.download("reuters")
    from nltk.corpus import reuters
    documents = reuters.fileids()
    train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
    test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))
    X_train = [reuters.raw(doc_id) for doc_id in train_docs_id]
    X_test = [reuters.raw(doc_id) for doc_id in test_docs_id]
    y_train = [reuters.categories(doc_id) for doc_id in train_docs_id]
    y_test = [reuters.categories(doc_id) for doc_id in test_docs_id]
    all_data = X_train + X_test
    all_labels = y_train + y_test
    # Binarized labels are computed here but not returned by this function.
    mlb = MultiLabelBinarizer()
    datas_y = mlb.fit_transform(all_labels)
    return all_data, all_labels
def get_data_splits():
    train_docs, train_labels = zip(*[(reuters.raw(i), reuters.categories(i))
                                     for i in reuters.fileids()
                                     if i.startswith('training/')])
    test_docs, test_labels = zip(*[(reuters.raw(i), reuters.categories(i))
                                   for i in reuters.fileids()
                                   if i.startswith('test/')])
    return train_docs, train_labels, test_docs, test_labels
def main():
    train_docs = []  # contains train document numbers
    test_docs = []  # contains test document numbers
    train_category_docs = {}  # contains category corresponding train documents
    test_category_docs = {}  # contains category corresponding test documents
    train_data = {}  # contains train document numbers corresponding data
    test_data = {}  # contains test document numbers corresponding data
    categories = reuters.categories()  # Total categories list

    # In Python 3, csv files are opened in text mode with newline=''.
    with open("category_train_docs.csv", "w", newline="") as f:
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)
        for category_name in categories:
            category_docs = reuters.fileids(category_name)
            train_list = []
            test_list = []
            for category_id in category_docs:
                if category_id.startswith("train"):
                    train_list.append(category_id.split('/')[1])
                else:
                    test_list.append(category_id.split('/')[1])
            writer.writerow([category_name] + train_list)

    # Note: this early exit() makes everything below unreachable.
    exit()

    for doc_id in reuters.fileids():
        if doc_id.startswith("train"):
            train_docs.append(doc_id)
            train_data[doc_id] = tokenize(reuters.raw(doc_id))
            doc_number = doc_id.split('/')[1]
            build_index_train(tokenize(reuters.raw(doc_id)), doc_number)
        else:
            test_docs.append(doc_id)
            test_data[doc_id] = tokenize(reuters.raw(doc_id))
            doc_number = doc_id.split('/')[1]
            build_index_test(tokenize(reuters.raw(doc_id)), doc_number)

    with open("inverted_train_index.csv", "w", newline="") as f:
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)
        for words in inverted_index_train:
            if len(inverted_index_train[words]) >= 3:
                inverted_index_train_pruned[words] = inverted_index_train[words]
                writer.writerow([words] + inverted_index_train_pruned[words])
        for words in inverted_index_test:
            if len(inverted_index_test[words]) >= 3:
                inverted_index_test_pruned[words] = inverted_index_test[words]
def fetch_raw(self, dummy_input):
    # only applicable for raw dataset
    if self.name != self.base_name:
        return None
    info("Downloading raw {} dataset".format(self.name))
    if not self.nltk_dataset_resource_exists(Reuters.name):
        nltk_download(self.config, "reuters")
    # get ids
    categories = reuters.categories()
    self.num_labels = len(categories)
    self.label_names = []

    # train / test labels
    samples = {}
    train_docs, test_docs = [], []
    doc2labels = {}

    # get content
    for cat_index, cat in enumerate(categories):
        samples[cat] = [0, 0]
        # get all docs in that category
        for doc in reuters.fileids(cat):
            # document to label mappings
            if doc not in doc2labels:
                # not encountered: init document label list
                doc2labels[doc] = []
                if doc.startswith("training"):
                    train_docs.append(doc)
                else:
                    test_docs.append(doc)
            # count samples
            if doc.startswith("training"):
                samples[cat][0] += 1
            else:
                samples[cat][1] += 1
            # append the label
            doc2labels[doc].append(cat_index)

    doc2labels, label_set = self.delete_no_sample_labels(samples, doc2labels)

    self.train, self.test = [], []
    self.train_labels, self.test_labels = [], []
    # assign label lists
    for doc in train_docs:
        self.train.append(reuters.raw(doc))
        self.train_labels.append(doc2labels[doc])
    for doc in test_docs:
        self.test.append(reuters.raw(doc))
        self.test_labels.append(doc2labels[doc])
    self.label_names = label_set
    # self.labelset = list(sorted(set(self.train_labels)))
    self.roles = "train", "test"
    info("Loaded {} train & {} test instances.".format(
        len(self.train), len(self.test)))
    return self.get_all_raw()
def _extract(ids):
    X_train = [
        re.sub('[ \t\n]+', ' ', reuters.raw(i)) for i in ids if 'train' in i
    ]
    X_test = [
        re.sub('[ \t\n]+', ' ', reuters.raw(i)) for i in ids if 'test' in i
    ]
    return X_train, X_test
def __init__(self, min_eic=5):
    self.test_classes = []
    self.test_docs = []
    self.train_classes = []
    self.train_docs = []
    self.table_of_classes = []
    self.num_of_instances = []
    # minimal encounter in classes
    self.min_eic = min_eic

    if Path("training_cache/train_docs").is_file() and Path("training_cache/train_classes").is_file() \
            and Path("training_cache/test_docs").is_file() and Path("training_cache/test_classes").is_file() \
            and Path("classify_cache/table_of_classes").is_file():
        self.train_docs = joblib.load("training_cache/train_docs")
        self.train_classes = joblib.load("training_cache/train_classes")
        self.test_docs = joblib.load("training_cache/test_docs")
        self.test_classes = joblib.load("training_cache/test_classes")
        self.table_of_classes = joblib.load("classify_cache/table_of_classes")
    else:
        raw_test_classes = []
        raw_train_classes = []
        for doc_id in reuters.fileids():
            if doc_id.startswith("train"):
                self.train_docs.append(
                    prepare_text_for_analysis(reuters.raw(doc_id)))
                raw_train_classes.append(reuters.categories(doc_id))
            else:
                self.test_docs.append(
                    prepare_text_for_analysis(reuters.raw(doc_id)))
                raw_test_classes.append(reuters.categories(doc_id))
        self.make_table_of_classes(raw_train_classes)
        self.train_classes = self.transform_classes(raw_train_classes, "train")
        self.test_classes = self.transform_classes(raw_test_classes, "test")
        joblib.dump(self.train_docs, "training_cache/train_docs", compress=9)
        joblib.dump(self.train_classes, "training_cache/train_classes",
                    compress=9)
        joblib.dump(self.test_docs, "training_cache/test_docs", compress=9)
        joblib.dump(self.test_classes, "training_cache/test_classes",
                    compress=9)
        joblib.dump(self.table_of_classes, "classify_cache/table_of_classes",
                    compress=9)
def load_test_data():
    for id in test_corn_ids:
        test_corn_target.append(0)
        test_corn.append(reuters.raw(id))
    for id in test_wheat_ids:
        test_wheat_target.append(1)
        test_wheat.append(reuters.raw(id))
    test = test_corn + test_wheat
    test_target = test_corn_target + test_wheat_target
    return test, test_target
def reuters_dataset(directory='../data', train=True, test=False, clean_txt=False):
    """
    Load the Reuters-21578 dataset.

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): If to load the training split of the dataset.
        test (bool, optional): If to load the test split of the dataset.

    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset` or
        :class:`torchnlp.datasets.Dataset`:
        Returns between one and all dataset splits (train and test) depending
        on if their respective boolean argument is ``True``.
    """
    # nltk.download('reuters', download_dir=directory)
    if directory not in nltk.data.path:
        nltk.data.path.append(directory)

    doc_ids = reuters.fileids()

    ret = []
    splits = [
        split_set for (requested, split_set) in [(train, 'train'), (test, 'test')]
        if requested
    ]

    for split_set in splits:
        split_set_doc_ids = list(
            filter(lambda doc: doc.startswith(split_set), doc_ids))
        examples = []

        for id in split_set_doc_ids:
            if clean_txt:
                text = clean_text(reuters.raw(id))
            else:
                text = ' '.join(word_tokenize(reuters.raw(id)))
            labels = reuters.categories(id)

            examples.append({
                'text': text,
                'label': labels,
            })

        ret.append(Dataset(examples))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
def main():
    train_docs = []
    test_docs = []
    for doc_id in reuters.fileids():
        if doc_id.startswith("train"):
            train_docs.append(reuters.raw(doc_id))
        else:
            test_docs.append(reuters.raw(doc_id))
    representer = tf_idf(train_docs)
    for doc in test_docs:
        print(feature_values(doc, representer))
def main():
    collection_stats()
    print("Starting classifier ..")
    X_train = list()
    X_test = list()
    y_train = list()
    y_test = list()
    print("Reading training and testing data ..")
    for doc_id in reuters.fileids():
        if doc_id.startswith("train"):
            X_train.append(reuters.raw(doc_id))
            y_train.append(reuters.categories(doc_id))
        else:
            X_test.append(reuters.raw(doc_id))
            y_test.append(reuters.categories(doc_id))
    X_train = numpy.array(X_train)
    y_train = numpy.array(y_train)
    X_test = numpy.array(X_test)
    y_test = numpy.array(y_test)

    binarizer = MultiLabelBinarizer(classes=reuters.categories())
    classifier = Pipeline([
        ('vectorizer', TfidfVectorizer(tokenizer=tokenize,
                                       min_df=0,
                                       max_df=0.90,
                                       max_features=3000,
                                       use_idf=True,
                                       sublinear_tf=True)),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

    print("Training classifier ..")
    classifier.fit(X_train, binarizer.fit_transform(y_train))

    print("Testing classifier ..")
    res = classifier.predict(X_test)
    # Binarize the test labels once with the already-fitted binarizer.
    y_test_bin = binarizer.transform(y_test)
    hard_precision = classifier.score(X_test, y_test_bin)
    # sklearn metrics expect the true labels first, then the predictions.
    precision = average_precision_score(y_test_bin, res, average=None)
    recall = recall_score(y_test_bin, res, average=None)
    f1score = f1_score(y_test_bin, res, average=None)
    print("Hard precision: " + str(hard_precision))
    log_results(reuters.categories(), precision, recall, f1score)
def main():
    train_docs = []
    test_docs = []
    for doc_id in reuters.fileids():
        if doc_id.startswith("train"):
            train_docs.append(reuters.raw(doc_id))
        else:
            test_docs.append(reuters.raw(doc_id))
    representer = tf_idf(train_docs)
def load_train_data():
    train = []
    train_target = []
    for id in train_corn_ids:
        train_corn_target.append(0)
        train_corn.append(reuters.raw(id))
    for id in train_wheat_ids:
        train_wheat_target.append(1)
        train_wheat.append(reuters.raw(id))
    train = train_corn + train_wheat
    train_target = train_corn_target + train_wheat_target
    return train, train_target
def getDocIDs_90():
    # 90 Categories
    documents = reuters.fileids()
    train_docs_id = list(
        filter(
            lambda doc: doc.startswith("train") and len(reuters.raw(doc)) > 51,
            documents))
    test_docs_id = list(
        filter(
            lambda doc: doc.startswith("test") and len(reuters.raw(doc)) > 51,
            documents))
    return (train_docs_id, test_docs_id)
def get_default_split():
    documents = reuters.fileids()
    train_docs_id = list(
        filter(lambda doc: doc.startswith("train"), documents))
    test_docs_id = list(
        filter(lambda doc: doc.startswith("test"), documents))
    X_train = [reuters.raw(doc_id) for doc_id in train_docs_id]
    X_test = [reuters.raw(doc_id) for doc_id in test_docs_id]
    Y_train = [reuters.categories(doc_id) for doc_id in train_docs_id]
    Y_test = [reuters.categories(doc_id) for doc_id in test_docs_id]
    return X_train, Y_train, X_test, Y_test
def load_data(valid_percent=0.1):
    """
    Load the Reuters dataset.

    Returns:
        raw text and raw labels for train, valid, test set.
    """
    nltk.download('reuters')
    n_classes = 90
    labels = reuters.categories()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]

    ys = {'train': [], 'test': []}
    ys['train'] = [reuters.categories(doc_id) for doc_id in train]
    ys['test'] = [reuters.categories(doc_id) for doc_id in test]

    # Validation
    n_valid = int(valid_percent * len(ys['train']))
    np.random.seed(5)
    idxs = np.random.choice(len(ys['train']), n_valid, replace=False)
    idx_set = set(idxs)

    docs['valid'] = []
    ys['valid'] = []
    train_docs = []
    train_y = []
    for idx, (x, y) in enumerate(zip(docs['train'], ys['train'])):
        if idx in idx_set:
            docs['valid'].append(x)
            ys['valid'].append(y)
        else:
            train_docs.append(x)
            train_y.append(y)

    data = {
        'x_train': train_docs,
        'y_train': train_y,
        'x_valid': docs['valid'],
        'y_valid': ys['valid'],
        'x_test': docs['test'],
        'y_test': ys['test'],
        'labels': labels
    }
    return data
def stats(self):
    """
    :return: Important statistics about the dataset - numbers of documents in
        different classes with corresponding percentages, as well as vocabulary
        sizes for every class.
    """
    lt = LemmaTokenizer()
    train_stats = {}
    test_stats = {}
    for c in reuters.categories():
        train_stats[c] = {
            'num_of_docs': 0,
            'percentage': 0.0,
            'words': set([])
        }
        test_stats[c] = {
            'num_of_docs': 0,
            'percentage': 0.0,
            'words': set([])
        }
    for d in self.train:
        c = reuters.categories(d)[0]
        train_stats[c]['num_of_docs'] += 1
        train_stats[c]['words'] |= set(lt.lemma_tokenize(reuters.raw(d)))
    for d in self.test:
        c = reuters.categories(d)[0]
        test_stats[c]['num_of_docs'] += 1
        test_stats[c]['words'] |= set(lt.lemma_tokenize(reuters.raw(d)))

    s_train = sum(train_stats[c]['num_of_docs'] for c in train_stats.keys())
    s_test = sum(test_stats[c]['num_of_docs'] for c in test_stats.keys())

    res = ({}, {})
    for c in train_stats.keys():
        if train_stats[c]['num_of_docs'] != 0:
            train_stats[c]['percentage'] = train_stats[c]['num_of_docs'] / s_train
            train_stats[c]['words'] = len(train_stats[c]['words'])
            res[0][c] = train_stats[c]
    for c in test_stats.keys():
        if test_stats[c]['num_of_docs'] != 0:
            test_stats[c]['percentage'] = test_stats[c]['num_of_docs'] / s_test
            test_stats[c]['words'] = len(test_stats[c]['words'])
            res[1][c] = test_stats[c]
    return res
def compute_idf():
    words = set()
    for fileid in reuters.fileids():
        tokens = tokenize(reuters.raw(fileid))
        words.update(tokens)
    idf = dict.fromkeys(words, 0)
    for fileid in reuters.fileids():
        tokens = set(tokenize(reuters.raw(fileid)))
        for token in tokens:
            idf[token] += 1
    total = len(reuters.fileids())
    for word in words:
        idf[word] = math.log(total / (1 + idf[word]))
    return idf
def load_data():
    docs = reuters.fileids()
    train_ids = [doc for doc in docs if doc.startswith("train")]
    test_ids = [doc for doc in docs if doc.startswith("test")]
    train_data = pd.DataFrame(
        [(reuters.raw(id), reuters.categories(id)[0]) for id in train_ids],
        columns=('text', 'labels'))
    test_data = pd.DataFrame(
        [(reuters.raw(id), reuters.categories(id)[0]) for id in test_ids],
        columns=('text', 'labels'))
    return train_data, test_data
def load(self):
    logger.info('Starting processing reuters dataset.')
    self.df = pd.DataFrame([{
        'doc_id': doc_id,
        'abspath': str(reuters.abspath(doc_id)),
        'categories': [c + ' ' for c in reuters.categories(doc_id)],
        'headline': reuters.raw(doc_id).split('\n', 1)[0],
        'length': len(reuters.raw(doc_id))
    } for doc_id in reuters.fileids()])
    logger.info('Finishing processing reuters dataset.')
def get_train_test_reauter_data():
    train_docs = []
    test_docs = []
    for doc_id in reuters.fileids():
        if doc_id.startswith("train"):
            train_docs.append(reuters.raw(doc_id))
        else:
            test_docs.append(reuters.raw(doc_id))
    sliceObject = slice(5)
    train_docs = train_docs[sliceObject]
    test_docs = test_docs[sliceObject]
    return train_docs, test_docs
def collection_stats():
    # List of documents
    documents = reuters.fileids()
    print(str(len(documents)) + " documents")
    train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
    print(str(len(train_docs)) + " total train documents")
    test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
    print(str(len(test_docs)) + " total test documents")

    # List of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")

    # Documents in a category
    category_docs = reuters.fileids("acq")

    # Words for a document
    document_id = category_docs[0]
    document_words = reuters.words(category_docs[0])
    print(document_words)

    # Raw document
    print(reuters.raw(document_id))
def main():
    train_docs = []
    test_docs = []
    collection_stats()
    for doc_id in reuters.fileids():
        if doc_id.startswith("train"):
            train_docs.append(reuters.raw(doc_id))
        else:
            test_docs.append(reuters.raw(doc_id))
    representer = tf_idf(train_docs)
    # Note: nltk.NaiveBayesClassifier.train expects (featureset, label) pairs,
    # so the raw documents would need to be converted to labelled feature
    # dicts before this call will work.
    classifier = nltk.NaiveBayesClassifier.train(train_docs)
    print(nltk.classify.accuracy(classifier, test_docs))
def run():
    """Import the Reuters Corpus which contains 10,788 news articles"""
    from nltk.corpus import reuters
    raw_docs = [reuters.raw(fileid) for fileid in reuters.fileids()]

    # Select 100 documents randomly
    rand_idx = random.sample(range(len(raw_docs)), 100)
    raw_docs = [raw_docs[i] for i in rand_idx]

    # Preprocess Documents
    tokenized_docs = [ie_preprocess(doc) for doc in raw_docs]

    # Remove single occurrence words
    docs = remove_infrequent_words(tokenized_docs)

    # Create dictionary and corpus
    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    # Build LDA model
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10)
    for topic in lda.show_topics():
        print(topic)
def create_tfidf_data(docs, categories, n=None):
    """
    Builds a [(label, [words])] structure by parsing the documents
    :param docs: list of Reuters document ids
    :param categories: names of the categories to consider
    :param n: number of documents to use
    :return: list
    """
    if n:
        docs = docs[:n]

    cat_num = {}
    i = 1
    for c in categories:
        cat_num[c] = i
        i += 1

    y = []
    corpus = []
    for d in docs:
        c = reuters.categories(d)[0]
        if c in categories:
            y.append(getSVMCategory(cat_num[c]))
            corpus.append(reuters.raw(d).lower())
    return y, corpus
def format_data(docs, all_categories):
    y = []
    corpus = []
    for d in docs:
        # Use a list comprehension so this also works on Python 3, where
        # filter() returns an iterator that cannot be indexed.
        current_categories = [x for x in reuters.categories(d)
                              if x in all_categories]
        if current_categories:
            y.append(current_categories[0])
            corpus.append(reuters.raw(d).lower())
    return y, corpus
def construct_freq(self, text=None):
    if text is None:
        from nltk.corpus import reuters
        text = reuters.raw()
    # Get rid of \n
    c_text = text.replace('\n', '')
    self.letter_freq = nltk.FreqDist(c_text)
    for bigram in nltk.bigrams(c_text):
        # FreqDist.inc() was removed in NLTK 3; increment by item assignment.
        self.letter_freq[''.join(bigram)] += 1
def __iter__(self):
    """ Generator of docs while collecting ordered structured info. """
    for n, reutersid in enumerate(reuters.fileids()):  # 'training|test/xxxx'
        dataset, _ = reutersid.split('/')  # extract dataset
        if self.dataset in dataset:  # yield only filtered dataset
            if self.categories is not None:
                top_category = reuters.categories(reutersid)[0]  # grab first category only
                self.category_mask.append(self.categories[top_category])  # n-th doc -> classid
            yield reuters.raw(reutersid)  # return raw document
def get_raw_text(corpus, file_name):
    string = ''
    if corpus == 'mr':
        from nltk.corpus import movie_reviews
        string = movie_reviews.raw(fileids=file_name)
    else:
        from nltk.corpus import reuters
        string = reuters.raw(fileids=file_name)
    return string
def write_into_new_file(file_name):
    #stopwords_english = set(stopwords.words('english'))
    string = reuters.raw(fileids=file_name)
    list_words = re.split(r'\W+', string)
    new_file_path = new_path + file_name
    file_wr = open(new_file_path, "w")
    for w in list_words:
        if w.isalpha() and len(w) > 1 and w.lower() not in stopwords_english:
            file_wr.write(w.lower() + "\n")
    file_wr.close()
def get_list_tokens_nltk(corpus, file_name):
    string = ''
    if corpus == 'mr':
        from nltk.corpus import movie_reviews
        string = movie_reviews.raw(fileids=file_name)
    else:
        from nltk.corpus import reuters
        string = reuters.raw(fileids=file_name)
    list_words = re.split(r'\W+', string)
    return [w.lower() for w in list_words
            if w.isalpha() and len(w) > 1 and w.lower() not in stopwords_english]
def build_TFIDF_model(self):
    """ Build term-document matrix containing TF-IDF score for each word in
        each document in the Reuters corpus (via NLTK).
    """
    token_dict = {}
    for article in reuters.fileids():
        token_dict[article] = reuters.raw(article)

    # Use TF-IDF to determine frequency of each word in our article, relative
    # to the word frequency distributions in corpus of 11k Reuters news articles.
    self._tfidf = TfidfVectorizer(tokenizer=self.tokenize_and_stem,
                                  stop_words='english',
                                  decode_error='ignore')
    tdm = self._tfidf.fit_transform(token_dict.values())  # Term-document matrix
def create_tfidf_data(docs, n=None):
    """
    Builds a [(label, text)] structure by parsing the documents
    :param docs: list of Reuters document ids
    :param n: number of documents to use
    :return: list
    """
    if n:
        docs = docs[:n]
    y = [reuters.categories(d)[0] for d in docs]
    corpus = [reuters.raw(d).lower() for d in docs]
    return y, corpus
def list_doc_topics(doc_test_topics, doc_train_topics):
    """ creates a list of two-tuples that contain a single feature entry and
        the body text. """
    ref_docs = []
    ref_docs_test = []
    for d in doc_train_topics:
        t1 = d.split()[1:]
        d0 = rt0.raw(d.split()[0])
        d0 = d0.replace('\n', '')
        for t in t1:
            d_tup = (t, d0)
            ref_docs.append(d_tup)
    for d in doc_test_topics:
        t2 = d.split()[1:]
        d00 = rt0.raw(d.split()[0])
        d00 = d00.replace('\n', '')
        for t in t2:
            d_tup = (t, d00)
            ref_docs_test.append(d_tup)
    return ref_docs, ref_docs_test
def getDocContentById(self, docId):
    '''
    Gets the document content by its id

    Parameters
    ----------
    docId : string
        The document id.

    Returns
    -------
    The document content (unicode)
    '''
    return reuters.raw(docId)
def benchmark(storage_class, create=True):
    m = storage_class(4)
    s = Segmenter(m, 3)
    if create:
        m.clear()
    corpus = reuters.raw()
    tokens = list(filter(lambda t: t.category == '', tokeniser_fr(corpus)))[:10000]
    if create:
        m.add_sentence(tokens)
    for i in range(1, 5000, 30):
        print(s.segment(tokens[i:i + 30]))
def summarize(storyid):
    stopwords = ['a', 'an', 'and', 'are', 'as', 'at', 'be', 'but', 'by', 'for',
                 'if', 'in', 'into', 'is', 'it', 'no', 'not', 'of', 'on', 'or',
                 's', 'such', 't', 'that', 'the', 'their', 'then', 'there',
                 'these', 'they', 'this', 'to', 'was', 'will', 'with']
    text = "".join(reuters.raw(storyid).split("\n"))
    dictW = defaultdict(int)
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    ls = sent_detector.tokenize(text.strip())
    for l in ls:
        tokens = tokenizer.tokenize(l)
        for token in tokens:
            if token.lower() not in stopwords:
                dictW[token] += 1
    title = ls[:1]
    ls = ls[1:]
    MAX_SUMMARY_SIZE = int(0.20 * len(ls))
    # Rank sentences by the summed frequency of their tokens.
    ls.sort(key=lambda s: sum(dictW[token] for token in tokenizer.tokenize(s)),
            reverse=True)
    ls = ls[:MAX_SUMMARY_SIZE]
    # Restore the selected sentences to their original order in the story.
    ls.sort(key=lambda s: text.find(s))
    ls = title + ls
    print("".join(ls))
    print()
# for use with sklearn
def myparser(s):
    punc = '[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n ]'  # all punctuation + whitespace + newline
    np = [a for a in re.split(punc, s) if a not in string.punctuation]
    low = [a.lower() for a in np if len(a) > 2]  # keep only words longer than two letters, lowercased
    nostop = [a for a in low if a not in stopwords.words('english')]
    return [porter.stem(a) for a in nostop if re.findall(r"[^\W\d]", a)]

# imports
from sklearn.feature_extraction.text import TfidfVectorizer

# object instantiation - ignore utf-8 decode errors
vectfidf = TfidfVectorizer(tokenizer=myparser, decode_error='ignore')

# test corpus
#corpus=[reuters.raw('training/9853'),reuters.raw('training/9866')]

# reuters corpus
corpus = [(reuters.raw(fileid), cat)
          for cat in reuters.categories()
          for fileid in reuters.fileids(cat)]
random.seed(1979)
random.shuffle(corpus)
size = int(len(corpus) * 0.1)
train_raw = corpus[size:]
test_raw = corpus[:size]
train_raw_data = [a[0] for a in train_raw]
test_raw_data = [a[0] for a in test_raw]
y_train = [a[1] for a in train_raw]
y_test = [a[1] for a in test_raw]

# DO NOT have to turn labels to ints
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
    return d

# load or create the character encoding dictionaries
if os.path.exists(char_idx_path):
    with open(char_idx_path, 'rb') as f:
        logger.info('Loading character encodings from "%s"' % char_idx_path)
        idx_to_char = pickle.load(f)
        char_to_idx = pickle.load(f)
        cat_enc = pickle.load(f)
else:
    n_docs = len(reuters.fileids())
    cat_enc = dict((x, i + 1) for i, x in enumerate(set(reuters.categories())))
    chars = set()
    for fid in reuters.fileids():
        chars = chars.union(set(reuters.raw(fid).lower()))
    idx_to_char = dict((i, c) for i, c in enumerate(chars))
    char_to_idx = dict((c, i) for i, c in enumerate(chars))
    with open(char_idx_path, 'wb') as f:
        logger.info('Saving character encodings to "%s"' % char_idx_path)
        pickle.dump(idx_to_char, f)
        pickle.dump(char_to_idx, f)
        pickle.dump(cat_enc, f)

if os.path.exists(reuters_enc_path):
    logging.info('Loading reuters encodings from "%s"' % reuters_enc_path)
    np_file = np.load(reuters_enc_path)
    cats = np_file['arr_0']
    docs = np_file['arr_1']
nltk.download('reuters')
nltk.download('punkt')

google_news_word2vec_model_location = 'data/GoogleNews-vectors-negative300.bin.gz'
doc2vec_model_location = 'model/doc2vec-model.bin'
doc2vec_dimensions = 300
classifier_model_location = 'model/classifier-model.bin'

doc2vec = Doc2Vec.load(doc2vec_model_location)

# Convert the categories to one hot encoded categories
labelBinarizer = MultiLabelBinarizer()
labelBinarizer.fit([reuters.categories(fileId) for fileId in reuters.fileids()])

# Load the articles with their corresponding categories
train_articles = [{'raw': reuters.raw(fileId), 'categories': reuters.categories(fileId)}
                  for fileId in reuters.fileids() if fileId.startswith('training/')]
test_articles = [{'raw': reuters.raw(fileId), 'categories': reuters.categories(fileId)}
                 for fileId in reuters.fileids() if fileId.startswith('test/')]
shuffle(train_articles)
shuffle(test_articles)

# Convert the articles to document vectors using the doc2vec model
train_data = [doc2vec.infer_vector(word_tokenize(article['raw'])) for article in train_articles]
test_data = [doc2vec.infer_vector(word_tokenize(article['raw'])) for article in test_articles]
train_labels = labelBinarizer.transform([article['categories'] for article in train_articles])
test_labels = labelBinarizer.transform([article['categories'] for article in test_articles])
train_data, test_data = numpy.asarray(train_data), numpy.asarray(test_data)
train_labels, test_labels = numpy.asarray(train_labels), numpy.asarray(test_labels)

# Initialize the neural network (Keras 1.x style Dense arguments)
model = Sequential()
model.add(Dense(input_dim=doc2vec_dimensions, output_dim=500, activation='relu'))
model.add(Dropout(0.3))
#!/usr/bin/python
# coding:utf-8
# 2013/03/01
# Is tokenizing and then lower-casing the same as lower-casing the whole text
# and then tokenizing?

from nltk.corpus import reuters
#from nlp.clustering.preprocess import preprocess
import nlp.clustering.preprocess.preprocess as preprocess

raw = reuters.raw(fileids=[reuters.fileids()[1]])
docs1 = preprocess.tokenize(raw)
docs1 = preprocess.lower(docs1)
docs2 = preprocess.tokenize(preprocess.lower(raw))
docs1 == docs2

raws = [reuters.raw(fileids=[fid]) for fid in reuters.fileids()]
docs1 = preprocess.tokenize(raws)
docs1 = preprocess.lower(docs1)
docs2 = preprocess.tokenize(preprocess.lower(raws))
docs1 == docs2

docs3 = preprocess.word_tokenize(preprocess.lower(preprocess.sent_tokenize(raws)))
docs1 == docs3
docs2 == docs3

import timeit
setup = '''
from nltk.corpus import reuters
def load_reuters(setName):
    # html.unescape() from the standard "html" module replaces the old
    # Python 2 HTMLParser.HTMLParser().unescape() used originally.
    import html
    doc_ids = reuters.fileids()
    cat2all_ids = {}
    cat2train_ids = {}
    cat2test_ids = {}
    cat2all_num = {}
    cand_docNum = 0
    for doc_id in doc_ids:
        # only choose docs belonging in one category
        if len(reuters.categories(doc_id)) == 1:
            cat = reuters.categories(doc_id)[0]
            cand_docNum += 1
            if doc_id.startswith("train"):
                cat2set_ids = cat2train_ids
            else:
                cat2set_ids = cat2test_ids
            if cat in cat2set_ids:
                cat2set_ids[cat].append(doc_id)
            else:
                cat2set_ids[cat] = [doc_id]
            # both train and test doc_ids are put in cat2all_ids
            if cat in cat2all_ids:
                cat2all_ids[cat].append(doc_id)
            else:
                cat2all_ids[cat] = [doc_id]
            if cat in cat2all_num:
                cat2all_num[cat] += 1
            else:
                cat2all_num[cat] = 1

    print("Totally %d docs, %d single-category docs in %d categories" % (
        len(doc_ids), cand_docNum, len(cat2train_ids)))

    sorted_cats = sorted(cat2all_num.keys(), key=lambda cat: cat2all_num[cat],
                         reverse=True)
    catNum = 10
    cats_docsWords = [[] for i in range(catNum)]
    cats_docNames = [[] for i in range(catNum)]
    topN_cats = sorted_cats[:catNum]

    print("Top 10 categories:")
    keptAllDocNum = 0
    keptTrainDocNum = 0
    keptTestDocNum = 0
    for cat in topN_cats:
        print("%s: %d/%d" % (cat, len(cat2train_ids[cat]), len(cat2test_ids[cat])))
        keptTrainDocNum += len(cat2train_ids[cat])
        keptTestDocNum += len(cat2test_ids[cat])
        keptAllDocNum += len(cat2train_ids[cat]) + len(cat2test_ids[cat])
    print("Totally %d docs kept, %d in train, %d in test" % (
        keptAllDocNum, keptTrainDocNum, keptTestDocNum))

    if setName == "train":
        cat2set_ids = cat2train_ids
        setDocNum = keptTrainDocNum
    elif setName == "test":
        cat2set_ids = cat2test_ids
        setDocNum = keptTestDocNum
    elif setName == "all":
        cat2set_ids = cat2all_ids
        setDocNum = keptAllDocNum
    else:
        raise Exception("Unknown set name %s" % setName)

    orig_docs_name = []
    orig_docs_cat = []
    orig_docs_words = []
    readDocNum = 0
    totalLineNum = 0
    emptyFileNum = 0
    for cat_id, cat in enumerate(topN_cats):
        for doc_id in cat2set_ids[cat]:
            if readDocNum % 50 == 49 or readDocNum == setDocNum - 1:
                print("\r%d %d\r" % (readDocNum + 1, totalLineNum), end="")
            # No utf-8 encoding step is needed in Python 3; keep the text as str.
            text = html.unescape(reuters.raw(doc_id))
            lines = text.split("\n")
            if len(text) == 0 or len(lines) == 0:
                emptyFileNum += 1
                continue
            readDocNum += 1
            totalLineNum += len(lines)
            text = " ".join(lines)
            wordsInSentences, wc = extractSentenceWords(text)
            filename = doc_id
            orig_docs_words.append(wordsInSentences)
            orig_docs_name.append(filename)
            orig_docs_cat.append(cat_id)
            cats_docsWords[cat_id].append(wordsInSentences)
            cats_docNames[cat_id].append(filename)

    print("Done. %d docs read, %d empty docs skipped. Totally %d lines" % (
        readDocNum, emptyFileNum, totalLineNum))
    return setDocNum, orig_docs_words, orig_docs_name, orig_docs_cat, \
        cats_docsWords, cats_docNames, topN_cats
def getWordsFromReutersDoc(self, doc): return self.getWords(reuters.raw(doc))
from nltk.corpus import gutenberg, abc, reuters, brown, movie_reviews
from topia.termextract import extract

extractor = extract.TermExtractor()

with open('./corpus/all3.txt', 'r') as f:
    with open('./data/terms.txt', 'w') as o:
        o.write("Term\tOccurences\tStrength\n")
        for term in extractor(f.read() + gutenberg.raw() + abc.raw() +
                              reuters.raw() + brown.raw() + movie_reviews.raw()):
            o.write("\t".join(map(str, term)) + "\n")
import re
import collections

from nltk.corpus import inaugural, reuters, brown, gutenberg
from itertools import product as iter_product


def words(text):
    return re.findall('[a-z]+', text.lower())


def train(features):
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model


NWORDS = train(words(inaugural.raw() + reuters.raw() + brown.raw() + gutenberg.raw()))
alphabet = 'abcdefghijklmnopqrstuvwxyz'


def edits1(word):
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [a + b[1:] for a, b in splits if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
    replaces = [a + c + b[1:] for a, b in splits for c in alphabet if b]
    inserts = [a + c + b for a, b in splits for c in alphabet]
    return set(deletes + transposes + replaces + inserts)


def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)
def reutersdocs(doc_index):
    """ Generator of docs from corpus reuters."""
    for doc_type_id in reuters.fileids():
        _, id = doc_type_id.split('/')
        doc_index.append(id)
        yield reuters.raw(doc_type_id)
def get_tf_idf(self, sample_doc, corpus_list):
    for tip in corpus_list:
        tokens = nltk.word_tokenize(reuters.raw(tip))
        bi_tokens = nltk.bigrams(tokens)
        tri_tokens = nltk.trigrams(tokens)

        tokens = [token.lower() for token in tokens if len(token) > 2]
        tokens = [token for token in tokens if token not in self.stopwords]
        bi_tokens = [' '.join(token).lower() for token in bi_tokens]
        bi_tokens = [token for token in bi_tokens if token not in self.stopwords]
        tri_tokens = [' '.join(token).lower() for token in tri_tokens]
        tri_tokens = [token for token in tri_tokens if token not in self.stopwords]

        final_tokens = []
        final_tokens.extend(tokens)
        final_tokens.extend(bi_tokens)
        final_tokens.extend(tri_tokens)

        self.docs[tip] = {'freq': {}, 'tf': {}, 'idf': {}, 'tf-idf': {}, 'tokens': []}
        for token in final_tokens:
            # The frequency computed for each tip
            self.docs[tip]['freq'][token] = self.freq(token, final_tokens)
            # The term-frequency (Normalized Frequency)
            self.docs[tip]['tf'][token] = self.tf(token, final_tokens)
        self.docs[tip]['tokens'] = final_tokens
        self.vocabulary.append(final_tokens)
        print('vocabulary size is {0}'.format(len(self.vocabulary)))

    x = 0
    for doc in self.docs:
        for token in self.docs[doc]['tf']:
            # The Inverse-Document-Frequency
            self.docs[doc]['idf'][token] = self.idf(token, self.vocabulary)
            # The tf-idf
            self.docs[doc]['tf-idf'][token] = self.tf_idf(
                token, self.docs[doc]['tokens'], self.vocabulary)
        print('Current iteration is {0}'.format(x))

    words = {}
    for doc in self.docs:
        for token in self.docs[doc]['tf-idf']:
            if token not in words:
                words[token] = self.docs[doc]['tf-idf'][token]
            else:
                if self.docs[doc]['tf-idf'][token] > words[token]:
                    words[token] = self.docs[doc]['tf-idf'][token]
        for token in self.docs[doc]['tf-idf']:
            print(token, self.docs[doc]['tf-idf'][token])

    for item in sorted(words.items(), key=lambda x: x[1], reverse=True):
        print("%f <= %s" % (item[1], item[0]))
from nltk import word_tokenize
from nltk.corpus import reuters
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

nltk.download('reuters')
nltk.download('punkt')

google_news_word2vec_model_location = 'data/GoogleNews-vectors-negative300.bin.gz'
doc2vec_model_location = 'model/doc2vec-model.bin'
doc2vec_vectors_location = 'model/doc2vec-vectors.bin'
doc2vec_dimensions = 300

doc2vec = Doc2Vec.load(doc2vec_model_location)

jobs = [{'category': 'jobs', 'vec': doc2vec.infer_vector(word_tokenize(reuters.raw(fileId)))}
        for fileId in reuters.fileids(['jobs'])]
trade = [{'category': 'trade', 'vec': doc2vec.infer_vector(word_tokenize(reuters.raw(fileId)))}
         for fileId in reuters.fileids(['trade'])[:500]]
docs = [doc for doc in itertools.chain(jobs, trade)]

pca = PCA(n_components=50)
fiftyDimVecs = pca.fit_transform([doc['vec'] for doc in docs])

tsne = TSNE(n_components=2)
twoDimVecs = tsne.fit_transform(fiftyDimVecs)

fig, ax = plt.subplots()
for doc, twoDimVec in zip(docs, twoDimVecs):
    ax.scatter(twoDimVec[0], twoDimVec[1],
               color=('r' if doc['category'] == 'jobs' else 'b'))
plt.show()
def load_data(padding=0, sent_len=300, w2i=None):
    """
    threshold = 0  all labels in test data
    threshold = 1  only multilabels in test data
    """
    threshold = 1
    train_docs, train_cats, test_docs, test_cats = [], [], [], []
    popular_topics = set(['earn', 'acq', 'money-fx', 'grain', 'crude', 'trade',
                          'interest', 'ship', 'wheat', 'corn'])
    for doc_id in reuters.fileids():
        if doc_id.startswith("train"):
            if set(reuters.categories(doc_id)).issubset(popular_topics):
                train_docs.append(reuters.raw(doc_id))
                train_cats.append([cat for cat in reuters.categories(doc_id)])
        else:
            if set(reuters.categories(doc_id)).issubset(popular_topics):
                test_docs.append(reuters.raw(doc_id))
                test_cats.append([cat for cat in reuters.categories(doc_id)])

    dataset = train_docs + test_docs
    max_sent_len, word_to_idx = get_vocab(dataset)
    if sent_len > 0:
        max_sent_len = sent_len
    if w2i is not None:
        word_to_idx = w2i

    train, train_label, test, test_label = [], [], [], []
    for i, line in enumerate(train_docs):
        words = line_to_words(line)
        y = train_cats[i]
        if len(y) > 1:
            # Examples with more than one label are moved to the test data.
            test_docs.append(line)
            test_cats.append(y)
            continue
        y = y[0]
        sent = [word_to_idx[word] for word in words if word in word_to_idx]
        if len(sent) > max_sent_len:
            sent = sent[:max_sent_len]
        else:
            sent.extend([0] * (max_sent_len + padding - len(sent)))
        train.append(sent)
        train_label.append(y)

    single_label = ['-1'] + list(set(train_label))
    num_classes = len(single_label)
    for i, l in enumerate(train_label):
        train_label[i] = single_label.index(l)

    for i, line in enumerate(test_docs):
        words = line_to_words(line)
        y = test_cats[i]
        sent = [word_to_idx[word] for word in words if word in word_to_idx]
        if len(sent) > max_sent_len:
            sent = sent[:max_sent_len]
        else:
            sent.extend([0] * (max_sent_len + padding - len(sent)))
        if len(y) > threshold and set(y).issubset(single_label):
            test.append(sent)
            one_hot_y = np.zeros([num_classes], dtype=np.int32)
            for yi in y:
                one_hot_y[single_label.index(yi)] = 1
            test_label.append(one_hot_y)

    return (single_label, word_to_idx, np.array(train), np.array(train_label),
            np.array(test), np.array(test_label))