"alt.atheism", "soc.religion.christian", "sci.med", "comp.graphics" ] cate2 = [ "comp.graphics", "comp.os.ms-windows.misc", "comp.sys.ibm.pc.hardware", "comp.sys.mac.hardware", "comp.windows.x" ] twenty_train = fetch_20newsgroups(subset="train", categories=cate2, shuffle=True) twenty_test = fetch_20newsgroups(subset="test", categories=cate2, shuffle=True) #cleaninng data set truck_cleaner = Cleaner() truck_cleaner.get_data_category_count(twenty_train) cleaner_text = truck_cleaner.text_header_remover(twenty_train.data) #preparing dataset import nltk #nltk.download('punkt') #nltk.download('stopwords') from gensim.models import Word2Vec from nltk.corpus import stopwords #import numpy as np def tokenizer_helper(cleaner_text_list): tokenize_sentences_list = [] for sentence in cleaner_text_list: tokenize_sentences_list.append(nltk.sent_tokenize(sentence))