]

# imports needed by the calls below: fetch_20newsgroups is scikit-learn's
# 20-newsgroups loader, Cleaner is the project's own helper class
from sklearn.datasets import fetch_20newsgroups
from Cleaner import Cleaner

cate2 = [
    "comp.graphics",
    "comp.os.ms-windows.misc",
    "comp.sys.ibm.pc.hardware",
    "comp.sys.mac.hardware",
    "comp.windows.x",
]
twenty_train = fetch_20newsgroups(subset="train", categories=cate2, shuffle=True)
twenty_test = fetch_20newsgroups(subset="test", categories=cate2, shuffle=True)

# cleaning data set
truck_cleaner = Cleaner()
truck_cleaner.get_data_category_count(twenty_train)
cleaner_text = truck_cleaner.text_header_remover(twenty_train.data)

# preparing dataset
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
from gensim.models import Word2Vec
from nltk.corpus import stopwords
# import numpy as np

def tokenizer_helper(cleaner_text_list):
    # split each cleaned document into its list of sentences
    tokenize_sentences_list = []
    for sentence in cleaner_text_list:
        tokenize_sentences_list.append(nltk.sent_tokenize(sentence))
    return tokenize_sentences_list
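# The Cleaner class is imported from the project's own Cleaner module and is
# not shown in this snippet. The sketch below is a guess at what its two
# methods might do, inferred from how they are called above:
# get_data_category_count reports documents per category, and
# text_header_remover strips the header block (everything before the first
# blank line) from each post. The method bodies are assumptions, not the
# author's actual code.
from collections import Counter

class Cleaner:
    def get_data_category_count(self, bunch):
        # tally training documents per newsgroup category
        counts = Counter(bunch.target_names[t] for t in bunch.target)
        for name, n in sorted(counts.items()):
            print(name + ": " + str(n))

    def text_header_remover(self, texts):
        # drop the header lines of each message: everything up to
        # the first blank line
        cleaned = []
        for raw in texts:
            lines = raw.split("\n")
            count = 0
            for line in lines:
                if line.strip() != "":
                    count = count + 1
                else:
                    break
            cleaned.append(" ".join(lines[count:]))
        return cleaned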
# (the loop head below is reconstructed; the original fragment starts
# mid-loop, counting header lines of the raw post, stored as a list of
# lines in `text`, until the first blank line is reached)
count = 0
for line in text:
    if line.strip() != "":
        count = count + 1
    else:
        break
print("count: " + str(count))

# drop the header lines and re-join the body into one string
text = text[count:]
text = " ".join(text)
# text = re.sub("([^a-zA-Z0-9\.]+)"," ",text)
# text = re.sub("(\w*)([0-9]+)(\w*)"," ",text)
# print(text)

from Cleaner import Cleaner

truck_cleaner = Cleaner()
cleaner_text = truck_cleaner.text_header_remover([c_text])
# print(cleaner_text)

# sanity check: the manual header stripping should match the Cleaner output
print(text == cleaner_text[0])
# print(text)
# print(cleaner_text[0])

# remove English stopwords from every sentence of the first document only
# (the break limits this to one document, presumably as a quick test)
for x in clean_tokenized_text_list:
    for i in range(len(x)):
        x[i] = [word for word in x[i] if word not in stopwords.words('english')]
    break
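# The stopword loop above calls stopwords.words('english') once per sentence,
# which rebuilds the list each time; hoisting it into a set makes the check
# O(1) per word. Below is a sketch of that idiom plus the Word2Vec training
# step the imports point toward. `tokenized_text_list` (documents as lists of
# word-token lists) is an assumed name for the word-tokenized output of the
# pipeline above; it is not defined in the original snippet.
stop_words = set(stopwords.words('english'))

clean_tokenized_text_list = [
    [[word for word in sentence if word not in stop_words]
     for sentence in document]
    for document in tokenized_text_list
]

# flatten documents into one list of sentences (token lists), the input
# format gensim's Word2Vec expects
sentences = [sentence for document in clean_tokenized_text_list
             for sentence in document]

# vector_size is the gensim >= 4 parameter name; older releases call it size
model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)
print(model.wv.most_similar("windows", topn=5))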