Example 1

from sklearn.datasets import fetch_20newsgroups
from Cleaner import Cleaner

cate2 = [
    "comp.graphics", "comp.os.ms-windows.misc", "comp.sys.ibm.pc.hardware",
    "comp.sys.mac.hardware", "comp.windows.x"
]

twenty_train = fetch_20newsgroups(subset="train",
                                  categories=cate2,
                                  shuffle=True)
twenty_test = fetch_20newsgroups(subset="test", categories=cate2, shuffle=True)
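
# fetch_20newsgroups returns a scikit-learn Bunch whose data, target, and
# target_names fields are part of the documented API; a quick sanity check:
print(len(twenty_train.data), "training documents,",
      len(twenty_test.data), "test documents")
print(twenty_train.target_names)  # the five comp.* categories requested above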

# cleaning the data set with the custom Cleaner helper
truck_cleaner = Cleaner()
truck_cleaner.get_data_category_count(twenty_train)
cleaner_text = truck_cleaner.text_header_remover(twenty_train.data)

# preparing the dataset
import nltk
#nltk.download('punkt')
#nltk.download('stopwords')
from gensim.models import Word2Vec
from nltk.corpus import stopwords
#import numpy as np


def tokenizer_helper(cleaner_text_list):
    tokenize_sentences_list = []
    for sentence in cleaner_text_list:
        tokenize_sentences_list.append(nltk.sent_tokenize(sentence))
    return tokenize_sentences_list
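
# Hypothetical usage of the helper above: sentence-tokenize the header-stripped
# messages (nltk.sent_tokenize needs the 'punkt' model downloaded earlier).
print(tokenizer_helper(cleaner_text)[0][:2])  # first two sentences of doc 0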

# manual header removal on a single message, to cross-check the Cleaner:
# count the header lines up to the first blank line, then drop them
c_text = twenty_train.data[0]  # assumption: the check uses the first document
text = c_text.split("\n")
count = 0
for line in text:
    if line != "":
        count = count + 1
    else:
        break

print("count: " + str(count))


text = text[count:]
text = " ".join(text)
#text = re.sub("([^a-zA-Z0-9\.]+)"," ",text)
#text = re.sub("(\w*)([0-9]+)(\w*)"," ",text)
#print(text)
# reuse the Cleaner instance from above; keep the single-document result under
# its own name so the corpus-level cleaner_text is not overwritten
cleaned_single = truck_cleaner.text_header_remover([c_text])
#print(cleaned_single)
print(text == cleaned_single[0])  # True if manual and Cleaner removal agree

#print(text)
#print(cleaned_single[0])


# word-tokenize each sentence of each cleaned document, then strip English
# stop words; the stop-word set is built once, since calling
# stopwords.words('english') per sentence is needlessly slow
clean_tokenized_text_list = [[nltk.word_tokenize(s) for s in doc]
                             for doc in tokenizer_helper(cleaner_text)]
stop_words = set(stopwords.words('english'))
for x in clean_tokenized_text_list:
    for i in range(len(x)):
        x[i] = [word for word in x[i] if word not in stop_words]
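
# A minimal sketch of how the cleaned tokens could feed the Word2Vec import
# above, assuming gensim 4.x (where the embedding size argument is
# `vector_size`). Word2Vec expects an iterable of token lists, one per
# sentence, so the per-document nesting is flattened first.
sentences = [sent for doc in clean_tokenized_text_list for sent in doc]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)
print(w2v_model.wv.most_similar("windows", topn=5))  # a term frequent in comp.* posts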