# Setup method of the classifier class; assumes `from stemming.porter2 import stem`
# and `from ptstemmer.implementations.OrengoStemmer import OrengoStemmer` at module level.
def __setup(self):
    utl = Utils()
    stemmer = OrengoStemmer()   # Portuguese (Orengo) stemmer from PTStemmer
    stemmer.enableCaching(1000)
    annotation = []
    annotation_int = []
    with open("Data_sets/" + self.__file_name + ".csv", "r") as input_file:
        for line in input_file:
            # Each line is "text;label".
            vec = line.split(';')
            label = vec[1].replace('\n', '')
            annotation.append(label)
            annotation_int.append(int(label))
            # Normalize the text: lowercase, then strip marks, mentions and links.
            vec[0] = vec[0].lower()
            vec[0] = utl.remove_marks(vec[0])
            vec[0] = utl.replace_mentions(vec[0])
            vec[0] = utl.delete_links(vec[0])
            phrase = ''
            for elem in vec[0].split(' '):
                if self.__lang == 'en':
                    elem = stem(elem)
                if self.__lang == 'pt':
                    # PTStemmer's OrengoStemmer object is not callable; the
                    # original `stemmer(elem)` raises a TypeError.
                    elem = stemmer.getWordStem(elem)
                phrase = phrase + ' ' + elem
            self.__corpus.append(phrase.replace('\n', ''))
    self.__number_of_examples = len(self.__corpus)
    # Fit both vectorizers on the corpus, then build the feature matrices.
    transform = self.__vectorizer.fit_transform(self.__corpus)
    feature_list = self.__vectorizer.get_feature_names()
    transform_binary = self.__vectorizer_binary.fit_transform(self.__corpus)
    self.__feature_vector_len = len(feature_list)
    self.__X = self.__vectorizer.transform(self.__corpus)
    self.__X_binary = self.__vectorizer_binary.transform(self.__corpus).toarray().tolist()
    self.__y = annotation
    self.__y_int = annotation_int
    self.__set_of_classes = set(annotation)
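# A minimal standalone sketch (an assumption, not code from the source) of the
# pipeline __setup() builds: count-vectorize a toy corpus and fit the
# MultinomialNB classifier that the sibling scripts import. The toy sentences
# and labels are illustrative only; labels are strings, like `annotation`.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

toy_corpus = ["i love this phone", "worst service ever"]
toy_labels = ["1", "-1"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(toy_corpus)   # sparse term-count matrix
clf = MultinomialNB().fit(X, toy_labels)
print(clf.predict(vectorizer.transform(["love this service"])))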
# Script: train data preparation for the new_sts.csv corpus.
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from Preproc import Utils
# `stem` is used below, so this import must be active; it was commented out in
# the original, which made the try/except silently skip stemming for every word.
from stemming.porter2 import stem
from sklearn import cross_validation  # pre-0.18 sklearn; now sklearn.model_selection

utl = Utils()
corpus = []
annotation = []

with open("new_sts.csv", "r") as input_file:
    for line in input_file:
        # Each line is "text;label".
        vec = line.split(';')
        annotation.append(vec[1].replace('\n', ''))
        # Normalize the text: lowercase, then strip marks, mentions and links.
        vec[0] = vec[0].lower()
        vec[0] = utl.remove_marks(vec[0])
        vec[0] = utl.replace_mentions(vec[0])
        vec[0] = utl.delete_links(vec[0])
        phrase = ''
        for elem in vec[0].split(' '):
            try:
                elem = stem(elem)
            except Exception:
                pass  # keep the unstemmed token if stemming fails
            phrase = phrase + ' ' + elem
        corpus.append(phrase.replace('\n', ''))

vectorizer = CountVectorizer(min_df=0.0, max_df=1.0)
number_of_examples = len(corpus)
transform = vectorizer.fit_transform(corpus)
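# The script stops after fitting the vectorizer; a plausible next step
# (an assumption suggested by the otherwise unused MultinomialNB and
# cross_validation imports above, not code from the source) is 10-fold
# cross-validation over the count matrix:
clf = MultinomialNB()
scores = cross_validation.cross_val_score(clf, transform, annotation, cv=10)
print(scores.mean())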
# Script: corpus preparation for the positive/negative review token files.
import os
from stemming.porter2 import stem
from Preproc import Utils

utl = Utils()
corpus = []
annotation = []
i = 0  # counts words that failed to stem

# The original mixed '../dataset/' and '../datasets/' between listdir() and
# open(), which cannot both be right; '../datasets/' is assumed throughout here.
for filename in os.listdir('../datasets/tokens/neg/'):
    arquivo = open('../datasets/tokens/neg/' + filename, 'r')
    for line in arquivo:
        annotation.append("-1")
        line = line.lower().replace(",", " ").replace(";", " ")
        line = utl.delete_links(line)
        line = utl.remove_marks(line)
        line = utl.remove_punctuation(line)
        phrase = ''
        for elem in line.split(" "):
            try:
                elem = stem(elem)
                phrase = phrase + ' ' + elem.encode("utf8")  # Python 2-era byte handling
            except:
                i += 1
        corpus.append(phrase.replace('\n', ''))

# The positive loop was cut off in the source after delete_links(); it is
# completed here to mirror the negative loop above.
for filename in os.listdir('../datasets/tokens/pos/'):
    arquivo = open('../datasets/tokens/pos/' + filename, 'r')
    for line in arquivo:
        annotation.append("1")
        line = line.lower().replace(",", " ").replace(";", " ")
        line = utl.delete_links(line)
        line = utl.remove_marks(line)
        line = utl.remove_punctuation(line)
        phrase = ''
        for elem in line.split(" "):
            try:
                elem = stem(elem)
                phrase = phrase + ' ' + elem.encode("utf8")
            except:
                i += 1
        corpus.append(phrase.replace('\n', ''))
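# This script only builds `corpus` and `annotation`; following the pattern of
# the new_sts.csv script above, a plausible continuation (an assumption, not
# from the source) is to vectorize the reviews the same way:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=0.0, max_df=1.0)
transform = vectorizer.fit_transform(corpus)  # one row per review line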