Example #1
    def __setup(self):
        """Read the data set, preprocess and stem every line, then build the
        count and binary feature matrices plus the class annotations."""
        utl = Utils()
        stemmer = OrengoStemmer()
        stemmer.enableCaching(1000)

        input_file = open("Data_sets/" + self.__file_name + ".csv", "r")
        annotation = []
        annotation_int = []

        for line in input_file:
            # Each line holds "<text>;<label>".
            vec = line.split(';')
            annotation.append(vec[1].replace('\n', ''))
            annotation_int.append(int(vec[1].replace('\n', '')))

            # Normalize the text: lowercase, strip marks, mentions and links.
            vec[0] = vec[0].lower()
            vec[0] = utl.remove_marks(vec[0])
            vec[0] = utl.replace_mentions(vec[0])
            vec[0] = utl.delete_links(vec[0])

            # Stem token by token: Porter2 for English, Orengo (PTStemmer) for Portuguese.
            phrase = ''
            for elem in vec[0].split(' '):
                if self.__lang == 'en':
                    elem = stem(elem)
                if self.__lang == 'pt':
                    elem = stemmer.getWordStem(elem)  # PTStemmer call; adjust if the local port names it differently
                phrase = phrase + ' ' + elem
            self.__corpus.append(phrase.replace('\n', ''))
        input_file.close()

        self.__number_of_examples = len(self.__corpus)
        transform = self.__vectorizer.fit_transform(self.__corpus)
        feature_list = self.__vectorizer.get_feature_names()

        transform_binary = self.__vectorizer_binary.fit_transform(self.__corpus)

        self.__feature_vector_len = len(feature_list)
        # fit_transform already returned the document-term matrices above,
        # so reuse them instead of transforming the corpus a second time.
        self.__X = transform
        self.__X_binary = transform_binary.toarray().tolist()
        self.__y = annotation
        self.__y_int = annotation_int
        self.__set_of_classes = set(annotation)
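
The method relies on self.__vectorizer, self.__vectorizer_binary, self.__file_name, self.__lang, and self.__corpus being set up before it runs, none of which appear in this excerpt. A minimal sketch of that initialization, assuming scikit-learn's CountVectorizer and a hypothetical class name; only the attribute names are taken from the snippet, the parameters are assumptions:

from sklearn.feature_extraction.text import CountVectorizer

class TextDataset:  # hypothetical name; the real class is not shown in the excerpt
    def __init__(self, file_name, lang='pt'):
        self.__file_name = file_name  # "Data_sets/<file_name>.csv" is read by __setup
        self.__lang = lang            # 'en' -> Porter2, 'pt' -> Orengo stemming
        self.__corpus = []
        # Raw term counts and a binary presence/absence view of the same vocabulary.
        self.__vectorizer = CountVectorizer()
        self.__vectorizer_binary = CountVectorizer(binary=True)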
Example #2
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from Preproc import Utils
from stemming.porter2 import stem  # needed for stem() in the loop below
from sklearn import cross_validation  # pre-0.20 scikit-learn (sklearn.model_selection in newer releases)

utl = Utils()
corpus = []
annotation = []

input_file = open("new_sts.csv", "r")

for line in input_file:
    # Each line holds "<text>;<label>".
    vec = line.split(';')
    annotation.append(vec[1].replace('\n', ''))

    # Normalize the text before stemming.
    vec[0] = vec[0].lower()
    vec[0] = utl.remove_marks(vec[0])
    vec[0] = utl.replace_mentions(vec[0])
    vec[0] = utl.delete_links(vec[0])

    phrase = ''
    for elem in vec[0].split(' '):
        try:
            elem = stem(elem)  # tokens the stemmer cannot handle are kept unstemmed
        except Exception:
            pass
        phrase = phrase + ' ' + elem
    corpus.append(phrase.replace('\n', ''))
input_file.close()

vectorizer = CountVectorizer(min_df=0.0, max_df=1.0)
number_of_examples = len(corpus)
transform = vectorizer.fit_transform(corpus)
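
The imports above pull in MultinomialNB and cross_validation, but the snippet stops right after vectorizing. A minimal sketch of how those pieces could be combined, reusing the variables above and the pre-0.20 sklearn.cross_validation API the import refers to; the fold count and the accuracy metric are assumptions:

# Fit a multinomial Naive Bayes on the term counts and estimate accuracy
# with 10-fold cross-validation (10 folds chosen arbitrarily here).
clf = MultinomialNB()
scores = cross_validation.cross_val_score(clf, transform, annotation, cv=10)
print("mean accuracy: %.3f" % scores.mean())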
Example #4
import os
from stemming.porter2 import stem
from Preproc import Utils

utl = Utils()
corpus = []
annotation = []
i = 0  # counts tokens the stemmer rejected

# Negative reviews: the directory listing and the open() calls must point at the
# same folder; the exact path is assumed here, adjust to the actual data location.
for filename in os.listdir('../dataset/tokens/neg/'):
    arquivo = open('../dataset/tokens/neg/' + filename, "r")
    for line in arquivo:
        annotation.append("-1")
        # Normalize separators, then strip links, marks and punctuation.
        line = line.lower().replace(",", " ").replace(";", " ")
        line = utl.delete_links(line)
        line = utl.remove_marks(line)
        line = utl.remove_punctuation(line)
        phrase = ''
        for elem in line.split(" "):
            try:
                elem = stem(elem)
                phrase = phrase + ' ' + elem.encode("utf8")
            except Exception:
                i += 1  # token dropped; keep count of stemming failures
        corpus.append(phrase.replace('\n', ''))
    arquivo.close()

# Positive reviews: same preprocessing, labelled "1".
for filename in os.listdir('../dataset/tokens/pos/'):
    arquivo = open('../dataset/tokens/pos/' + filename, "r")
    for line in arquivo:
        annotation.append("1")
        line = line.lower().replace(",", " ").replace(";", " ")
        line = utl.delete_links(line)