Example #1
 def __preprocess(self, text):
     # TokenizeFromList expects a list of texts, so wrap the single document
     text = TidenePreProcess.TokenizeFromList(self.tokenizer, [text])
     if self.stop_set is not None:
         text = TidenePreProcess.CleanStopWords(self.stop_set).clean(text)
     # Only one document was wrapped above, so return its (optionally stemmed) token list
     for t in text:
         if self.stemmer is not None:
             t = [self.stemmer.stem(word) for word in t]
         return t
Example #2
 def clean(self, text):
     text = TidenePreProcess.Tokenize(self.tokenizer).tokenize(text)
     if self.stop_set is not None:
         text = TidenePreProcess.CleanStopWords(self.stop_set).clean(text)
     if self.stemmer is not None:
         text = [self.stemmer.stem(word) for word in text]
     if self.remove_digits:
         text = [word for word in text if not word.isdigit()]
     return text
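
The clean() method above chains four optional steps: tokenize, drop stop words, stem, and drop purely numeric tokens. As a self-contained illustration of that same pipeline using plain NLTK instead of TidenePreProcess (the Portuguese stop list, RSLP stemmer, and regexp tokenizer are assumptions suggested by the sample data in Example #7, not part of the original code):

 # Sketch of the same tokenize -> stop-word -> stem -> digit-filter pipeline with NLTK.
 # Requires nltk.download('stopwords') and nltk.download('rslp') to have been run once.
 from nltk.corpus import stopwords
 from nltk.stem import RSLPStemmer
 from nltk.tokenize import RegexpTokenizer

 def clean_text(text):
     tokens = RegexpTokenizer(r'\w+').tokenize(text.lower())
     stop_set = set(stopwords.words('portuguese'))
     tokens = [word for word in tokens if word not in stop_set]
     stemmer = RSLPStemmer()
     tokens = [stemmer.stem(word) for word in tokens]
     return [word for word in tokens if not word.isdigit()]

 print(clean_text("azo corantes escarlates reativos a fibra 123"))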
Example #3
 def __iter__(self):
     corpus = GetFilesFromPath(self.path_list)
     for data in corpus:
         # data[0] is the document tag/label, data[1] is the raw text
         text = TidenePreProcess.CleanStopWords(
             self.stop_set,
             TidenePreProcess.TokenizeFromList(self.tokenizer, [data[1]]))
         for t in text:
             t = [self.stemmer.stem(word) for word in t]
             yield doc2vec.TaggedDocument(t, [data[0]])
Example #4
 def __iter__(self):
     self.corpus = GetFilesFromPath(self.path_dict)
     for data in self.corpus:
         text = TidenePreProcess.Tokenize(self.tokenizer).clean(data[1])
         if self.stop_set is not None:
             text = TidenePreProcess.CleanStopWords(
                 self.stop_set).clean(text)
         for t in text:
             if self.stemmer is not None:
                 t = [self.stemmer.stem(word) for word in t]
             yield t
Example #5
	def fit(self, X, y):
		tokenFunction = TidenePreProcess.TokenizeFromStreamFunction(self.tokenizer).get  # tokenize
		tfidf = TfidfVectorizer(tokenizer=tokenFunction, ngram_range=(1, 3), min_df=0, stop_words=self.stopSet)
		tfidf.fit(X, y)
		# Words never seen during fit fall back to the maximum IDF, i.e. they are treated as maximally rare
		max_idf = max(tfidf.idf_)
		self.word2weight = defaultdict(
			lambda: max_idf,
			[(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
		return self
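
Example #5's fit() only stores the IDF weights; the transform() that usually accompanies this pattern averages each document's word vectors weighted by those IDF values. A minimal sketch of that common counterpart, assuming the object also holds a self.word2vec mapping (word -> vector) and a dimensionality self.dim, neither of which appears in the original example:

	# Sketch (not from the original project): TF-IDF weighted averaging of word vectors.
	# X is expected to be an iterable of token lists.
	def transform(self, X):
		import numpy as np
		return np.array([
			np.mean([self.word2vec[w] * self.word2weight[w]
				for w in words if w in self.word2vec]
				or [np.zeros(self.dim)], axis=0)
			for words in X])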
Example #6
	def getVec(tokenizer, stopSet):
		tokenFunction = TidenePreProcess.TokenizeFromStreamFunction(tokenizer).get  # tokenize
		vectorizer = TfidfVectorizer(tokenizer=tokenFunction, ngram_range=(1, 3), min_df=0, stop_words=stopSet)
		transformer = TfidfTransformer()
		return (vectorizer, transformer)
Example #7
'''
Structuring patents on a dataframe
'''
dataframe = TideneMisc.LstToDataFrame.transform(
    corpus)  # return a data frame with data and target/cats
catNames = list(set(dataframe.target))  # categories/classes
#print("CatNames:",catNames)
#print (dataframe.data) # 0  azo corantes escarlates reativos a fibra azo c...
#print (dataframe.target) # 0     C07D

# preprocess docs and fill the LabeledSentence structure
dataset = []
for index, doc in enumerate(dataframe.data):
    preprocDoc = TidenePreProcess.CleanStopWords(
        stopSet, TidenePreProcess.TokenizeFromList(tokenizer, [doc]))  # returns a list
    for p in preprocDoc:
        dataset.append(
            LabeledSentence(words=p, tags=[dataframe.target[index]]))  # tags = list

# split the dataset into training and testing sets
print(" Gensim 80-20 training and testing \n")
splitRateTrainPerc = 80
splitRateTestPerc = 20
randomInt = 42
trainDocs, testDocs = train_test_split(dataset,
                                       test_size=(splitRateTestPerc / 100),
                                       train_size=(splitRateTrainPerc / 100),
                                       random_state=randomInt)
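
With the split in place, the usual next step in gensim is to train a paragraph-vector model on trainDocs. A minimal sketch assuming gensim 4.x, where TaggedDocument replaces LabeledSentence, the size parameter is called vector_size, and document vectors live on model.dv (model.docvecs in 3.x); the hyperparameter values below are placeholders, not the original project's settings:

from gensim.models import doc2vec

model = doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=20)
model.build_vocab(trainDocs)
model.train(trainDocs, total_examples=model.corpus_count, epochs=model.epochs)

# Sanity check: infer a vector for one held-out document and inspect its nearest training tags
inferred = model.infer_vector(testDocs[0].words)
print(model.dv.most_similar([inferred], topn=3))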