def load_idf_tfidf(self):
    """Load the stored idf data and the tf-idf data of the whole document set.

    Both are read through ``FileReader(...).load_data()``: the idf from
    ``settings.IDF_PATH`` first, then the tf-idf from ``settings.TFIDF_PATH``.

    Returns:
        A ``(idf, tf_idf)`` tuple holding the two loaded objects.
    """
    sources = (settings.IDF_PATH, settings.TFIDF_PATH)
    # Loaded sequentially in the order listed above: idf first, tf-idf second.
    idf, tf_idf = [FileReader(path).load_data() for path in sources]
    return idf, tf_idf
class FeatureExtraction(object):
    """Build feature/label lists from a collection of documents.

    ``data`` is used in two ways by the methods below:

    * the bag-of-words and tf-idf builders iterate over it and read the
      ``'content'`` and ``'category'`` keys of every record;
    * :meth:`read_feature` indexes it directly with ``'features'`` and
      ``'labels'``.
    """

    def __init__(self, data):
        self.data = data
        # Filled lazily by __load_dictionary() and reused afterwards so the
        # dictionary file is not re-read for every document.
        self.dictionary = None

    def __build_dictionary(self):
        """Extract the word features of every record and persist them.

        The collected word lists are handed to
        ``FileStore(...).store_dictionary`` at ``settings.DICTIONARY_PATH``.
        """
        print('Building dictionary')
        dict_words = []
        for step, record in enumerate(self.data, 1):
            print("Dictionary Step {} / {}".format(step, len(self.data)))
            dict_words.append(NLP(text=record['content']).get_words_feature())
        FileStore(filePath=settings.DICTIONARY_PATH).store_dictionary(dict_words)

    def __load_dictionary(self):
        """Ensure ``self.dictionary`` is loaded, building the file if missing.

        The dictionary is read from disk only once per instance; subsequent
        calls return immediately. Previously every ``get_dense`` call re-read
        the file, i.e. once per document while building a dataset.
        """
        if getattr(self, 'dictionary', None) is not None:
            return
        if not os.path.exists(settings.DICTIONARY_PATH):
            self.__build_dictionary()
        self.dictionary = FileReader(settings.DICTIONARY_PATH).load_dictionary()

    def __build_dataset(self):
        """Fill ``self.features`` / ``self.labels`` with dense vectors and categories."""
        print('Building dataset')
        self.features = []
        self.labels = []
        for step, record in enumerate(self.data, 1):
            print("Step {} / {}".format(step, len(self.data)))
            self.features.append(self.get_dense(record['content']))
            self.labels.append(record['category'])

    def get_dense(self, text):
        """Return the dense bag-of-words vector of ``text`` as a list.

        The vector has one entry per term of the loaded dictionary
        (``num_terms=len(self.dictionary)``).
        """
        # Word features of the text (the original comment notes that this
        # step removes stop words -- behaviour lives in NLP, not shown here).
        words = NLP(text).get_words_feature()
        # Bag of words against the (cached) dictionary.
        self.__load_dictionary()
        vec = self.dictionary.doc2bow(words)
        # corpus2dense is given a one-document corpus; after transposing,
        # row 0 is that document's vector.
        dense_matrix = matutils.corpus2dense([vec], num_terms=len(self.dictionary))
        return list(dense_matrix.T[0])

    def get_data_and_label_tfidf(self):
        """Return ``(features, labels)`` where each feature is a space-joined word string.

        The strings are the word features of each record's ``'content'``;
        labels are the records' ``'category'`` values.
        """
        print('Building dataset')
        self.features = []
        self.labels = []
        for step, record in enumerate(self.data, 1):
            print("Step {} / {}".format(step, len(self.data)))
            self.features.append(' '.join(NLP(record['content']).get_words_feature()))
            self.labels.append(record['category'])
        return self.features, self.labels

    def get_data_and_label_bow(self):
        """Return ``(features, labels)`` where each feature is a dense bag-of-words vector."""
        self.__build_dataset()
        return self.features, self.labels

    def read_feature(self):
        """Return the ``'features'`` and ``'labels'`` entries of ``self.data`` as a tuple."""
        return self.data['features'], self.data['labels']
def load_corpus(self):
    """Read and preprocess every file found in ``self.data_path``.

    Files are visited in the order returned by ``os.listdir``. A single
    ``NLP`` instance is reused: each file's text is set on it and the result
    of ``preprocess()`` is collected.

    Returns:
        A list with one preprocessed entry per file.
    """
    documents = []
    processor = NLP()
    for entry in os.listdir(self.data_path):
        full_path = os.path.join(self.data_path, entry)
        raw_text = FileReader(full_path).read()
        processor.set_text(raw_text)
        documents.append(processor.preprocess())
    return documents
def compute_tf_idf_query(query_path, dictionary, idf):
    """Compute the tf-idf representation of the query stored at ``query_path``.

    Args:
        query_path: Path of the file holding the query text.
        dictionary: Dictionary passed through to ``TFIDF``.
        idf: idf values passed through to ``TFIDF``.

    Returns:
        Whatever ``TFIDF.compute_query_tf_idf`` returns for the preprocessed
        query text.
    """
    raw_query = FileReader(query_path).read()
    # The scorer is initialised with the existing dictionary and idf values.
    scorer = TFIDF(dictionary=dictionary, idf=idf)
    preprocessed_query = NLP(raw_query).preprocess()
    return scorer.compute_query_tf_idf(preprocessed_query)
def __load_dictionary(self):
    """Load the dictionary into ``self.dictionary``.

    When no file exists at ``settings.DICTIONARY_PATH`` the dictionary is
    built first via ``self.__build_dictionary()``.
    """
    dictionary_missing = not os.path.exists(settings.DICTIONARY_PATH)
    if dictionary_missing:
        self.__build_dictionary()
    reader = FileReader(settings.DICTIONARY_PATH)
    self.dictionary = reader.load_dictionary()
def __set_stopwords(self):
    """Read the stop words from ``settings.STOP_WORDS`` into ``self.stopwords``."""
    reader = FileReader(settings.STOP_WORDS)
    self.stopwords = reader.read_stopwords()
def __set_synonym(self):
    """Load the synonym data from ``settings.SYNONYM_PATH`` into ``self.synonym``."""
    reader = FileReader(settings.SYNONYM_PATH)
    self.synonym = reader.load_synonym()
def load_dictionary(self):
    """Return the dictionary data read from ``settings.DICTIONARY_PATH``."""
    reader = FileReader(settings.DICTIONARY_PATH)
    dictionary_data = reader.load_dict_data()
    return dictionary_data