def load_idf_tfidf(self):
        """Load the precomputed IDF values and the TF-IDF data of the whole corpus.

        Returns:
            tuple: (idf, tf_idf) as deserialized by FileReader.load_data().
        """
        idf_values = FileReader(settings.IDF_PATH).load_data()
        tfidf_data = FileReader(settings.TFIDF_PATH).load_data()
        return idf_values, tfidf_data
class FeatureExtraction(object):
    """Turn raw documents into features: dense bag-of-words vectors or
    whitespace-joined token strings (for a downstream TF-IDF vectorizer).

    Expects ``data`` to be a sequence of dicts with ``'content'`` and
    ``'category'`` keys (``read_feature`` instead expects a mapping with
    ``'features'`` / ``'labels'`` — presumably a previously stored feature
    file; confirm against the caller).
    """

    def __init__(self, data):
        self.data = data

    def __build_dictionary(self):
        """Tokenize every document and persist the resulting dictionary."""
        print('Building dictionary')
        dict_words = []
        total = len(self.data)
        for step, doc in enumerate(self.data, start=1):
            print("Dictionary Step {} / {}".format(step, total))
            dict_words.append(NLP(text=doc['content']).get_words_feature())
        FileStore(filePath=settings.DICTIONARY_PATH).store_dictionary(dict_words)

    def __load_dictionary(self):
        """Load the dictionary from disk, building it first if missing.

        The loaded dictionary is cached on ``self`` so the per-document
        calls made by ``get_dense`` do not re-read the file every time.
        """
        if getattr(self, 'dictionary', None) is not None:
            return  # already loaded — avoid one disk read per document
        if not os.path.exists(settings.DICTIONARY_PATH):
            self.__build_dictionary()
        self.dictionary = FileReader(settings.DICTIONARY_PATH).load_dictionary()

    def __build_dataset(self):
        """Populate self.features / self.labels with dense BoW vectors."""
        print('Building dataset')
        self.features = []
        self.labels = []
        total = len(self.data)
        for step, doc in enumerate(self.data, start=1):
            print("Step {} / {}".format(step, total))
            self.features.append(self.get_dense(doc['content']))
            self.labels.append(doc['category'])

    def get_dense(self, text):
        """Return the dense bag-of-words vector for a single text.

        Args:
            text: raw document text.

        Returns:
            list[float]: one dense vector of length len(self.dictionary).
        """
        # tokenize / remove stopwords
        words = NLP(text).get_words_feature()
        # bag of words over the (cached) dictionary
        self.__load_dictionary()
        vec = self.dictionary.doc2bow(words)
        dense = list(matutils.corpus2dense([vec], num_terms=len(self.dictionary)).T[0])
        return dense

    def get_data_and_label_tfidf(self):
        """Return (features, labels) where each feature is the document's
        tokens joined with spaces — suitable input for a TF-IDF vectorizer."""
        print('Building dataset')
        self.features = []
        self.labels = []
        total = len(self.data)
        for step, doc in enumerate(self.data, start=1):
            print("Step {} / {}".format(step, total))
            self.features.append(' '.join(NLP(doc['content']).get_words_feature()))
            self.labels.append(doc['category'])
        return self.features, self.labels

    def get_data_and_label_bow(self):
        """Return (features, labels) using dense bag-of-words vectors."""
        self.__build_dataset()
        return self.features, self.labels

    def read_feature(self):
        """Return (features, labels) from already-extracted data."""
        return self.data['features'], self.data['labels']
 def load_corpus(self):
     """Read every file under self.data_path and return the list of
     preprocessed documents, one entry per file."""
     nlp = NLP()
     documents = []
     for name in os.listdir(self.data_path):
         raw_text = FileReader(os.path.join(self.data_path, name)).read()
         nlp.set_text(raw_text)
         documents.append(nlp.preprocess())
     return documents
# Example #4
# 0
def compute_tf_idf_query(query_path, dictionary, idf):
    """Compute the TF-IDF representation of the query stored at query_path.

    Args:
        query_path: path of the file containing the raw query text.
        dictionary: corpus dictionary used by the TFIDF model.
        idf: precomputed inverse-document-frequency values.

    Returns:
        The query's TF-IDF vector as produced by TFIDF.compute_query_tf_idf.
    """
    raw_query = FileReader(query_path).read()
    # TF-IDF model initialised with the corpus dictionary and IDF values
    model = TFIDF(dictionary=dictionary, idf=idf)
    tokens = NLP(raw_query).preprocess()
    return model.compute_query_tf_idf(tokens)
 def __load_dictionary(self):
     """Ensure the dictionary file exists (building it on demand), then
     load it into self.dictionary."""
     if not os.path.exists(settings.DICTIONARY_PATH):
         self.__build_dictionary()
     self.dictionary = FileReader(settings.DICTIONARY_PATH).load_dictionary()
 def __set_stopwords(self):
     """Load the stop-word list from settings.STOP_WORDS onto self.stopwords."""
     reader = FileReader(settings.STOP_WORDS)
     self.stopwords = reader.read_stopwords()
 def __set_synonym(self):
     """Load the synonym table from settings.SYNONYM_PATH onto self.synonym."""
     reader = FileReader(settings.SYNONYM_PATH)
     self.synonym = reader.load_synonym()
    def load_dictionary(self):
        """Load and return the dictionary data stored at settings.DICTIONARY_PATH."""
        reader = FileReader(settings.DICTIONARY_PATH)
        return reader.load_dict_data()