import os
from collections import Counter

import numpy as np

import preprocess          # project-local (assumed import path)
from tf_idf import TfIdf   # project-local (assumed import path)


def load_data(dirFolder, testRatio, featureKeepRatio=1.0):
    classes = sorted(os.listdir(dirFolder))
    vocabulary = set()
    cMap = {i: classes[i] for i in range(len(classes))}  # index -> class name
    allDocs = []
    for i, dclass in enumerate(classes):
        documents = os.listdir(os.path.join(dirFolder, dclass))
        np.random.shuffle(documents)
        splitPoint = int(testRatio * len(documents))
        trainDocs, testDocs = documents[splitPoint:], documents[:splitPoint]
        allDocs.append([trainDocs, testDocs])
        # Process training documents for vocabulary selection
        tfidf = TfIdf(os.path.join(dirFolder, dclass), trainDocs, featureKeepRatio)
        selectedWords = tfidf.selectWords()
        vocabulary = vocabulary | selectedWords
    # Featurize data according to the vocabulary selected above
    vocabulary = list(vocabulary)
    X_train, Y_train = [], []
    X_test, Y_test = [], []
    for i, dclass in enumerate(classes):
        for j in range(len(allDocs[i])):  # j == 0: train split, j == 1: test split
            for doc in allDocs[i][j]:
                processedFile = preprocess.readFile(os.path.join(dirFolder, dclass, doc))
                words = Counter(processedFile)
                features = [words.get(w, 0) for w in vocabulary]
                if j == 0:
                    X_train.append(features)
                    Y_train.append(i)
                else:
                    X_test.append(features)
                    Y_test.append(i)
    return (np.stack(X_train), Y_train), (np.stack(X_test), Y_test)
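# A minimal usage sketch; './data' and the ratio values are placeholders, and the
# corpus is assumed to be laid out as one plain-text file per document, grouped
# into one subdirectory per class:
if __name__ == '__main__':
    (X_train, Y_train), (X_test, Y_test) = load_data('./data', testRatio=0.2,
                                                     featureKeepRatio=0.1)
    print(X_train.shape)  # (num_train_docs, len(vocabulary)) term-count matrix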
import os

import gensim
import numpy as np

import preprocess          # project-local (assumed import path)
from tf_idf import TfIdf   # project-local (assumed import path)


def load_data_word2vec(dirFolder, featureKeepRatio=1.0):
    # Pre-trained 300-d GoogleNews embeddings, loaded once up front
    model = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format(
        './GoogleNews-vectors-negative300.bin', binary=True)
    classes = sorted(os.listdir(dirFolder))
    vocabulary = set()
    cMap = {i: classes[i] for i in range(len(classes))}  # index -> class name
    allDocs = []
    for i, dclass in enumerate(classes):
        documents = os.listdir(os.path.join(dirFolder, dclass))
        np.random.shuffle(documents)
        allDocs.append(documents)
        # Process documents for vocabulary selection
        tfidf = TfIdf(os.path.join(dirFolder, dclass), documents, featureKeepRatio)
        selectedWords = tfidf.selectWords()
        vocabulary = vocabulary | selectedWords
    # Featurize data according to the vocabulary selected above
    vocabulary = list(vocabulary)
    X = []

    def getIt(word):
        # Fall back to a zero vector for words missing from the embedding model
        try:
            return model[word]
        except KeyError:
            return np.zeros((300,))

    for i, dclass in enumerate(classes):
        for doc in allDocs[i]:
            processedFile = preprocess.readFile(os.path.join(dirFolder, dclass, doc))
            words = set(processedFile)
            # Embed only the vocabulary words that occur in this document; the
            # rest get zero vectors so every document has the same feature shape
            features = [getIt(w) if w in words else np.zeros((300,)) for w in vocabulary]
            X.append(features)
    return np.stack(X)
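# A minimal usage sketch; the directory is a placeholder, and the GoogleNews
# vectors file must already sit in the working directory:
if __name__ == '__main__':
    X = load_data_word2vec('./data', featureKeepRatio=0.1)
    print(X.shape)  # (num_docs, len(vocabulary), 300): one embedding per vocab word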
def init_all_states(self):
    self.retrieval = Retrieval(num_ir=NUM_OF_IR, config=self.config)
    self.tf_idf = TfIdf(self.config)
    # TODO: wait for models
    self.cluster_model = joblib.load(self.cluster_md)
    self.vec_model = Doc2Vec.load(self.vec_md)
    jieba.initialize()
def init_all_states(self):
    self.retrieval = Retrieval(num_ir=NUM_OF_IR, config=self.config)
    self.tf_idf = TfIdf(self.config)
    self.cluster_model = joblib.load(self.cluster_md)
    # self.vec_model = Doc2Vec.load(self.vec_md)
    # self.vec_model = BertClient()
    self.load_stop_words(self.config)
    jieba.initialize()
import os
from collections import Counter

import numpy as np

import preprocess          # project-local (assumed import path)
from tf_idf import TfIdf   # project-local (assumed import path)


def load_data(dirFolder, featureKeepRatio=1.0):
    classes = sorted(os.listdir(dirFolder))
    vocabulary = set()
    cMap = {i: classes[i] for i in range(len(classes))}  # index -> class name
    allDocs = []
    for i, dclass in enumerate(classes):
        documents = os.listdir(os.path.join(dirFolder, dclass))
        np.random.shuffle(documents)
        allDocs.append(documents)
        # Process documents for vocabulary selection
        tfidf = TfIdf(os.path.join(dirFolder, dclass), documents, featureKeepRatio)
        selectedWords = tfidf.selectWords()
        vocabulary = vocabulary | selectedWords
    # Featurize data according to the vocabulary selected above
    vocabulary = list(vocabulary)
    X = []
    for i, dclass in enumerate(classes):
        for doc in allDocs[i]:
            processedFile = preprocess.readFile(os.path.join(dirFolder, dclass, doc))
            words = Counter(processedFile)
            features = [words.get(w, 0) for w in vocabulary]
            X.append(features)
    return np.stack(X)
import datetime

import pandas as pd

from inverted_index import InvertedIndex
from tf_idf import TfIdf

custom_separator = "###"

if __name__ == '__main__':
    df = pd.read_csv('../data/stories.csv', sep=',', header=None)
    df_head = pd.read_csv('../data/db_books.csv', sep=',')
    array_content = df.values
    # array_segment = array_content[0:3]

    # INVERTED INDEX
    print("Init INVERTED INDEX")
    begin_time = datetime.datetime.now()
    invindex = InvertedIndex(array_content)
    invindex.process()
    print(datetime.datetime.now() - begin_time)
    invindex.saveIndexInDisc()

    # TF IDF
    print("Init TF IDF")
    begin_time = datetime.datetime.now()
    tf_idf = TfIdf(array_content)
    tf_idf.process()
    print(datetime.datetime.now() - begin_time)
    tf_idf.saveIndexInDisc()
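# For reference, a sketch of the classic weighting a TfIdf class like the one
# above presumably computes; this standalone helper is illustrative only and is
# not part of the project's API:
import math
from collections import Counter


def tf_idf_weights(documents):
    """documents: list of token lists -> one {term: tf-idf weight} dict per document."""
    n_docs = len(documents)
    # Document frequency: in how many documents each term occurs
    df = Counter(term for doc in documents for term in set(doc))
    weights = []
    for doc in documents:
        tf = Counter(doc)
        weights.append({term: (count / len(doc)) * math.log(n_docs / df[term])
                        for term, count in tf.items()})
    return weights


# Example: tf_idf_weights([['a', 'b', 'a'], ['b', 'c']]) gives 'a' and 'c'
# positive weights in their documents and 'b' weight 0 (it occurs in every document).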
parser = PreProcess()
parsed_trainning_documents = {}
print('processing...')
for k, v in reader.train.items():
    parsed_trainning_documents[k] = parser.process(v)

# Input for tf-idf: documents must be annotated with their classes.
# It takes an array of tuples as input: ([tokens], class)
parsed_trainning_documents_with_classes = []
for k in parsed_trainning_documents.keys():
    parsed_trainning_documents_with_classes += [(v, k) for v in parsed_trainning_documents[k]]

# Run tf-idf
print('generating tf.idf...')
tf_idf_calculator = TfIdf(parsed_trainning_documents_with_classes)
tf_idf_calculator.run()

# Test the KNN parameters: distance metric and value of K
for metric in ['cosine', 'euclid']:
    for k in range(5, 11, 2):
        knn = KNN(tf_idf_calculator.results, k, metric)
        # confusion_matrix[A][B] = how many times a document of class A was assigned to class B
        topics = ['baseball', 'christian', 'guns']
        confusion_matrix = {topic: {t: 0 for t in topics} for topic in topics}
        print_log = False
        i = 0
        ytrue = []
        ypred = []
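# A generic sketch of the two distance metrics iterated over above; the KNN
# class's own implementations are not shown here, so these are assumptions
# about what 'cosine' and 'euclid' denote:
import math


def euclid_distance(u, v):
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(u, v)))


def cosine_distance(u, v):
    # Distance as 1 - cosine similarity: 0 for parallel vectors, 1 for orthogonal
    dot = sum(a * b for a, b in zip(u, v))
    norm_u = math.sqrt(sum(a * a for a in u))
    norm_v = math.sqrt(sum(b * b for b in v))
    return 1.0 - dot / (norm_u * norm_v)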
import cPickle as pk

import pandas as pd

from preprocessing import InputFrame
from tf_idf import TfIdf
from NMF import LatentFeatures

filename = '../../StateNames.csv'
dataframe = pd.read_csv(filename)

# Run the male preprocessing and model
male_instance = InputFrame(dataframe, 'M')
male_instance.clean_data_from_frame()
male_frame = male_instance.gender_frame
male_tfidf = TfIdf(male_frame)
male_vectors = male_tfidf.tfidf_matrix()
male_nmf = LatentFeatures(male_vectors)
male_matrices = male_nmf.fit_model()
# male_nmf.print_W_H_features(male_tfidf.index, male_tfidf.features, 20)
male_features_dict = male_nmf.latent_features_dict(male_tfidf.index, male_tfidf.features, 20)

# Store the male model and latent features dictionary (binary mode for pickle)
with open('../data/male_nmf.pkl', 'wb') as f:
    pk.dump(male_nmf, f)
with open('../data/male_latent_features.pkl', 'wb') as f:
    pk.dump(male_features_dict, f)
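# For reference, a minimal NMF factorization with scikit-learn; the project's
# LatentFeatures wrapper presumably does something similar inside fit_model().
# Hypothetical usage on the male tf-idf vectors from above:
from sklearn.decomposition import NMF

nmf = NMF(n_components=20, random_state=42)
W = nmf.fit_transform(male_vectors)  # documents x 20 latent features
H = nmf.components_                  # 20 latent features x vocabulary terms

# Top-10 terms for the first latent feature (assumes male_tfidf.features is an
# indexable list of term strings, as its use in latent_features_dict suggests):
top_terms = [male_tfidf.features[i] for i in H[0].argsort()[::-1][:10]]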