Example #1
def load_data(dirFolder, testRatio, featureKeepRatio=1.0):
    classes = sorted(os.listdir(dirFolder))
    vocabulary = set()
    cMap = {i: classes[i] for i in range(len(classes))}
    allDocs = []
    for i, dclass in enumerate(classes):
        documents = os.listdir(os.path.join(dirFolder, dclass))
        np.random.shuffle(documents)
        splitPoint = int(testRatio * len(documents))
        trainDocs, testDocs = documents[splitPoint:], documents[:splitPoint]
        allDocs.append([trainDocs, testDocs])
        # Process documents for vocabulary selection
        tfidf = TfIdf(os.path.join(dirFolder, dclass), trainDocs,
                      featureKeepRatio)
        selectedWords = tfidf.selectWords()
        vocabulary = vocabulary | selectedWords
    # Featurize data according to above vocabulary
    vocabulary = list(vocabulary)
    X_train, Y_train = [], []
    X_test, Y_test = [], []
    for i, dclass in enumerate(classes):
        for j in range(len(allDocs[i])):
            for doc in allDocs[i][j]:
                processedFile = preprocess.readFile(
                    os.path.join(dirFolder, dclass, doc))
                words = Counter(processedFile)
                features = [words.get(w, 0) for w in vocabulary]
                if j == 0:  # allDocs[i][0] holds the training docs, allDocs[i][1] the test docs
                    X_train.append(features)
                    Y_train.append(i)
                else:
                    X_test.append(features)
                    Y_test.append(i)
    return (np.stack(X_train), Y_train), (np.stack(X_test), Y_test)
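A minimal usage sketch for the loader above; the corpus path, split ratio and feature ratio are placeholders, and the directory is assumed to contain one sub-folder per class:

# Hypothetical call; "./data/corpus" stands in for a folder with one sub-directory per class.
(X_train, Y_train), (X_test, Y_test) = load_data("./data/corpus", testRatio=0.2, featureKeepRatio=0.5)
print(X_train.shape, len(Y_train))   # (num_train_docs, vocabulary_size), num_train_docs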
Example #2
def load_data_word2vec(dirFolder, featureKeepRatio=1.0):
    model = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format(
        './GoogleNews-vectors-negative300.bin', binary=True)
    classes = sorted(os.listdir(dirFolder))
    vocabulary = set()
    cMap = {i:classes[i] for i in range(len(classes))}
    allDocs = []
    for i, dclass in enumerate(classes):
        documents = os.listdir(os.path.join(dirFolder, dclass))
        np.random.shuffle(documents)
        allDocs.append(documents)
        # Process documents for vocabulary selection
        tfidf = TfIdf(os.path.join(dirFolder, dclass), documents, featureKeepRatio)
        selectedWords = tfidf.selectWords()
        vocabulary = vocabulary | selectedWords
    # Featurize data according to above vocabulary
    vocabulary = list(vocabulary)
    X = []
    def getIt(word):
        # Return the pretrained embedding for a word, or a zero vector if it is out of vocabulary
        try:
            return model[word]
        except KeyError:
            return np.zeros((300,))

    for i, dclass in enumerate(classes):
        for doc in allDocs[i]:
            processedFile = preprocess.readFile(os.path.join(dirFolder, dclass, doc))
            words = set(processedFile)
            # Embedding for each vocabulary word that occurs in this document, zero vector otherwise
            features = [getIt(w) if w in words else np.zeros((300,)) for w in vocabulary]
            X.append(features)
    return np.stack(X)
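A minimal usage sketch, assuming the GoogleNews binary sits next to the script; the corpus path is a placeholder. Each document is featurized as one 300-d vector per vocabulary word, so the stacked result is three-dimensional:

# Hypothetical call; returns an array of shape (num_documents, vocabulary_size, 300).
X = load_data_word2vec("./data/corpus", featureKeepRatio=0.5)
print(X.shape)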
Example #3
def init_all_states(self):
    self.retrieval = Retrieval(num_ir=NUM_OF_IR, config=self.config)
    self.tf_idf = TfIdf(self.config)
    # TODO: wait for models
    self.cluster_model = joblib.load(self.cluster_md)
    self.vec_model = Doc2Vec.load(self.vec_md)
    jieba.initialize()
Example #4
def init_all_states(self):
    self.retrieval = Retrieval(num_ir=NUM_OF_IR, config=self.config)
    self.tf_idf = TfIdf(self.config)
    self.cluster_model = joblib.load(self.cluster_md)
    # self.vec_model = Doc2Vec.load(self.vec_md)
    # self.vec_model = BertClient()
    self.load_stop_words(self.config)
    jieba.initialize()
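As a hedged sketch of how the initialised pieces might be exercised afterwards (the query text and model path are placeholders, not taken from the original project):

# Hypothetical follow-up: tokenise a query with jieba and embed it with the Doc2Vec model.
tokens = jieba.lcut("待检索的用户问题")               # jieba.lcut returns a list of tokens
vec_model = Doc2Vec.load("path/to/doc2vec.model")     # same gensim API as in the snippet
query_vec = vec_model.infer_vector(tokens)            # fixed-length vector for the query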
Example #5
def load_data(dirFolder, featureKeepRatio=1.0):
    classes = sorted(os.listdir(dirFolder))
    vocabulary = set()
    cMap = {i:classes[i] for i in range(len(classes))}
    allDocs = []
    for i, dclass in enumerate(classes):
        documents = os.listdir(os.path.join(dirFolder, dclass))
        np.random.shuffle(documents)
        allDocs.append(documents)
        # Process documents for vocabulary selection
        tfidf = TfIdf(os.path.join(dirFolder, dclass), documents, featureKeepRatio)
        selectedWords = tfidf.selectWords()
        vocabulary = vocabulary | selectedWords
    # Featurize data according to above vocabulary
    vocabulary = list(vocabulary)
    X = []
    for i, dclass in enumerate(classes):
        for doc in allDocs[i]:
            processedFile = preprocess.readFile(os.path.join(dirFolder, dclass, doc))
            words = Counter(processedFile)
            features = [words.get(w, 0) for w in vocabulary]
            X.append(features)
    return np.stack(X)
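For comparison, the same count-over-a-fixed-vocabulary featurization can be sketched with scikit-learn's CountVectorizer; this is an illustrative alternative, not part of the original code:

# Sketch: raw term counts restricted to a precomputed vocabulary, mirroring the Counter loop above.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(vocabulary=["ball", "game", "player"])   # placeholder vocabulary
X = vectorizer.transform(["the player hit the ball", "no relevant words here"])
print(X.toarray())   # one row of counts per document, one column per vocabulary word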
Example #6
import datetime
from inverted_index import InvertedIndex
from tf_idf import TfIdf
import pandas as pd

custom_separator = "###"

if __name__ == '__main__':
    df = pd.read_csv('../data/stories.csv', sep=',', header=None)
    df_head = pd.read_csv('../data/db_books.csv', sep=',')
    
    array_content = df.values
    # array_segment = array_content[0:3]
    
    # INVERTED INDEX
    print("Init INVERTED INDEX")
    begin_time = datetime.datetime.now()
    invindex = InvertedIndex(array_content)
    invindex.process()
    print(datetime.datetime.now() - begin_time)   
    invindex.saveIndexInDisc()
    
    # TF IDF
    print("Init TF IDF")
    begin_time = datetime.datetime.now()
    tf_idf = TfIdf(array_content)
    tf_idf.process()
    print(datetime.datetime.now() - begin_time)   
    tf_idf.saveIndexInDisc()
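The begin_time / print(now - begin_time) pattern repeats for each stage; one way to factor it out is a small context manager. A hedged sketch, not part of the original script:

# Sketch of a reusable timer; usage: `with timed("TF IDF"): tf_idf.process()`
import datetime
from contextlib import contextmanager

@contextmanager
def timed(label):
    begin_time = datetime.datetime.now()
    yield
    print(label, datetime.datetime.now() - begin_time)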
Example #7
 
parser = PreProcess()
parsed_trainning_documents = {}
print('processing...')
for k, v in reader.train.items():
  parsed_trainning_documents[k] = parser.process(v)

# Input for tf-idf: the documents must be annotated with their classes.
# It receives as input an array of tuples: ([tokens], class)
parsed_trainning_documents_with_classes = []
for k in parsed_trainning_documents.keys():
  parsed_trainning_documents_with_classes += [(v, k) for v in parsed_trainning_documents[k]]

# Run tf-idf
print('generating tf.idf...')
tf_idf_calculator = TfIdf(parsed_trainning_documents_with_classes)
tf_idf_calculator.run()

# Test the KNN parameters: distance metric and value of K
for metric in ['cosine', 'euclid']:
  for k in range(5, 11, 2):
    knn = KNN(tf_idf_calculator.results, k, metric)

    # confusion_matrix[A][B] = how many times a document of class A was assigned to class B
    topics = ['baseball', 'christian', 'guns']
    confusion_matrix = {topic: {t: 0 for t in topics} for topic in topics}

    print_log = False
    i = 0
    ytrue = []
    ypred = []
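The snippet ends before ytrue and ypred are filled; a hedged sketch of how the evaluation loop might continue, assuming the KNN object exposes a classify-style method (both test_documents and the method name are hypothetical):

# Hypothetical continuation: classify held-out documents and tally the confusion matrix.
for tokens, true_topic in test_documents:            # test_documents is a placeholder
  predicted_topic = knn.classify(tokens)             # hypothetical KNN API
  confusion_matrix[true_topic][predicted_topic] += 1
  ytrue.append(true_topic)
  ypred.append(predicted_topic)

accuracy = sum(1 for t, p in zip(ytrue, ypred) if t == p) / float(len(ytrue))
print(metric, k, accuracy)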
Example #8
import pandas as pd
from preprocessing import InputFrame
from tf_idf import TfIdf
from NMF import LatentFeatures
import cPickle as pk

filename = '../../StateNames.csv'
dataframe = pd.read_csv(filename)

#Run the male preprocessing and model
male_instance = InputFrame(dataframe, 'M')
male_instance.clean_data_from_frame()
male_frame = male_instance.gender_frame

male_tfidf = TfIdf(male_frame)
male_vectors = male_tfidf.tfidf_matrix()

male_nmf = LatentFeatures(male_vectors)
male_matrices = male_nmf.fit_model()

# male_nmf.print_W_H_features(male_tfidf.index, male_tfidf.features, 20)
male_features_dict = male_nmf.latent_features_dict(male_tfidf.index,
                                                   male_tfidf.features, 20)

# Store the male model and latent features dictionary
with open('../data/male_nmf.pkl', 'wb') as f:   # binary mode for pickle
    pk.dump(male_nmf, f)

with open('../data/male_latent_features.pkl', 'wb') as f:
    pk.dump(male_features_dict, f)
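A minimal sketch of reading the stored artefacts back, using the same cPickle alias as the script (paths unchanged, binary mode assumed):

# Hypothetical reload of the pickled model and feature dictionary.
with open('../data/male_nmf.pkl', 'rb') as f:
    male_nmf = pk.load(f)

with open('../data/male_latent_features.pkl', 'rb') as f:
    male_features_dict = pk.load(f)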