def previous_main():
    DEBUGMODE = 0
    numFeatures = 100

    path_dataset_dav_windows = 'Dati/training_set_text.csv'
    path_class_csv = 'Dati/training_set_features.csv'
    path_model_file = 'Dati/model.dat'

    cleaner = TweetsCleaner.TweetsCleaner()
    loader = DatasetLoader.DatasetLoader()
    model = VectorModel.VectorModel()
    classificator = BayesanClassificator.BayesanClassificator()
    evaluator = ClassifierEvaluation.ClassifierEvaluation()

    tweets_dataset = loader.LoadTweets(path_dataset_dav_windows)
    tweets_cleaned = cleaner.ProcessDatasetDict(tweets_dataset)
    features_dataset = loader.LoadFeatures(path_class_csv, 400)
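    # NOTE: only the first 400 feature rows are loaded, matching the [:400] phrase slice below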
    """
        Trasforma il vettore delle features in un dizionario con chiave IdDoc e valore la classe corrispondente
        (1 : neutra, 2: positiva, 3: negativa, 4: mista
    """
    classes_dataset = loader.createClasses(features_dataset)
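    # illustrative shape (hypothetical ids): {101: 1, 102: 3, ...}, i.e. doc id -> class label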
    """
        Genero il Modello TF-IDF
    """
    all_phrases = list(tweets_cleaned.values())[:400]

    # pair each phrase with a sequential document id
    phrases_tuples = list(enumerate(all_phrases))

    # rebuild and cache the TF-IDF model unless debug mode is on and a cached copy exists
    if not DEBUGMODE or not os.path.exists(path_model_file):
        tfidf = model.get_tfidf(phrases_tuples)
        model.persist_tfidf(tfidf, path_model_file)
    else:
        tfidf = model.deserialize_tfidf(path_model_file)

    doc_index = model.get_doc_index(tfidf)
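    # doc_index presumably maps each document to its TF-IDF vector; it feeds the document-term table below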

    # take the class labels as the gold-standard solution
    labels = numpy.array(list(classes_dataset.values()))

    # apply LSA to reduce the document-term matrix to numFeatures dimensions
    reduced = model.LSA(model.get_doc_index_table(doc_index), numFeatures)
    # rescale values to [0, 1]
    reduced = loader.NormalizeDataset(reduced)
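    # reduced is now an (n_docs x numFeatures) matrix in [0, 1], the input for the Bayes test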

    BayesTest(reduced, labels)
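
# BayesTest is not shown in this example. A minimal sketch of what it might do,
# assuming scikit-learn (GaussianNB and the 80/20 split are assumptions, not the
# project's BayesanClassificator):
#
# from sklearn.model_selection import train_test_split
# from sklearn.naive_bayes import GaussianNB
# from sklearn.metrics import accuracy_score
#
# def BayesTest(reduced, labels):
#     X_train, X_test, y_train, y_test = train_test_split(
#         reduced, labels, test_size=0.2, random_state=0)
#     clf = GaussianNB().fit(X_train, y_train)
#     print("accuracy:", accuracy_score(y_test, clf.predict(X_test)))
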
import os
import pickle
import numpy
import TweetsCleaner
import DatasetLoader
import VectorModel
import BayesanClassificator
import ClassifierEvaluation

if __name__ == "__main__":

    DEBUGMODE = 1

    path_dataset_dav_windows = 'Dati/training_set_text.csv'
    path_class_csv = 'Dati/training_set_features.csv'
    path_model_file = 'Dati/model.dat'

    cleaner = TweetsCleaner.TweetsCleaner()
    loader = DatasetLoader.DatasetLoader()
    model = VectorModel.VectorModel()
    evaluator = ClassifierEvaluation.ClassifierEvaluation()

    tweets_dataset = loader.LoadTweets(path_dataset_dav_windows)
    tweets_cleaned = cleaner.ProcessDatasetDict(tweets_dataset)
    features_dataset = loader.LoadFeatures(path_class_csv)
    """
        Trasforma il vettore delle features in un dizionario con chiave IdDoc e valore la classe corrispondente
        (1 : neutra, 2: positiva, 3: negativa, 4: mista
    """
    classes_dataset = loader.createClasses(features_dataset)
    """
        Genero il Modello TF-IDF
    """
    all_phrases = list(tweets_cleaned.values())
Example #3
#!/usr/bin/env python
__author__ = "Tom Kocmi"

import logging
import VectorModel
import Cons
import GenerateRules
import new_fixes
import time

logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
start = time.time()  # for counting the time

model = VectorModel.getVectorModel()  # this loads the existing model
# to generate a new model instead, pass True as the argument
# generate prefixes and suffixes from the vocabulary of the model;
# if the fixes already exist, they are loaded from file instead of being regenerated.
# To force regeneration, pass True as the second parameter.
prefixes, suffixes = new_fixes.generateFixes(model.vocab.keys())
rules = GenerateRules.generate(prefixes, suffixes, model)
print rules

# experiments with the model, the words must be in the dictionary
# print model.most_similar(positive=['winston', 'love'], negative=['war'])
# print model.doesnt_match("winston julia brother goldstein".split())
# print model.similarity("winston", "julia")

print "Time: " + str(time.time() - start)
Example #4
#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__ = "Tom Kocmi"

import logging
import VectorModel
import Cons, Fixes, GenerateRules
import time
import pickle
import Queue

start = time.time()  # for counting the time
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)

model = VectorModel.getVectorModel()
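# loads the existing model; as in Example #3, pass True to regenerate it instead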

# rebuild and persist a downsampled vocabulary (uncomment to regenerate it):
# vocabulary = Fixes.downsampleVocabulary(model, Cons.MAXWORDS4AFFIXES)
# with open("models/vocabulary2.data", 'w') as f:
#     pickle.dump(vocabulary, f)

# load the previously persisted vocabulary so that generateFixes has its input
with open("models/vocabulary.data", 'r') as f:
    vocabulary = pickle.load(f)

prefixes, suffixes = Fixes.generateFixes(vocabulary)

# rules = GenerateRules.generate(prefixes, suffixes, model, vocabulary)
# with open("models/rules6.data", 'w') as f:
#     pickle.dump(rules, f)