# Preamble of the amazon_statistics script: report the start time, configure
# logging, then read the statistics settings from ./application.conf.
from lib import helpers

helpers.printCurrentTime("start ./amazon_statistics.py")

import logging

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# CONFIG
from pyhocon import ConfigFactory

config = ConfigFactory.parse_file('./application.conf')

# Keys under 'amazon-dump': dump directory and the movie reviews file.
amazon_dump_dir = config.get_string('amazon-dump.dir')
movie_reviews_file = config.get_string('amazon-dump.files.reviews.movies')

# Keys under 'amazon-dump.statistics': review text minimum length and the
# cache directory / cache files of the individual measures.
text_min_len = config.get_int('amazon-dump.statistics.review.text.min-len')
statistic_measures_cache_dir = config.get_string(
    'amazon-dump.statistics.measures.cache-dir'
)
reviews_count_file = config.get_string(
    'amazon-dump.statistics.measures.reviews-count.cache-file'
)
number_of_reviews_by_asin_file = config.get_string(
    'amazon-dump.statistics.measures.number-of-reviews-by-asin.cache-file'
)
number_of_reviews_by_person_file = config.get_string(
    'amazon-dump.statistics.measures.number-of-reviews-by-person.cache-file'
)
bow_by_asin_file = config.get_string(
    'amazon-dump.statistics.measures.bow-by-asin.cache-file'
)

import pickle
import os
# Preamble of the amazon_meta_data script: report the start time, configure
# logging, then read the dump and statistics settings from ./application.conf.
from lib import helpers

helpers.printCurrentTime("start ./amazon_meta_data.py")

import logging

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# CONFIG
from pyhocon import ConfigFactory

config = ConfigFactory.parse_file('./application.conf')

# Keys under 'amazon-dump': dump directory, movie reviews and movie meta data.
amazon_dump_dir = config.get_string('amazon-dump.dir')
movie_reviews_file = config.get_string('amazon-dump.files.reviews.movies')
movie_meta_data_file = config.get_string('amazon-dump.files.meta-data.movies')

# Keys under 'amazon-dump.statistics': review text minimum length and the
# cache directory / cache files of the individual measures.
text_min_len = config.get_int('amazon-dump.statistics.review.text.min-len')
statistic_measures_cache_dir = config.get_string(
    'amazon-dump.statistics.measures.cache-dir'
)
reviews_count_file = config.get_string(
    'amazon-dump.statistics.measures.reviews-count.cache-file'
)
number_of_reviews_by_asin_file = config.get_string(
    'amazon-dump.statistics.measures.number-of-reviews-by-asin.cache-file'
)
number_of_reviews_by_person_file = config.get_string(
    'amazon-dump.statistics.measures.number-of-reviews-by-person.cache-file'
)
bow_by_asin_file = config.get_string(
    'amazon-dump.statistics.measures.bow-by-asin.cache-file'
)
# Preamble of the train_lsi script: report the start time, configure logging,
# then read the dump and bag-of-words settings from ./application.conf.
from lib import helpers

helpers.printCurrentTime("start ./train_lsi.py")

import logging

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# CONFIG
from pyhocon import ConfigFactory

config = ConfigFactory.parse_file('./application.conf')

# Keys under 'amazon-dump': dump directory, movie reviews and movie meta data.
amazon_dump_dir = config.get_string('amazon-dump.dir')
movie_reviews_file = config.get_string('amazon-dump.files.reviews.movies')
movie_meta_data_file = config.get_string('amazon-dump.files.meta-data.movies')

# Keys under 'amazon-dump.bow': cache directory plus one cache file per
# artifact (dictionary, corpus, tfidf, lsi, lda, hdp) and the topic counts.
bow_cache_dir = config.get_string('amazon-dump.bow.cache-dir')
dictionary_cache_file = config.get_string(
    'amazon-dump.bow.dictionary.cache-file'
)
corpus_cache_file = config.get_string('amazon-dump.bow.corpus.cache-file')
tfidf_cache_file = config.get_string('amazon-dump.bow.tfidf.cache-file')
lsi_cache_file = config.get_string('amazon-dump.bow.lsi.cache-file')
lsi_num_topics = config.get_int('amazon-dump.bow.lsi.num-topics')
lda_cache_file = config.get_string('amazon-dump.bow.lda.cache-file')
lda_num_topics = config.get_int('amazon-dump.bow.lda.num-topics')
hdp_cache_file = config.get_string('amazon-dump.bow.hdp.cache-file')
# Preamble of the train_doc2vec_concat script: report the start time,
# configure logging, read the doc2vec settings from ./application.conf and
# import the libraries used further down.
from lib import helpers

helpers.printCurrentTime("start ./train_doc2vec_concat.py")

import logging

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

from pyhocon import ConfigFactory

config = ConfigFactory.parse_file('./application.conf')

# Keys under 'amazon-dump': dump directory, movie reviews and movie meta data.
amazon_dump_dir = config.get_string('amazon-dump.dir')
movie_reviews_file = config.get_string('amazon-dump.files.reviews.movies')
movie_meta_data_file = config.get_string('amazon-dump.files.meta-data.movies')

# Keys under 'amazon-dump.doc2vec.model': training parameters and cache dir.
model_size = config.get_int('amazon-dump.doc2vec.model.training.size')
model_window = config.get_int('amazon-dump.doc2vec.model.training.window')
model_min_count = config.get_int(
    'amazon-dump.doc2vec.model.training.min_count'
)
model_workers = config.get_int('amazon-dump.doc2vec.model.training.workers')
model_cache_dir = config.get_string('amazon-dump.doc2vec.model.cache-dir')

from gensim.models import doc2vec
import gzip
import nltk
import os
# Preamble of the run_evaluation script: report the start time, then pull the
# models and statistics from the sibling scripts by importing them.
from lib import helpers

helpers.printCurrentTime("start ./run_evaluation.py")

from load_perceptual_space import perceptual_space, amazon_ids
from train_doc2vec import doc2vec_model, doc2vec_labels
#from train_doc2vec_concat import doc2vec_model_concat
from train_dictionary_corpus import dictionary, corpus
from train_tfidf import tfidf, corpus_tfidf
from train_lsi import lsi, corpus_tfidf_lsi
#from train_lda import lda, corpus_lda
from amazon_statistics import number_of_reviews_by_asin, bow_by_asin

# Initialize all the models I need.
# Each bare name below only references the imported object; the print that
# follows it reports the name.
perceptual_space
print("perceptual_space")

doc2vec_model
print("doc2vec_model")

tfidf
print("tfidf")
# Script preamble (whitespace-collapsed onto one line): calls
# helpers.printCurrentTime with a start message, configures logging at INFO
# level, parses ./application.conf and reads the 'perceptual-space.file' key
# into perceptual_space_dir (despite the name, the value is passed to open()
# as a file path). All lines of that file are read into `content`; then
# perceptual_space (dict), amazon_ids and amazon_titles (lists) are created
# empty and a loop over content[1:] begins, skipping the first line because it
# holds the labels. NOTE(review): the loop body is not visible in this view.
from lib import helpers helpers.printCurrentTime("start ./train_perceptual_space.py") import logging logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # CONFIG from pyhocon import ConfigFactory config = ConfigFactory.parse_file('./application.conf') perceptual_space_dir = config.get_string('perceptual-space.file') content = [] with open(perceptual_space_dir, 'r') as f: content = f.readlines() perceptual_space = {} amazon_ids = [] amazon_titles = [] for line in content[1:]: # skip first line of content which are the labels
# Preamble of the train_dictionary_corpus script: report the start time,
# configure logging, then read the dump and bag-of-words settings from
# ./application.conf.
from lib import helpers

helpers.printCurrentTime("start ./train_dictionary_corpus.py")

import logging

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# CONFIG
from pyhocon import ConfigFactory

config = ConfigFactory.parse_file('./application.conf')

# Keys under 'amazon-dump': dump directory, movie reviews and movie meta data.
amazon_dump_dir = config.get_string('amazon-dump.dir')
movie_reviews_file = config.get_string('amazon-dump.files.reviews.movies')
movie_meta_data_file = config.get_string('amazon-dump.files.meta-data.movies')

# Keys under 'amazon-dump.bow': cache directory plus one cache file per
# artifact (dictionary, corpus, tfidf, lsi, lda, hdp) and the topic counts.
bow_cache_dir = config.get_string('amazon-dump.bow.cache-dir')
dictionary_cache_file = config.get_string(
    'amazon-dump.bow.dictionary.cache-file'
)
corpus_cache_file = config.get_string('amazon-dump.bow.corpus.cache-file')
tfidf_cache_file = config.get_string('amazon-dump.bow.tfidf.cache-file')
lsi_cache_file = config.get_string('amazon-dump.bow.lsi.cache-file')
lsi_num_topics = config.get_int('amazon-dump.bow.lsi.num-topics')
lda_cache_file = config.get_string('amazon-dump.bow.lda.cache-file')
lda_num_topics = config.get_int('amazon-dump.bow.lda.num-topics')
hdp_cache_file = config.get_string('amazon-dump.bow.hdp.cache-file')
# Preamble of the tf-idf training script: report the start time, configure
# logging, then read the dump and bag-of-words settings from
# ./application.conf.
from lib import helpers

# NOTE(review): the message names 'train_tfid.py' - confirm this matches the
# actual script file name (it may have been meant as 'train_tfidf.py').
helpers.printCurrentTime("start ./train_tfid.py")

import logging

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# CONFIG
from pyhocon import ConfigFactory

config = ConfigFactory.parse_file('./application.conf')

# Keys under 'amazon-dump': dump directory, movie reviews and movie meta data.
amazon_dump_dir = config.get_string('amazon-dump.dir')
movie_reviews_file = config.get_string('amazon-dump.files.reviews.movies')
movie_meta_data_file = config.get_string('amazon-dump.files.meta-data.movies')

# Keys under 'amazon-dump.bow': cache directory plus one cache file per
# artifact (dictionary, corpus, tfidf, lsi, lda, hdp) and the topic counts.
bow_cache_dir = config.get_string('amazon-dump.bow.cache-dir')
dictionary_cache_file = config.get_string(
    'amazon-dump.bow.dictionary.cache-file'
)
corpus_cache_file = config.get_string('amazon-dump.bow.corpus.cache-file')
tfidf_cache_file = config.get_string('amazon-dump.bow.tfidf.cache-file')
lsi_cache_file = config.get_string('amazon-dump.bow.lsi.cache-file')
lsi_num_topics = config.get_int('amazon-dump.bow.lsi.num-topics')
lda_cache_file = config.get_string('amazon-dump.bow.lda.cache-file')
lda_num_topics = config.get_int('amazon-dump.bow.lda.num-topics')
hdp_cache_file = config.get_string('amazon-dump.bow.hdp.cache-file')

# BAG OF WORDS AS DICTIONARY
import os