def manual_main():
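    """
    Runs the topic model analysis once for every number of topics in
    num_topics_list (here only Constants.TOPIC_MODEL_NUM_TOPICS) and writes
    the results of each cycle to a CSV and a JSON file.
    """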

    csv_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'csv', Constants.RESULTS_FOLDER, None, None,
        False)
    json_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'json', Constants.RESULTS_FOLDER, None, None,
        False)
    print(json_file_name)
    print(csv_file_name)

    num_topics_list = [Constants.TOPIC_MODEL_NUM_TOPICS]
    num_cycles = len(num_topics_list)
    cycle_index = 1
    for num_topics in num_topics_list:
        print('\ncycle_index: %d/%d' % (cycle_index, num_cycles))
        new_dict = {
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
        }

        print(new_dict)

        Constants.update_properties(new_dict)
        results = Constants.get_properties_copy()
        results.update(analyze_topics(include_stability=False))

        write_results_to_csv(csv_file_name, results)
        write_results_to_json(json_file_name, results)

        cycle_index += 1
def cli_main():
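    """
    Command-line entry point: reads the optional --numtopics argument,
    updates the Constants properties accordingly, runs the topic analysis
    (including the stability metric) and writes the results to CSV and JSON.
    """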
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-t', '--numtopics', metavar='int', type=int,
        nargs=1, help='The number of topics of the topic model')

    args = parser.parse_args()
    num_topics = args.numtopics[0] if args.numtopics is not None else None

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})

    results = Constants.get_properties_copy()
    results.update(analyze_topics(include_stability=True))

    csv_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'csv', Constants.RESULTS_FOLDER, None, None,
        False)
    json_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'json', Constants.RESULTS_FOLDER, None, None,
        False)

    write_results_to_csv(csv_file_name, results)
    write_results_to_json(json_file_name, results)
Example #3
def exp_ae_visual_features():
    exp_name = 'ae_visual_features'
    out_base_dir = os.path.join(
        os.getcwd(),
        'symlinks/exp/google_images/' + \
        'normalized_resnet_features_recon_loss_trained_on_google')
    exp_const = ExpConstants(exp_name, out_base_dir)
    exp_const.log_dir = os.path.join(exp_const.exp_dir, 'log')
    exp_const.model_dir = os.path.join(exp_const.exp_dir, 'models')
    exp_const.batch_size = 10000
    exp_const.lr = 1e-2
    exp_const.num_epochs = 1000

    feature_dir = os.path.join(
        os.getcwd(),
        'symlinks/exp/google_images/' + \
        'normalized_resnet_features_recon_loss_trained_on_google')
    data_const = VisualFeaturesDatasetConstants(feature_dir)

    model_const = Constants()
    model_const.encoder = EncoderConstants()
    model_const.encoder.output_dims = 300
    model_const.decoder = DecoderConstants()
    model_const.decoder.input_dims = 300

    train_ae_visual.main(exp_const, data_const, model_const)
Example #4
def calculate_topic_stability(records):
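    """
    Estimates the stability of the topic model: trains one model on the full
    set of records and TOPIC_MODEL_STABILITY_ITERATIONS - 1 further models on
    random 80% samples, then compares the term rankings produced by the
    different models.
    """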

    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        num_records = len(records)
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    all_term_rankings = []

    context_extractor =\
        topic_model_creator.create_topic_model(records, None, None)
    terms_matrix = get_topic_model_terms(
        context_extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)
    all_term_rankings.append(terms_matrix)

    sample_ratio = 0.8

    print('Total iterations: %d' % Constants.TOPIC_MODEL_STABILITY_ITERATIONS)
    for _ in range(Constants.TOPIC_MODEL_STABILITY_ITERATIONS - 1):
        sampled_records = sample_list(records, sample_ratio)
        context_extractor = \
            topic_model_creator.train_context_extractor(sampled_records)
        terms_matrix = get_topic_model_terms(
            context_extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)
        all_term_rankings.append(terms_matrix)

    return calculate_stability(all_term_rankings)
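
The sample_list helper called above is not included in this listing; a minimal sketch of its assumed behaviour (drawing a random subsample of the requested ratio, without replacement) could look like this:

import random

def sample_list(records, sample_ratio):
    # Hypothetical helper, not taken from the source: return a random
    # subsample containing sample_ratio * len(records) elements.
    sample_size = int(len(records) * sample_ratio)
    return random.sample(records, sample_size)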
Example #5
def exp_train():
    exp_name = 'EXP_NAME'
    out_base_dir = os.path.join(os.getcwd(), 'symlinks/exp/EXP_GROUP')
    exp_const = ExpConstants(exp_name, out_base_dir)
    exp_const.model_dir = os.path.join(exp_const.exp_dir, 'models')
    exp_const.log_dir = os.path.join(exp_const.exp_dir, 'log')
    exp_const.vis_dir = os.path.join(exp_const.exp_dir, 'vis')
    exp_const.log_step = 10
    exp_const.model_save_step = 1000
    exp_const.val_step = 1000
    exp_const.num_val_samples = 1000
    exp_const.batch_size = 32
    exp_const.num_epochs = 1000
    exp_const.lr = 0.01
    exp_const.momentum = 0.9
    exp_const.num_workers = 5
    exp_const.optimizer = 'SGD'
    exp_const.subset = {'training': 'train', 'validation': 'val'}

    data_const = DATASET_CONSTANTS()

    model_const = Constants()
    model_const.model_num = None
    model_const.net = NET_CONSTANTS()
    model_const.net_path = os.path.join(exp_const.model_dir,
                                        f'net_{model_const.model_num}')

    train.main(exp_const, data_const, model_const)
Example #6
    def count_frequencies(self):
        """
        Counts the number of reviews each user and item have and stores the
        results in two separate files, one for the users and another one for the
        items. Note that the integer IDs are used and not the original user and
        item IDs
        """
        print('%s: count frequencies' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        user_frequency_map = ETLUtils.count_frequency(
            self.records, Constants.USER_INTEGER_ID_FIELD)
        item_frequency_map = ETLUtils.count_frequency(
            self.records, Constants.ITEM_INTEGER_ID_FIELD)

        user_frequency_file = Constants.generate_file_name(
            'user_frequency_map', 'json', Constants.CACHE_FOLDER, None, None,
            False
        )
        item_frequency_file = Constants.generate_file_name(
            'item_frequency_map', 'json', Constants.CACHE_FOLDER, None, None,
            False
        )

        ETLUtils.save_json_file(user_frequency_file, [user_frequency_map])
        ETLUtils.save_json_file(item_frequency_file, [item_frequency_map])
Example #7
def exp_combine_glove_and_visual_features_with_ae():
    exp_name = 'ae_glove_and_visual'
    out_base_dir = os.path.join(
        os.getcwd(),
        'symlinks/exp/google_images/' + \
        'normalized_resnet_embeddings_recon_loss_trained_on_google')
    exp_const = ExpConstants(exp_name, out_base_dir)
    exp_const.log_dir = os.path.join(exp_const.exp_dir, 'log')
    exp_const.model_dir = os.path.join(exp_const.exp_dir, 'models')
    exp_const.batch_size = 10000
    exp_const.lr = 1e-2
    exp_const.num_epochs = 1000

    concat_embeddings_dir = os.path.join(
        os.getcwd(),
        'symlinks/exp/google_images/' + \
        'normalized_resnet_embeddings_recon_loss_trained_on_google/' + \
        'concat_glove_and_visual')
    data_const = ConcatEmbedDatasetConstants(concat_embeddings_dir)
    data_const.embeddings_h5py = os.path.join(data_const.concat_dir,
                                              'subset_visual_word_vecs.h5py')
    data_const.word_to_idx_json = os.path.join(
        data_const.concat_dir, 'subset_visual_word_vecs_idx.json')

    model_const = Constants()
    model_const.encoder = EncoderConstants()
    model_const.decoder = DecoderConstants()

    train_ae.main(exp_const, data_const, model_const)
Example #8
def create_topic_models():
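    """
    Runs the reviews preprocessor full cycle once for every number of topics
    between 2 and 60.
    """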
    my_list = range(2, 61)

    for i in my_list:
        Constants.update_properties({Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: i})
        reviews_preprocessor = ReviewsPreprocessor(use_cache=True)
        reviews_preprocessor.full_cycle()
def full_cycle():
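    """
    Analyzes the topic model context-richness for every combination of number
    of topics, bag-of-words type and target review type, and collects the
    results for export to CSV and JSON.
    """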

    num_topics_list = [5, 10, 20, 40]
    # bow_type_list = [None, 'NN', 'JJ', 'VB']
    review_type_list = ['specific', 'generic']
    # num_topics_list = [10]
    bow_type_list = ['NN']
    results = []

    for num_topics, bow_type, review_type in itertools.product(
            num_topics_list, bow_type_list, review_type_list):

        Constants.update_properties({
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
            Constants.BOW_TYPE_FIELD: bow_type,
            Constants.TOPIC_MODEL_TARGET_REVIEWS_FIELD: review_type
        })

        result = analyze_topics()
        result.update({
            Constants.BOW_TYPE_FIELD: bow_type,
            Constants.TOPIC_MODEL_TARGET_REVIEWS_FIELD: review_type
        })
        results.append(result)

    for result in results:
        print(result)

    prefix = Constants.RESULTS_FOLDER + Constants.ITEM_TYPE + \
        '_topic_model_context_richness'
    csv_file_path = prefix + '.csv'
    json_file_path = prefix + '.json'
    headers = sorted(results[0].keys())
    # Assumed completion of the truncated snippet: persist the results in the
    # same way as the divergence analysis further below.
    ETLUtils.save_csv_file(csv_file_path, results, headers)
    ETLUtils.save_json_file(json_file_path, results)
def run_recommender(args):
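    """
    Casts the numeric hyper-parameters in args to integers, updates the
    Constants properties with them and runs a ContextTopNRunner evaluation,
    returning a dictionary with the negated evaluation metric as 'loss'.

    :param args: a dictionary of hyper-parameter values
    """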
    import sys
    # sys.path.append('/Users/fpena/UCC/Thesis/projects/yelp/source/python')
    sys.path.append('/home/fpena/yelp/source/python')
    from utils.constants import Constants
    from evaluation.context_top_n_runner import ContextTopNRunner

    print('\n\n************************\n************************\n')
    print('args', args)

    # Cast integer values
    args[Constants.FM_ITERATIONS_FIELD] = \
        int(args[Constants.FM_ITERATIONS_FIELD])
    args[Constants.FM_NUM_FACTORS_FIELD] = \
        int(args[Constants.FM_NUM_FACTORS_FIELD])
    if args[Constants.USE_CONTEXT_FIELD]:
        args[Constants.TOPIC_MODEL_ITERATIONS_FIELD] = \
            int(args[Constants.TOPIC_MODEL_ITERATIONS_FIELD])
        args[Constants.TOPIC_MODEL_PASSES_FIELD] = \
            int(args[Constants.TOPIC_MODEL_PASSES_FIELD])
        args[Constants.TOPIC_MODEL_NUM_TOPICS_FIELD] = \
            int(args[Constants.TOPIC_MODEL_NUM_TOPICS_FIELD])

    Constants.update_properties(args)

    # Finish updating parameters

    my_context_top_n_runner = ContextTopNRunner()
    results = my_context_top_n_runner.run()
    results['loss'] = -results[Constants.EVALUATION_METRIC]
    results['status'] = 'ok'

    print('loss', results['loss'])

    return results
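
The 'loss' and 'status' keys returned above follow the dictionary convention expected by hyperopt-style objective functions. The source does not show how run_recommender is actually driven, so the following is only an illustrative sketch and the search space bounds are invented for the example:

from hyperopt import fmin, hp, tpe

from utils.constants import Constants  # same import used inside run_recommender

# Hypothetical search space: the field names come from Constants, the ranges do not.
space = {
    Constants.FM_ITERATIONS_FIELD: hp.quniform('fm_iterations', 50, 500, 50),
    Constants.FM_NUM_FACTORS_FIELD: hp.quniform('fm_num_factors', 1, 64, 1),
    Constants.USE_CONTEXT_FIELD: True,
    Constants.TOPIC_MODEL_ITERATIONS_FIELD: hp.quniform('tm_iterations', 50, 200, 50),
    Constants.TOPIC_MODEL_PASSES_FIELD: hp.quniform('tm_passes', 1, 10, 1),
    Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: hp.quniform('tm_num_topics', 2, 60, 1),
}
best = fmin(fn=run_recommender, space=space, algo=tpe.suggest, max_evals=50)
print(best)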
Example #11
def main(**kwargs):
    exp_base_dir = coco_paths['exp_dir']
    if kwargs['dataset'] == 'flickr':
        exp_base_dir = flickr_paths['exp_dir']
    exp_const = ExpConstants(kwargs['exp_name'], exp_base_dir)
    exp_const.model_dir = os.path.join(exp_const.exp_dir, 'models')
    exp_const.seed = 0
    exp_const.contextualize = not kwargs['no_context']
    exp_const.random_lang = kwargs['random_lang']

    data_const = FlickrDatasetConstants(kwargs['subset'])

    model_const = Constants()
    model_const.model_num = kwargs['model_num']
    model_const.object_encoder = ObjectEncoderConstants()
    model_const.object_encoder.context_layer.output_attentions = True
    model_const.object_encoder.object_feature_dim = 2048
    model_const.cap_encoder = CapEncoderConstants()
    model_const.cap_encoder.output_attentions = True
    model_const.cap_info_nce_layers = kwargs['cap_info_nce_layers']
    if model_const.model_num == -100:
        filename = os.path.join(exp_const.exp_dir, 'results_val_best.json')
        results = io.load_json_object(filename)
        model_const.model_num = results['model_num']
        print('Selected model num:', model_const.model_num)

    model_const.object_encoder_path = os.path.join(
        exp_const.model_dir, f'object_encoder_{model_const.model_num}')
    model_const.lang_sup_criterion_path = os.path.join(
        exp_const.model_dir, f'lang_sup_criterion_{model_const.model_num}')
    if exp_const.random_lang is True:
        model_const.cap_encoder_path = os.path.join(
            exp_const.model_dir, f'cap_encoder_{model_const.model_num}')

    eval_flickr_phrase_loc.main(exp_const, data_const, model_const)
Example #12
def cli_main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-t',
                        '--numtopics',
                        metavar='int',
                        type=int,
                        nargs=1,
                        help='The number of topics of the topic model')

    args = parser.parse_args()
    num_topics = args.numtopics[0] if args.numtopics is not None else None

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})

    results = Constants.get_properties_copy()
    results.update(analyze_topics(include_stability=True))

    csv_file_name = Constants.generate_file_name('topic_model_analysis', 'csv',
                                                 Constants.RESULTS_FOLDER,
                                                 None, None, False)
    json_file_name = Constants.generate_file_name('topic_model_analysis',
                                                  'json',
                                                  Constants.RESULTS_FOLDER,
                                                  None, None, False)

    write_results_to_csv(csv_file_name, results)
    write_results_to_json(json_file_name, results)
Example #14
def manual_main():

    csv_file_name = Constants.generate_file_name('topic_model_analysis', 'csv',
                                                 Constants.RESULTS_FOLDER,
                                                 None, None, False)
    json_file_name = Constants.generate_file_name('topic_model_analysis',
                                                  'json',
                                                  Constants.RESULTS_FOLDER,
                                                  None, None, False)
    print(json_file_name)
    print(csv_file_name)

    num_topics_list = [Constants.TOPIC_MODEL_NUM_TOPICS]
    num_cycles = len(num_topics_list)
    cycle_index = 1
    for num_topics in num_topics_list:
        print('\ncycle_index: %d/%d' % (cycle_index, num_cycles))
        new_dict = {
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
        }

        print(new_dict)

        Constants.update_properties(new_dict)
        results = Constants.get_properties_copy()
        results.update(analyze_topics(include_stability=False))

        write_results_to_csv(csv_file_name, results)
        write_results_to_json(json_file_name, results)

        cycle_index += 1
Example #15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c', '--cycle', metavar='int', type=int,
        nargs=1, help='The index of the running cycle')
    parser.add_argument(
        '-f', '--fold', metavar='int', type=int,
        nargs=1, help='The index of the cross validation fold')
    parser.add_argument(
        '-t', '--numtopics', metavar='int', type=int,
        nargs=1, help='The number of topics of the topic model')

    args = parser.parse_args()
    fold = args.fold[0] if args.fold is not None else None
    cycle = args.cycle[0] if args.cycle is not None else None
    num_topics = args.numtopics[0] if args.numtopics is not None else None

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})

    if fold is None and cycle is None:
        records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            num_records = len(records)
            records = records[:num_records // 2]
        print('num_reviews', len(records))

        create_topic_model(records, None, None)
    else:
        create_single_topic_model(cycle, fold)
def preprocess_data():
    my_list = range(2, 61)

    for i in my_list:
        Constants.update_properties({Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: i})
        reviews_preprocessor = ReviewsPreprocessor(use_cache=True)
        reviews_preprocessor.full_cycle()
Example #17
def dataset_bucket_analysis_by_field(field):
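    """
    Loads the processed records of the fourcity_hotel dataset and prints
    frequency statistics for the given field (e.g. the number of reviews per
    user or per item).
    """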
    # Set the dataset
    hotel_dataset_properties = {Constants.BUSINESS_TYPE_FIELD: 'fourcity_hotel'}
    Constants.update_properties(hotel_dataset_properties)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    print('Loaded %d records' % len(records))

    user_frequency_map = {}

    for record in records:

        user_id = record[field]
        if user_id not in user_frequency_map:
            user_frequency_map[user_id] = 0
        user_frequency_map[user_id] += 1

    print('There is a total of %d %ss' % (len(user_frequency_map), field))
    sorted_x = sorted(user_frequency_map.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_x[0])
    print(sorted_x[1])
    print(sorted_x[2])
    # print(user_frequency_map)

    # Number of reviews per user
    rda = ReviewsDatasetAnalyzer(records)
    users_summary = rda.summarize_reviews_by_field(field)
    print('Average number of reviews per %s: %f' % (field,
          float(rda.num_reviews) / rda.num_users))
    users_summary.plot(kind='line', rot=0)

    pandas.set_option('display.max_rows', len(users_summary))
    print(users_summary)
    pandas.reset_option('display.max_rows')
def _exp_top_boxes_per_hoi(out_base_dir, data_const):
    args = parser.parse_args()
    not_specified_args = manage_required_args(
        args,
        parser,
        required_args=['model_num'],
        optional_args=[
            'verb_given_appearance',
            'verb_given_human_appearance',
            'verb_given_object_appearance',
            'verb_given_boxes_and_object_label',
            'verb_given_human_pose',
            'rcnn_det_prob'])

    exp_name = 'factors'
    if args.rcnn_det_prob:
        exp_name += '_rcnn_det_prob'
    if args.verb_given_appearance:
        exp_name += '_appearance'
    if args.verb_given_human_appearance:
        exp_name += '_human_appearance'
    if args.verb_given_object_appearance:
        exp_name += '_object_appearance'
    if args.verb_given_boxes_and_object_label:
        exp_name += '_boxes_and_object_label'
    if args.verb_given_human_pose:
        exp_name += '_human_pose'

    exp_const = ExpConstants(
        exp_name=exp_name,
        out_base_dir=out_base_dir)
    exp_const.model_dir = os.path.join(exp_const.exp_dir,'models')
    exp_const.num_to_vis = 10

    data_const.pred_hoi_dets_h5py = os.path.join(
        exp_const.exp_dir,
        f'pred_hoi_dets_test_{args.model_num}.hdf5')
    hoi_cand_dir = os.path.join(
        os.getcwd(),
        'data_symlinks/hico_exp/hoi_candidates')
    data_const.human_pose_feats_hdf5 = os.path.join(
        hoi_cand_dir,
        'human_pose_feats_test.hdf5')
    data_const.num_pose_keypoints = 18
    
    model_const = Constants()
    model_const.model_num = args.model_num
    model_const.hoi_classifier = HoiClassifierConstants()
    model_const.hoi_classifier.verb_given_appearance = args.verb_given_appearance
    model_const.hoi_classifier.verb_given_boxes_and_object_label = args.verb_given_boxes_and_object_label
    model_const.hoi_classifier.verb_given_human_pose = args.verb_given_human_pose
    model_const.hoi_classifier.rcnn_det_prob = args.rcnn_det_prob
    model_const.hoi_classifier.model_pth = os.path.join(
        exp_const.model_dir,
        f'hoi_classifier_{model_const.model_num}')

    vis_top_boxes_per_hoi.main(exp_const, data_const, model_const)
Example #19
    def full_cycle(self):
        Constants.print_properties()
        print('%s: full cycle' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        utilities.plant_seeds()

        if self.use_cache and \
                os.path.exists(Constants.PROCESSED_RECORDS_FILE):
            print('Records have already been processed')
            self.records = \
                ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
        else:
            self.load_records()

            if 'yelp' in Constants.ITEM_TYPE:
                self.transform_yelp_records()
            elif 'fourcity' in Constants.ITEM_TYPE:
                self.transform_fourcity_records()

            self.add_integer_ids()
            self.clean_reviews()
            self.remove_duplicate_reviews()
            self.tag_reviews_language()
            self.remove_foreign_reviews()
            self.lemmatize_records()
            self.remove_users_with_low_reviews()
            self.remove_items_with_low_reviews()
            self.count_frequencies()
            self.shuffle_records()
            print('total_records: %d' % len(self.records))
            self.classify_reviews()
            self.build_bag_of_words()
            self.tag_contextual_reviews()
            # self.load_full_records()
            self.build_dictionary()
            self.build_corpus()
            self.label_review_targets()
            self.export_records()

        self.count_specific_generic_ratio()
        # self.export_to_triplet()

        rda = ReviewsDatasetAnalyzer(self.records)
        print('density: %f' % rda.calculate_density_approx())
        print('sparsity: %f' % rda.calculate_sparsity_approx())
        print('total_records: %d' % len(self.records))
        user_ids = \
            extractor.get_groupby_list(self.records, Constants.USER_ID_FIELD)
        item_ids = \
            extractor.get_groupby_list(self.records, Constants.ITEM_ID_FIELD)
        print('total users', len(user_ids))
        print('total items', len(item_ids))

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            self.separate_recsys_topic_model_records()
def cycle_eval_topic_model(metric, num_topics_list):
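    """
    Evaluates the topic model with the given metric for every number of
    topics in num_topics_list, appending each result to the same CSV file.
    """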

    csv_file_name = Constants.generate_file_name(
        metric, 'csv', Constants.RESULTS_FOLDER, None, None,
        False)

    for topic in num_topics_list:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic})
        results = run_eval_topic_model(metric)
        topic_model_analyzer.write_results_to_csv(csv_file_name, results)
Example #22
def main(**kwargs):
    exp_base_dir = coco_paths['exp_dir']
    if kwargs['dataset'] == 'flickr':
        exp_base_dir = flickr_paths['exp_dir']
    exp_const = ExpConstants(kwargs['exp_name'], exp_base_dir)
    exp_const.log_dir = os.path.join(exp_const.exp_dir, 'logs')
    exp_const.model_dir = os.path.join(exp_const.exp_dir, 'models')
    exp_const.vis_dir = os.path.join(exp_const.exp_dir, 'vis')
    exp_const.dataset = kwargs['dataset']
    exp_const.optimizer = 'Adam'
    exp_const.lr = kwargs['lr']
    exp_const.momentum = None
    exp_const.num_epochs = 10
    exp_const.log_step = 20
    # Save models approx. twice every epoch
    exp_const.model_save_step = 400000 // (2 * kwargs['train_batch_size'])  # e.g. 4000 = 400000 / (2 * 50)
    if exp_const.dataset == 'flickr':
        exp_const.model_save_step = 150000 // (2 * kwargs['train_batch_size'])
    val_freq_factor = 2
    if kwargs['val_frequently'] is True:
        val_freq_factor = 1
    exp_const.val_step = val_freq_factor * exp_const.model_save_step  # set to 1*model_save_step for plotting mi vs perf
    exp_const.num_val_samples = None
    exp_const.train_batch_size = kwargs['train_batch_size']
    exp_const.val_batch_size = 20
    exp_const.num_workers = 10
    exp_const.seed = 0
    exp_const.neg_noun_loss_wt = kwargs['neg_noun_loss_wt']
    exp_const.self_sup_loss_wt = kwargs['self_sup_loss_wt']
    exp_const.lang_sup_loss_wt = kwargs['lang_sup_loss_wt']
    exp_const.contextualize = not kwargs['no_context']
    exp_const.random_lang = kwargs['random_lang']

    DatasetConstants = CocoDatasetConstants
    if exp_const.dataset == 'flickr':
        DatasetConstants = FlickrDatasetConstants

    data_const = {
        'train': DatasetConstants('train'),
        'val': DatasetConstants('val'),
    }

    model_const = Constants()
    model_const.model_num = kwargs['model_num']
    model_const.object_encoder = ObjectEncoderConstants()
    model_const.object_encoder.context_layer.output_attentions = True
    model_const.object_encoder.object_feature_dim = 2048
    model_const.cap_encoder = CapEncoderConstants()
    model_const.cap_encoder.output_attentions = True
    model_const.cap_info_nce_layers = kwargs['cap_info_nce_layers']
    model_const.object_encoder_path = os.path.join(
        exp_const.model_dir, f'object_encoder_{model_const.model_num}')
    model_const.self_sup_criterion_path = os.path.join(
        exp_const.model_dir, f'self_sup_criterion_{model_const.model_num}')
    model_const.lang_sup_criterion_path = os.path.join(
        exp_const.model_dir, f'lang_sup_criterion_{model_const.model_num}')

    train(exp_const, data_const, model_const)
def _exp_eval(out_base_dir, data_const):
    args = parser.parse_args()
    not_specified_args = manage_required_args(
        args,
        parser,
        required_args=['model_num'],
        optional_args=[
            'verb_given_appearance',
            'verb_given_human_appearance',
            'verb_given_object_appearance',
            'verb_given_boxes_and_object_label',
            'verb_given_human_pose',
            'rcnn_det_prob'])

    exp_name = 'factors'
    if args.rcnn_det_prob:
        exp_name += '_rcnn_det_prob'
    if args.verb_given_appearance:
        exp_name += '_appearance'
    if args.verb_given_human_appearance:
        exp_name += '_human_appearance'
    if args.verb_given_object_appearance:
        exp_name += '_object_appearance'
    if args.verb_given_boxes_and_object_label:
        exp_name += '_boxes_and_object_label'
    if args.verb_given_human_pose:
        exp_name += '_human_pose'

    exp_const = ExpConstants(
        exp_name=exp_name,
        out_base_dir=out_base_dir)
    exp_const.model_dir = os.path.join(exp_const.exp_dir, 'models')
    data_const.balanced_sampling = False
    
    model_const = Constants()
    model_const.model_num = args.model_num
    model_const.hoi_classifier = HoiClassifierConstants()
    model_const.hoi_classifier.verb_given_appearance = args.verb_given_appearance
    model_const.hoi_classifier.verb_given_human_appearance = args.verb_given_human_appearance
    model_const.hoi_classifier.verb_given_object_appearance = args.verb_given_object_appearance
    model_const.hoi_classifier.verb_given_boxes_and_object_label = args.verb_given_boxes_and_object_label
    model_const.hoi_classifier.verb_given_human_pose = args.verb_given_human_pose
    model_const.hoi_classifier.rcnn_det_prob = args.rcnn_det_prob
    model_const.hoi_classifier.model_pth = os.path.join(
        exp_const.model_dir,
        f'hoi_classifier_{model_const.model_num}')

    if isinstance(data_const, FeatureConstantsVcoco):
        data_sign = 'vcoco'
    else:
        data_sign = 'hico'
    evaluate.main(exp_const, data_const, model_const, data_sign)
Example #24
def exp_concat_random_with_glove():
    exp_name = 'concat_with_glove_100' # alt. xformed_
    out_base_dir = os.path.join(
        os.getcwd(),
        'symlinks/exp/multi_sense_cooccur/linear_100')
    exp_const = ExpConstants(exp_name,out_base_dir)
    exp_const.random_dim = 100

    data_const = Constants()
    glove_const = GloveConstantsFactory.create(dim='100')
    data_const.glove_idx = glove_const.word_to_idx_json
    data_const.glove_h5py = glove_const.embeddings_h5py

    concat_random_with_glove.main(exp_const,data_const)
Example #25
    def run_single_fold(self, parameters):

        fold = parameters['fold']

        Constants.update_properties(parameters)

        Constants.print_properties()

        utilities.plant_seeds()
        self.load()

        records = self.original_records

        # self.plant_seeds()
        total_cycle_time = 0.0
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        split = 1 - (1 / float(num_folds))
        self.records = copy.deepcopy(records)
        if Constants.SHUFFLE_DATA:
            self.shuffle(self.records)

        fold_start = time.time()
        cv_start = float(fold) / num_folds
        print('\nFold: %d/%d' % ((fold + 1), num_folds))

        self.create_tmp_file_names(0, fold)
        self.train_records, self.test_records = \
            ETLUtils.split_train_test_copy(
                self.records, split=split, start=cv_start)
        # subsample_size = int(len(self.train_records)*0.5)
        # self.train_records = self.train_records[:subsample_size]
        self.get_records_to_predict(True)
        if Constants.USE_CONTEXT:
            if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
                self.load_cache_context_topics(None, None)
            else:
                context_extractor = self.train_topic_model(0, fold)
                self.find_reviews_topics(context_extractor, 0, fold)
        else:
            self.context_rich_topics = []
        self.predict()
        metrics = self.evaluate()

        fold_end = time.time()
        fold_time = fold_end - fold_start
        total_cycle_time += fold_time
        self.clear()
        print("Total fold %d time = %f seconds" % ((fold + 1), fold_time))

        return metrics
Example #26
def exp_extract_embeddings():
    args = parser.parse_args()
    not_specified_args = manage_required_args(
        args,
        parser,
        required_args=[
            'embed_dim',
            'xform',
            'model_num',
            'syn'])

    exp_name = f'{args.xform}_{args.embed_dim}'
    out_base_dir = os.path.join(
        os.getcwd(),
        'symlinks/exp/multi_sense_cooccur')
    exp_const = ExpConstants(exp_name,out_base_dir)
    exp_const.model_dir = os.path.join(exp_const.exp_dir,'models')
    exp_const.cooccur_types = [
        'syn',
        'attr_attr',
        'obj_attr',
        'obj_hyp',
        'context'
    ]
    if not args.syn:
        exp_const.cooccur_types = exp_const.cooccur_types[1:]

    data_const = MultiSenseCooccurDatasetConstants()
    data_const.cooccur_csv = os.path.join(
        os.getcwd(),
        'symlinks/exp/multi_sense_cooccur/cooccurrences/merged_cooccur.csv')

    model_const = Constants()
    model_const.model_num = args.model_num
    model_const.net = LogBilinearConstants()
    model_const.net.num_words = 93553
    model_const.net.embed_dims = args.embed_dim
    model_const.net.two_embedding_layers = False
    model_const.net.xform_type = args.xform
    model_const.net.xform_num_layers = None
    model_const.net.use_bias = True
    model_const.net.use_fx = False
    model_const.net.cooccur_types = copy.deepcopy(exp_const.cooccur_types)
    model_const.net_path = os.path.join(
        exp_const.model_dir,
        f'net_{model_const.model_num}')

    extract_embeddings.main(exp_const,data_const,model_const)
    extract_embeddings_xformed.main(exp_const,data_const,model_const)
def _exp_train(out_base_dir,
               data_const_train,
               data_const_val,
               data_sign='hico'):
    args = parser.parse_args()
    not_specified_args = manage_required_args(
        args,
        parser,
        required_args=['imgs_per_batch', 'fp_to_tp_ratio'],
        optional_args=[
            'verb_given_appearance', 'verb_given_human_appearance',
            'verb_given_object_appearance',
            'verb_given_boxes_and_object_label', 'verb_given_human_pose',
            'rcnn_det_prob'
        ])

    exp_name = 'factors'
    if args.rcnn_det_prob:
        exp_name += '_rcnn_det_prob'
    if args.verb_given_appearance:
        exp_name += '_appearance'
    if args.verb_given_human_appearance:
        exp_name += '_human_appearance'
    if args.verb_given_object_appearance:
        exp_name += '_object_appearance'
    if args.verb_given_boxes_and_object_label:
        exp_name += '_boxes_and_object_label'
    if args.verb_given_human_pose:
        exp_name += '_human_pose'

    exp_const = ExpConstants(exp_name=exp_name, out_base_dir=out_base_dir)
    exp_const.log_dir = os.path.join(exp_const.exp_dir, 'log')
    exp_const.model_dir = os.path.join(exp_const.exp_dir, 'models')
    exp_const.num_epochs = 10
    exp_const.imgs_per_batch = args.imgs_per_batch
    exp_const.lr = 1e-3

    model_const = Constants()
    model_const.hoi_classifier = HoiClassifierConstants(data_sign)
    model_const.hoi_classifier.verb_given_appearance = args.verb_given_appearance
    model_const.hoi_classifier.verb_given_human_appearance = args.verb_given_human_appearance
    model_const.hoi_classifier.verb_given_object_appearance = args.verb_given_object_appearance
    model_const.hoi_classifier.verb_given_boxes_and_object_label = args.verb_given_boxes_and_object_label
    model_const.hoi_classifier.verb_given_human_pose = args.verb_given_human_pose
    model_const.hoi_classifier.rcnn_det_prob = args.rcnn_det_prob

    train.main(exp_const, data_const_train, data_const_val, model_const,
               data_sign)
def run_tests():

    combined_parameters = parameter_combinator.get_combined_parameters()

    test_cycle = 1
    num_tests = len(combined_parameters)
    for properties in combined_parameters:
        Constants.update_properties(properties)
        context_top_n_runner = WordContextTopNRunner()

        print('\n\n******************\nTest %d/%d\n******************\n' %
              (test_cycle, num_tests))

        context_top_n_runner.perform_cross_validation()
        test_cycle += 1
Example #29
def run_tests():

    combined_parameters = parameter_combinator.hotel_context_parameters()

    test_cycle = 1
    num_tests = len(combined_parameters)
    for properties in combined_parameters:
        Constants.update_properties(properties)
        context_top_n_runner = ContextTopNRunner()

        print('\n\n******************\nTest %d/%d\n******************\n' %
              (test_cycle, num_tests))

        context_top_n_runner.perform_cross_validation()
        test_cycle += 1
Example #30
    def full_cycle(self):
        Constants.print_properties()
        print('%s: full cycle' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        utilities.plant_seeds()

        if self.use_cache and \
                os.path.exists(Constants.PROCESSED_RECORDS_FILE):
            print('Records have already been processed')
            self.records = \
                ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
        else:
            self.preprocess()

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            self.separate_recsys_topic_model_records()
Example #31
    def evaluate(self, settings, time_file=None):

        constants = Constants()
        if not settings.random_agent:
            net, tmp_net, init = self.get_model(
                settings
            )  # Initialize various constants for 3D reconstruction.
            saver = tf.train.Saver()

        # Tensorboard saver.
        #saver = tf.train.Saver()

        sess = None
        if not settings.random_agent:
            config = tf.ConfigProto()

            sess = tf.Session(config=config)

        if settings.random_agent:
            agent = RandomAgent()
        else:
            sess.run(init)
            self.load_net_weights(saver, sess, settings, latest=False)

            sess.graph.finalize()
            agent = NetAgent(net, None)
            net.set_session(sess)

        if settings.carla:
            self.evaluate_method_carla(agent, constants, sess, settings)

        else:
            self.evaluate_methods_cs(agent, constants, sess, settings)
def load_pipeline():
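    """
    Loads the best document-classification hyper-parameters from the cache
    (running full_cycle first if the file does not exist yet) and returns a
    (resampler, classifier) Pipeline configured with those parameters.
    """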

    best_hyperparams_file_name = Constants.generate_file_name(
        'best_hyperparameters', 'json', Constants.CACHE_FOLDER, None, None,
        False)

    if not os.path.exists(best_hyperparams_file_name):
        print('Best hyperparameters file not found, running full cycle')
        full_cycle()

    with open(best_hyperparams_file_name, 'r') as json_file:
        file_contents = json_file.read()
        parameters = json.loads(file_contents)

        print(parameters)

        classifiers = {
            'logisticregression': LogisticRegression(),
            'svc': SVC(),
            'kneighborsclassifier': KNeighborsClassifier(),
            'decisiontreeclassifier': DecisionTreeClassifier(),
            'nusvc': NuSVC(),
            'randomforestclassifier': RandomForestClassifier()
        }

        classifier = classifiers[parameters['classifier'].lower()]
        # print(classifier)
        classifier_params = get_classifier_params(parameters)
        classifier.set_params(**classifier_params)
        print(classifier)

        resampler = sampler_factory.create_sampler(
            parameters['resampler'], Constants.DOCUMENT_CLASSIFIER_SEED)

        return Pipeline([('resampler', resampler), ('classifier', classifier)])
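
A short usage sketch for the pipeline returned above, assuming a labelled training set is available; the feature matrix and labels below are random placeholders, not data from the source:

import numpy

x_train = numpy.random.rand(100, 5)             # placeholder features
y_train = numpy.random.randint(0, 2, size=100)  # placeholder binary labels

pipeline = load_pipeline()
pipeline.fit(x_train, y_train)  # fit the (resampler, classifier) pipeline
print(pipeline.predict(x_train[:10]))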
Example #34
    def export_records(self):
        print('%s: exporting transformed records' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        records_to_export = []
        desired_fields = [
            Constants.USER_INTEGER_ID_FIELD,
            Constants.ITEM_INTEGER_ID_FIELD,
            Constants.RATING_FIELD,
            Constants.CONTEXT_FIELD,
        ]

        for record in self.records:
            new_record = {field: record[field] for field in desired_fields}
            records_to_export.append(new_record)

        file_name = Constants.generate_file_name(
            'recsys_formatted_context_records',
            'json',
            Constants.CACHE_FOLDER,
            None,
            None,
            True,
            True,
            uses_carskit=False,
            normalize_topics=True,
            format_context=True)
        ETLUtils.save_json_file(file_name, records_to_export)
def full_cycle(metric):
    csv_file_name = Constants.generate_file_name(metric, 'csv',
                                                 Constants.RESULTS_FOLDER,
                                                 None, None, False)
    json_file_name = Constants.generate_file_name(metric, 'json',
                                                  Constants.RESULTS_FOLDER,
                                                  None, None, False)
    print(json_file_name)
    print(csv_file_name)

    properties = Constants.get_properties_copy()
    results = evaluate_topic_model(metric)
    print(results)
    results.update(properties)

    ETLUtils.write_row_to_csv(csv_file_name, results)
    ETLUtils.write_row_to_json(json_file_name, results)
def get_topic_model_prefix(folder='', seed=None):

    prefix = 'topic_model'
    if seed is not None:
        prefix += '_seed-' + str(seed)

    return Constants.generate_file_name(
        prefix, '', folder, None, None, True, True)[:-1]
Example #37
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-f', '--fold', metavar='int', type=int,
        nargs=1, help='The index of the cross validation fold')

    args = parser.parse_args()
    fold = args.fold[0]

    new_properties = {
        Constants.NESTED_CROSS_VALIDATION_CYCLE_FIELD: fold,
        Constants.CROSS_VALIDATION_STRATEGY_FIELD: 'nested_validate'
    }

    Constants.update_properties(new_properties)

    context_top_n_runner.run_tests()
def get_topic_model_prefix(folder='', seed=None):

    prefix = 'topic_model'
    if seed is not None:
        prefix += '_seed-' + str(seed)

    return Constants.generate_file_name(prefix, '', folder, None, None, True,
                                        True)[:-1]
Example #39
def save_results(results):

    """
    Takes the results given by the run_carskit function, extends them with
    the Constants.get_properties() dictionary and saves each of them to a
    JSON file.

    :type results: list[dict]
    :param results: the list of result dictionaries to save
    """
    properties = Constants.get_properties_copy()

    json_file = Constants.generate_file_name('carskit_results', 'json',
                                             OUTPUT_FOLDER, None, None, False)

    for result in results:
        result.update(properties)
        write_results_to_json(json_file, result)
def full_cycle(metric):
    csv_file_name = Constants.generate_file_name(
        metric, 'csv', Constants.RESULTS_FOLDER, None,
        None, False)
    json_file_name = Constants.generate_file_name(
        metric, 'json', Constants.RESULTS_FOLDER, None,
        None, False)
    print(json_file_name)
    print(csv_file_name)

    properties = Constants.get_properties_copy()
    results = evaluate_topic_model(metric)
    print(results)
    results.update(properties)

    ETLUtils.write_row_to_csv(csv_file_name, results)
    ETLUtils.write_row_to_json(json_file_name, results)
Example #41
def topic_stability_main():
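    """
    Computes the topic model stability for each number of topics in
    num_topic_list and prints the mean, minimum and maximum score obtained
    for every value.
    """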

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    # num_topic_list = range(2, 101)
    num_topic_list = [2, 5]
    results = {}
    for num_topics in num_topic_list:
        new_properties = {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics}
        Constants.update_properties(new_properties)
        results[num_topics] = calculate_topic_stability(records)

    print('Results:')
    for num_topics in num_topic_list:
        scores = results[num_topics]
        print('%d: %.4f [%.4f,%.4f]' %
              (num_topics, numpy.nanmean(scores), numpy.nanmin(scores),
               numpy.nanmax(scores)))
Example #42
def save_results(results):

    """
    Takes the results given by the run_carskit function, extends them with
    the Constants.get_properties() dictionary and saves each of them to a
    JSON file.

    :type results: list[dict]
    :param results: the list of result dictionaries to save
    """
    properties = Constants.get_properties_copy()

    json_file = Constants.generate_file_name(
        'carskit_results', 'json', OUTPUT_FOLDER, None, None, False)

    for result in results:
        result.update(properties)
        write_results_to_json(json_file, result)
Example #43
def load_topic_model(cycle_index, fold_index):
    file_path = \
        Constants.generate_file_name(
            'topic_model', 'pkl', Constants.CACHE_FOLDER,
            cycle_index, fold_index, True)
    print(file_path)
    with open(file_path, 'rb') as read_file:
        topic_model = pickle.load(read_file)
    return topic_model
def create_all_term_rankings(records, metric):
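    """
    Trains one context extractor on the full set of records and several more
    on sampled subsets (or on the full set again when the metric requires a
    sample ratio of None), returning the list of term-ranking matrices
    produced by the individual models.
    """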
    print('%s: creating all term rankings' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    all_term_rankings = []

    # context_extractor =\
    #     topic_model_creator.create_topic_model(records, None, None)
    # terms_matrix = get_topic_model_terms(
    #     context_extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)
    # all_term_rankings.append(terms_matrix)

    context_extractor = \
        topic_model_creator.train_context_extractor(records, False)
    terms_matrix = get_topic_model_terms(
        context_extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)
    all_term_rankings.append(terms_matrix)

    sample_ratio = Constants.TOPIC_MODEL_STABILITY_SAMPLE_RATIO

    if metric in [TERM_STABILITY_PAIRWISE, TERM_DIFFERENCE]:
        sample_ratio = None
        Constants.update_properties(
            {Constants.TOPIC_MODEL_STABILITY_SAMPLE_RATIO_FIELD: sample_ratio})
        msg = 'Warning: Since the metric is \'%s\' I have updated the ' \
              'topic_model_stability_sample_ratio value to None' % metric
        print(msg)

    num_iterations = Constants.TOPIC_MODEL_STABILITY_ITERATIONS
    for i in range(num_iterations - 1):
        print('Iteration %d/%d' % (i+1, num_iterations))
        print('sample_ratio:', sample_ratio)

        if sample_ratio is None:
            sampled_records = records
        else:
            sampled_records = sample_list(records, sample_ratio)
        context_extractor = \
            topic_model_creator.train_context_extractor(sampled_records, False)
        terms_matrix = get_topic_model_terms(
            context_extractor, Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)
        all_term_rankings.append(terms_matrix)

    return all_term_rankings
def test():
    document_term_matrix = NmfTopicExtractor.load_document_term_matrix()

    results = []

    # my_list = range(2, 31)
    my_list = range(2, 61)

    for i in my_list:
        Constants.update_properties({Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: i})
        topic_model = NmfTopicExtractor()
        topic_model.load_trained_data()

        document_topic_matrix = topic_model.document_topic_matrix
        topic_term_matrix = topic_model.topic_term_matrix

        divergence = calculate_divergence(
            document_term_matrix, document_topic_matrix, topic_term_matrix)

        result = {
            'num_topics': Constants.TOPIC_MODEL_NUM_TOPICS,
            'divergence': divergence,
            Constants.TOPIC_MODEL_TYPE_FIELD: 'ensemble',
            Constants.BUSINESS_TYPE_FIELD: Constants.ITEM_TYPE
        }

        results.append(result)

        print('Num topics: %d, Divergence: %f' %
              (Constants.TOPIC_MODEL_NUM_TOPICS, divergence))

    for result in results:
        print('%d %f' % (result['num_topics'], result['divergence']))

    prefix = Constants.RESULTS_FOLDER + Constants.ITEM_TYPE +\
        '_topic_model_divergence'
    csv_file_path = prefix + '.csv'
    json_file_path = prefix + '.json'
    headers = sorted(results[0].keys())
    ETLUtils.save_csv_file(csv_file_path, results, headers)
    ETLUtils.save_json_file(json_file_path, results)
def run_eval_topic_model(metric):
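    """
    Invokes the external topic-ensemble evaluation script for the given
    metric in a subprocess, logging its output to a file, and returns the
    parsed CSV results together with the current topic model settings.
    """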

    parse_directory_command = Constants.TOPIC_ENSEMBLE_FOLDER + \
        'eval-' + metric.replace('_', '-') + '.py'

    csv_file = Constants.generate_file_name(
        metric, 'csv', BASE_FOLDER, None, None, True, True)

    dataset_file_name = Constants.generate_file_name(
        'topic_model', '', BASE_FOLDER, None, None, True, True)[:-1] +\
        '/ranks*.pkl'
    topic_model_files = glob.glob(dataset_file_name)

    command = [
        PYTHON_COMMAND,
        parse_directory_command,
    ]
    command.extend(topic_model_files)
    command.extend([
        '-o',
        csv_file
    ])

    print(' '.join(command))

    unique_id = uuid.uuid4().hex
    log_file_name = Constants.GENERATED_FOLDER + Constants.ITEM_TYPE + '_' + \
        Constants.TOPIC_MODEL_TARGET_REVIEWS + '_' + metric + '_' +\
        unique_id + '.log'
    #
    log_file = open(log_file_name, "w")
    p = subprocess.Popen(
        command, stdout=log_file, cwd=Constants.TOPIC_ENSEMBLE_FOLDER)
    p.wait()

    results = read_csv_first_column_as_key(csv_file, metric)
    results[Constants.TOPIC_MODEL_NUM_TOPICS_FIELD] =\
        Constants.TOPIC_MODEL_NUM_TOPICS
    results[Constants.TOPIC_MODEL_TYPE_FIELD] = Constants.TOPIC_MODEL_TYPE

    return results
    def load_document_term_matrix():
        topic_model_corpus_folder = \
            Constants.CACHE_FOLDER + 'topic_models/corpus/'
        corpus_path = Constants.generate_file_name(
            'topic_ensemble_corpus', '', topic_model_corpus_folder,
            None, None, False)[:-1] + '.pkl'

        document_term_matrix, _, _, _ = load_corpus(corpus_path)

        print("Loaded document-term matrix of size %s" % str(document_term_matrix.shape))

        return document_term_matrix
def create_topic_model(num_topics):
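    """
    Creates an ensemble topic model with the given number of topics (using
    shifted random seeds), skipping the work if the ensembled results file
    already exists.
    """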
    print('%s: evaluating topic model' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10,
        Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics
    })
    utilities.plant_seeds()
    Constants.print_properties()

    file_path = Constants.ENSEMBLED_RESULTS_FOLDER + \
                "factors_final_k%02d.pkl" % Constants.TOPIC_MODEL_NUM_TOPICS

    if os.path.exists(file_path):
        print('Ensemble topic model already exists')
        return

    # topic_ensemble_caller.run_local_parse_directory()
    topic_ensemble_caller.run_generate_kfold()
    topic_ensemble_caller.run_combine_nmf()
def create_single_topic_model(cycle_index, fold_index, check_exists=True):
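    """
    Creates the topic model for a single cross-validation fold: loads the
    processed records, splits them according to the cross-validation
    strategy, shuffles them once per cycle and trains a topic model on the
    training portion of the requested fold.
    """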

    Constants.print_properties()
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        msg = 'This function shouldn\'t be used when the ' \
              'separate_topic_model_recsys_reviews property is set to True'
        raise ValueError(msg)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    if Constants.CROSS_VALIDATION_STRATEGY == 'nested_test':
        pass
    elif Constants.CROSS_VALIDATION_STRATEGY == 'nested_validate':
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        cycle = Constants.NESTED_CROSS_VALIDATION_CYCLE
        split = 1 - (1 / float(num_folds))
        cv_start = float(cycle) / num_folds
        print('cv_start', cv_start)
        records, _ = ETLUtils.split_train_test(records, split, cv_start)
    else:
        raise ValueError('Unknown cross-validation strategy')

    utilities.plant_seeds()
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1/float(num_folds))

    for i in range(cycle_index+1):

        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

    cv_start = float(fold_index) / num_folds
    train_records, test_records = \
        ETLUtils.split_train_test(records, split=split, start=cv_start)
    return create_topic_model(
        train_records, cycle_index, fold_index, check_exists)
Example #50
def create_topic_model_with_context_records():
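    """
    Loads the classified processed reviews, prints the contextual and
    context-specific subsets, and then, for an increasing number of topics,
    trains a topic model on the records and exports the resulting topics to
    an Excel file.
    """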

    processed_records_file = Constants.generate_file_name(
        'classified_processed_reviews', 'json', Constants.CACHE_FOLDER, None,
        None, False, True)
    records = ETLUtils.load_json_file(processed_records_file)
    print('records length: %d' % len(records))

    context_records = ETLUtils.filter_records(records, 'context_type', ['context'])
    print('context records length: %d' % len(context_records))
    context_specific_records = ETLUtils.filter_records(context_records, 'predicted_class', ['specific'])
    print('context specific records length: %d' % len(context_specific_records))

    for i in range(len(context_specific_records)):
        # print('%d:\t%s' % (i, context_records[i]['text']))
        print('%d:\t%s' % (i, context_specific_records[i]['bow']))

    for i in range(1, len(context_records)+1):

        Constants.update_properties({Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: i})
        context_extractor = \
            topic_model_creator.create_topic_model(records, None, None)

        topic_data = []

        for topic in range(Constants.TOPIC_MODEL_NUM_TOPICS):
            result = {}
            result['topic_id'] = topic
            result.update(split_topic(context_extractor.print_topic_model(
                num_terms=Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)[topic]))
            result['ratio'] = context_extractor.topic_ratio_map[topic]
            result['weighted_frequency'] = \
                context_extractor.topic_weighted_frequency_map[topic]
            topic_data.append(result)

        file_name = Constants.generate_file_name(
            'manual_topic_model', 'xlsx', Constants.DATASET_FOLDER, None, None, True)
        generate_excel_file(topic_data, file_name)
Example #51
def analyze_results():
    json_file = Constants.generate_file_name(
        'carskit_results', 'json', OUTPUT_FOLDER, None, None, False)
    records = ETLUtils.load_json_file(json_file)

    data_frame = pandas.DataFrame(records)
    print(sorted(list(data_frame.columns.values)))
    cols = [
        'ck_rec10', 'ck_pre10', 'ck_algorithm', 'carskit_nominal_format',
        'topic_model_num_topics', 'topic_model_normalize']
    data_frame = data_frame[cols]
    data_frame = data_frame.sort_values(['ck_rec10'])
    print(data_frame)

    data_frame.to_csv('/Users/fpena/tmp/' + Constants.ITEM_TYPE + '_carskit.csv')
def evaluate_topic_model(metric):
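    """
    Builds the term rankings for the configured topic modeling algorithm
    (lda, nmf or ensemble) and evaluates them with the requested term
    stability or term difference metric.
    """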
    print('%s: evaluating topic model' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        num_records = len(records)
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    all_term_rankings = None
    topic_model_type = Constants.TOPIC_MODEL_TYPE
    if topic_model_type in ['lda', 'nmf']:
        all_term_rankings = create_all_term_rankings(records, metric)
    elif topic_model_type == 'ensemble':
        all_term_rankings = create_all_term_rankings_from_ensemble()
    else:
        raise ValueError(
            'Unrecognized topic modeling algorithm: \'%s\'' % topic_model_type)
    print('Total iterations: %d' % len(all_term_rankings))

    if metric == TERM_STABILITY_REFERENCE:
        return eval_term_stability_reference(all_term_rankings)
    if metric == TERM_STABILITY_PAIRWISE:
        return eval_term_stability_pairwise(all_term_rankings)
    elif metric == TERM_DIFFERENCE:
        return eval_term_difference(all_term_rankings)
    else:
        raise ValueError('Unknown evaluation metric: \'%s\'' % metric)
def run_recommender(args):
    import sys
    # sys.path.append('/Users/fpena/UCC/Thesis/projects/yelp/source/python')
    sys.path.append('/home/fpena/yelp/source/python')
    from utils.constants import Constants
    from topicmodeling.context import topic_model_analyzer

    print('\n\n************************\n************************\n')
    print('args', args)

    parameters = {
        Constants.BUSINESS_TYPE_FIELD: args[Constants.BUSINESS_TYPE_FIELD],
        # 'lda_alpha': args['lda_alpha'],
        # 'lda_beta': args['lda_beta'],
        Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD:
            args[Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD],
        Constants.TOPIC_MODEL_ITERATIONS_FIELD:
            int(args[Constants.TOPIC_MODEL_ITERATIONS_FIELD]),
        Constants.TOPIC_MODEL_PASSES_FIELD:
            int(args[Constants.TOPIC_MODEL_PASSES_FIELD]),
        Constants.TOPIC_MODEL_NUM_TOPICS_FIELD:
            int(args[Constants.TOPIC_MODEL_NUM_TOPICS_FIELD]),
        # 'topic_weighting_method': args['topic_weighting_method'],
        Constants.USE_CONTEXT_FIELD: args[Constants.USE_CONTEXT_FIELD]
    }

    Constants.update_properties(parameters)
    # Finish updating parameters

    results = topic_model_analyzer.export_topics()
    results['loss'] = -results['combined_score']
    results['status'] = 'ok'

    print('loss', results['loss'])

    return results
def get_topic_ensemble_ranks_file_paths():

    num_models = Constants.TOPIC_MODEL_STABILITY_ITERATIONS
    random_seeds = range(1, num_models + 1)

    suffix = 'ranks_ensemble_k%02d.pkl' % Constants.TOPIC_MODEL_NUM_TOPICS

    file_paths = []

    for seed in random_seeds:
        prefix = 'topic_model_seed-' + str(seed)
        topic_model_folder = Constants.generate_file_name(
            prefix, '', Constants.ENSEMBLE_FOLDER, None, None, True, True)[:-1]
        topic_model_file = topic_model_folder + '/' + suffix
        file_paths.append(topic_model_file)

    return file_paths
def create_topic_model(records, cycle_index, fold_index, check_exists=True):
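    """
    Trains a context extractor (topic model) on the given records and pickles
    it to the cache folder, returning the cached model instead if it already
    exists and check_exists is True.
    """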

    print('%s: Create topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    topic_model_file_path = \
        Constants.generate_file_name(
            'topic_model', 'pkl', Constants.CACHE_FOLDER,
            cycle_index, fold_index, True)

    print(topic_model_file_path)

    if check_exists and os.path.exists(topic_model_file_path):
        print('WARNING: Topic model already exists')
        return load_topic_model(cycle_index, fold_index)

    topic_model = train_context_extractor(records)

    with open(topic_model_file_path, 'wb') as write_file:
        pickle.dump(topic_model, write_file, pickle.HIGHEST_PROTOCOL)

    return topic_model
    def export_records(self):
        print('%s: exporting transformed records' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        records_to_export = []
        desired_fields = [
            Constants.USER_INTEGER_ID_FIELD,
            Constants.ITEM_INTEGER_ID_FIELD,
            Constants.RATING_FIELD,
            Constants.CONTEXT_FIELD,
        ]

        for record in self.records:
            new_record = {field: record[field] for field in desired_fields}
            records_to_export.append(new_record)

        file_name = Constants.generate_file_name(
            'recsys_formatted_context_records', 'json', Constants.CACHE_FOLDER,
            None, None, True, True, uses_carskit=False, normalize_topics=True,
            format_context=True)
        ETLUtils.save_json_file(file_name, records_to_export)