Example #1
def cli_main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-t', '--numtopics', metavar='int', type=int,
        nargs=1, help='The number of topics of the topic model')

    args = parser.parse_args()
    num_topics = args.numtopics[0] if args.numtopics is not None else None

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})

    results = Constants.get_properties_copy()
    results.update(analyze_topics(include_stability=True))

    csv_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'csv', Constants.RESULTS_FOLDER, None, None,
        False)
    json_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'json', Constants.RESULTS_FOLDER, None, None,
        False)

    write_results_to_csv(csv_file_name, results)
    write_results_to_json(json_file_name, results)
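
Note: the Constants.generate_file_name implementation itself is not shown on
this page. Judging only from these call sites, it takes a base name, an
extension, a folder, optional cycle and fold indices, and one or more boolean
flags; several callers pass an empty extension and strip a trailing dot with
[:-1]. The following is a minimal sketch of a compatible function under those
assumptions, not the project's actual code:

import os


def generate_file_name(name, extension, folder, cycle_index, fold_index,
                       uses_context, *args, **kwargs):
    # Hypothetical reconstruction: join the base name with the optional
    # cycle/fold indices and append the extension. The real implementation
    # presumably also encodes flags such as uses_carskit, normalize_topics
    # and format_context into the name.
    parts = [name]
    if cycle_index is not None:
        parts.append('cycle-%d' % cycle_index)
    if fold_index is not None:
        parts.append('fold-%d' % fold_index)
    return os.path.join(folder, '_'.join(parts) + '.' + extension)


print(generate_file_name(
    'topic_model_analysis', 'csv', '/tmp/results', None, None, False))
# /tmp/results/topic_model_analysis.csv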
Example #2
def manual_main():

    csv_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'csv', Constants.RESULTS_FOLDER, None, None,
        False)
    json_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'json', Constants.RESULTS_FOLDER, None, None,
        False)
    print(json_file_name)
    print(csv_file_name)

    num_topics_list = [Constants.TOPIC_MODEL_NUM_TOPICS]
    num_cycles = len(num_topics_list)
    cycle_index = 1
    for num_topics in num_topics_list:
        print('\ncycle_index: %d/%d' % (cycle_index, num_cycles))
        new_dict = {
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
        }

        print(new_dict)

        Constants.update_properties(new_dict)
        results = Constants.get_properties_copy()
        results.update(analyze_topics(include_stability=False))

        write_results_to_csv(csv_file_name, results)
        write_results_to_json(json_file_name, results)

        cycle_index += 1
Example #3
def manual_main():

    csv_file_name = Constants.generate_file_name('topic_model_analysis', 'csv',
                                                 Constants.RESULTS_FOLDER,
                                                 None, None, False)
    json_file_name = Constants.generate_file_name('topic_model_analysis',
                                                  'json',
                                                  Constants.RESULTS_FOLDER,
                                                  None, None, False)
    print(json_file_name)
    print(csv_file_name)

    num_topics_list = [Constants.TOPIC_MODEL_NUM_TOPICS]
    num_cycles = len(num_topics_list)
    cycle_index = 1
    for num_topics in num_topics_list:
        print('\ncycle_index: %d/%d' % (cycle_index, num_cycles))
        new_dict = {
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
        }

        print(new_dict)

        Constants.update_properties(new_dict)
        results = Constants.get_properties_copy()
        results.update(analyze_topics(include_stability=False))

        write_results_to_csv(csv_file_name, results)
        write_results_to_json(json_file_name, results)

        cycle_index += 1
Example #4
def cli_main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-t',
                        '--numtopics',
                        metavar='int',
                        type=int,
                        nargs=1,
                        help='The number of topics of the topic model')

    args = parser.parse_args()
    num_topics = args.numtopics[0] if args.numtopics is not None else None

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})

    results = Constants.get_properties_copy()
    results.update(analyze_topics(include_stability=True))

    csv_file_name = Constants.generate_file_name('topic_model_analysis', 'csv',
                                                 Constants.RESULTS_FOLDER,
                                                 None, None, False)
    json_file_name = Constants.generate_file_name('topic_model_analysis',
                                                  'json',
                                                  Constants.RESULTS_FOLDER,
                                                  None, None, False)

    write_results_to_csv(csv_file_name, results)
    write_results_to_json(json_file_name, results)
Example #5
    def count_frequencies(self):
        """
        Counts the number of reviews each user and item have and stores the
        results in two separate files, one for the users and another one for the
        items. Note that the integer IDs are used and not the original user and
        item IDs
        """
        print('%s: count frequencies' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        user_frequency_map = ETLUtils.count_frequency(
            self.records, Constants.USER_INTEGER_ID_FIELD)
        item_frequency_map = ETLUtils.count_frequency(
            self.records, Constants.ITEM_INTEGER_ID_FIELD)

        user_frequency_file = Constants.generate_file_name(
            'user_frequency_map', 'json', Constants.CACHE_FOLDER, None, None,
            False
        )
        item_frequency_file = Constants.generate_file_name(
            'item_frequency_map', 'json', Constants.CACHE_FOLDER, None, None,
            False
        )

        ETLUtils.save_json_file(user_frequency_file, [user_frequency_map])
        ETLUtils.save_json_file(item_frequency_file, [item_frequency_map])
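
Note: ETLUtils.count_frequency is not shown on this page. From its usage it
plausibly tallies how many records share each value of a field; a
hypothetical sketch with collections.Counter:

from collections import Counter


def count_frequency(records, field):
    # Hypothetical reconstruction: map each value of `field` to the number
    # of records that carry it, e.g. reviews per user_integer_id.
    return dict(Counter(record[field] for record in records))


records = [{'user_integer_id': 1}, {'user_integer_id': 1},
           {'user_integer_id': 2}]
print(count_frequency(records, 'user_integer_id'))  # {1: 2, 2: 1}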
Example #6
    def export_records(self):
        print('%s: exporting transformed records' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        records_to_export = []
        desired_fields = [
            Constants.USER_INTEGER_ID_FIELD,
            Constants.ITEM_INTEGER_ID_FIELD,
            Constants.RATING_FIELD,
            Constants.CONTEXT_FIELD,
        ]

        for record in self.records:
            new_record = {field: record[field] for field in desired_fields}
            records_to_export.append(new_record)

        file_name = Constants.generate_file_name(
            'recsys_formatted_context_records',
            'json',
            Constants.CACHE_FOLDER,
            None,
            None,
            True,
            True,
            uses_carskit=False,
            normalize_topics=True,
            format_context=True)
        ETLUtils.save_json_file(file_name, records_to_export)
Example #7
def load_pipeline():

    best_hyperparams_file_name = Constants.generate_file_name(
        'best_hyperparameters', 'json', Constants.CACHE_FOLDER, None,
        None, False)

    if not os.path.exists(best_hyperparams_file_name):
        print('Best hyperparameters file does not exist, running full cycle')
        full_cycle()

    with open(best_hyperparams_file_name, 'r') as json_file:
        file_contents = json_file.read()
        parameters = json.loads(file_contents)

        print(parameters)

        classifiers = {
            'logisticregression': LogisticRegression(),
            'svc': SVC(),
            'kneighborsclassifier': KNeighborsClassifier(),
            'decisiontreeclassifier': DecisionTreeClassifier(),
            'nusvc': NuSVC(),
            'randomforestclassifier': RandomForestClassifier()
        }

        classifier = classifiers[parameters['classifier'].lower()]
        # print(classifier)
        classifier_params = get_classifier_params(parameters)
        classifier.set_params(**classifier_params)
        print(classifier)

        resampler = sampler_factory.create_sampler(
            parameters['resampler'], Constants.DOCUMENT_CLASSIFIER_SEED)

        return Pipeline([('resampler', resampler), ('classifier', classifier)])
Example #8
def load_pipeline():

    best_hyperparams_file_name = Constants.generate_file_name(
        'best_hyperparameters', 'json', Constants.CACHE_FOLDER, None, None,
        False)

    if not os.path.exists(best_hyperparams_file_name):
        print('Best hyperparameters file does not exist, running full cycle')
        full_cycle()

    with open(best_hyperparams_file_name, 'r') as json_file:
        file_contents = json_file.read()
        parameters = json.loads(file_contents)

        print(parameters)

        classifiers = {
            'logisticregression': LogisticRegression(),
            'svc': SVC(),
            'kneighborsclassifier': KNeighborsClassifier(),
            'decisiontreeclassifier': DecisionTreeClassifier(),
            'nusvc': NuSVC(),
            'randomforestclassifier': RandomForestClassifier()
        }

        classifier = classifiers[parameters['classifier'].lower()]
        # print(classifier)
        classifier_params = get_classifier_params(parameters)
        classifier.set_params(**classifier_params)
        print(classifier)

        resampler = sampler_factory.create_sampler(
            parameters['resampler'], Constants.DOCUMENT_CLASSIFIER_SEED)

        return Pipeline([('resampler', resampler), ('classifier', classifier)])
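
Note: because the pipeline contains a resampler step, Pipeline here is
presumably imblearn.pipeline.Pipeline rather than scikit-learn's, which does
not support resampling steps. A self-contained sketch of the same
resampler-plus-classifier pattern on synthetic data, under that assumption:

import numpy as np
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Synthetic, imbalanced toy data standing in for the document features.
x_matrix = np.random.rand(100, 5)
y_vector = np.array([0] * 90 + [1] * 10)

pipeline = Pipeline([
    ('resampler', RandomOverSampler(random_state=0)),
    ('classifier', LogisticRegression()),
])
pipeline.fit(x_matrix, y_vector)
print(pipeline.predict(x_matrix[:5]))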
Example #9
def full_cycle(metric):
    csv_file_name = Constants.generate_file_name(
        metric, 'csv', Constants.RESULTS_FOLDER, None,
        None, False)
    json_file_name = Constants.generate_file_name(
        metric, 'json', Constants.RESULTS_FOLDER, None,
        None, False)
    print(json_file_name)
    print(csv_file_name)

    properties = Constants.get_properties_copy()
    results = evaluate_topic_model(metric)
    print(results)
    results.update(properties)

    ETLUtils.write_row_to_csv(csv_file_name, results)
    ETLUtils.write_row_to_json(json_file_name, results)
Example #10
def full_cycle(metric):
    csv_file_name = Constants.generate_file_name(metric, 'csv',
                                                 Constants.RESULTS_FOLDER,
                                                 None, None, False)
    json_file_name = Constants.generate_file_name(metric, 'json',
                                                  Constants.RESULTS_FOLDER,
                                                  None, None, False)
    print(json_file_name)
    print(csv_file_name)

    properties = Constants.get_properties_copy()
    results = evaluate_topic_model(metric)
    print(results)
    results.update(properties)

    ETLUtils.write_row_to_csv(csv_file_name, results)
    ETLUtils.write_row_to_json(json_file_name, results)
Example #11
def get_topic_model_prefix(folder='', seed=None):

    prefix = 'topic_model'
    if seed is not None:
        prefix += '_seed-' + str(seed)

    return Constants.generate_file_name(
        prefix, '', folder, None, None, True, True)[:-1]
Example #12
def get_topic_model_prefix(folder='', seed=None):

    prefix = 'topic_model'
    if seed is not None:
        prefix += '_seed-' + str(seed)

    return Constants.generate_file_name(prefix, '', folder, None, None, True,
                                        True)[:-1]
Example #13
def load_topic_model(cycle_index, fold_index):
    file_path = \
        Constants.generate_file_name(
            'topic_model', 'pkl', Constants.CACHE_FOLDER,
            cycle_index, fold_index, True)
    print(file_path)
    with open(file_path, 'rb') as read_file:
        topic_model = pickle.load(read_file)
    return topic_model
Example #14
def full_cycle():

    plant_random_seeds()
    my_records = load_records()
    preprocess_records(my_records)
    x_matrix, y_vector = transform(my_records)
    count_specific_generic(my_records)

    # Error estimation
    best_classifier = None
    best_score = 0.0
    for classifier, params in PARAM_GRID_MAP.items():
        # print('Classifier: %s' % classifier)
        cv = StratifiedKFold(Constants.CROSS_VALIDATION_NUM_FOLDS)
        score = error_estimation(x_matrix, y_vector, params, cv,
                                 SCORE_METRIC).mean()
        print('%s score: %f' % (classifier, score))

        if score > best_score:
            best_score = score
            best_classifier = classifier

    # Model selection
    cv = StratifiedKFold(Constants.CROSS_VALIDATION_NUM_FOLDS)
    grid_search_cv = model_selection(x_matrix, y_vector,
                                     PARAM_GRID_MAP[best_classifier], cv,
                                     SCORE_METRIC)
    # best_model = grid_search_cv.best_estimator_.get_params()['classifier']
    # features_importance = best_model.coef_
    print('%s: %f' % (SCORE_METRIC, grid_search_cv.best_score_))
    print('best params', grid_search_cv.best_params_)

    # for key, value in grid_search_cv.best_params_.items():
    #     print(key, value)

    # print('best estimator', grid_search_cv.best_estimator_)
    # print('features importance', features_importance)

    # csv_file_name = Constants.generate_file_name(
    #     'classifier_results', 'csv', Constants.RESULTS_FOLDER, None,
    #     None, False)
    # json_file_name = Constants.generate_file_name(
    #     'classifier_results', 'json', Constants.RESULTS_FOLDER, None,
    #     None, False)

    # results = get_scores(final_grid_search_cv.cv_results_)
    # csv_file = '/Users/fpena/tmp/' + Constants.ITEM_TYPE + '_new_reviews_classifier_results.csv'
    # ETLUtils.save_csv_file(csv_file, results, results[0].keys())
    #
    # print(csv_file)

    best_hyperparams_file_name = Constants.generate_file_name(
        'best_hyperparameters', 'json', Constants.CACHE_FOLDER, None, None,
        False)
    save_parameters(best_hyperparams_file_name, grid_search_cv.best_params_)
Example #15
def cycle_eval_topic_model(metric, num_topics_list):

    csv_file_name = Constants.generate_file_name(metric, 'csv',
                                                 Constants.RESULTS_FOLDER,
                                                 None, None, False)

    for topic in num_topics_list:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic})
        results = run_eval_topic_model(metric)
        topic_model_analyzer.write_results_to_csv(csv_file_name, results)
Example #16
def cycle_eval_topic_model(metric, num_topics_list):

    csv_file_name = Constants.generate_file_name(
        metric, 'csv', Constants.RESULTS_FOLDER, None, None,
        False)

    for topic in num_topics_list:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic})
        results = run_eval_topic_model(metric)
        topic_model_analyzer.write_results_to_csv(csv_file_name, results)
Example #17
def run_eval_topic_model(metric):

    parse_directory_command = Constants.TOPIC_ENSEMBLE_FOLDER + \
        'eval-' + metric.replace('_', '-') + '.py'

    csv_file = Constants.generate_file_name(
        metric, 'csv', BASE_FOLDER, None, None, True, True)

    dataset_file_name = Constants.generate_file_name(
        'topic_model', '', BASE_FOLDER, None, None, True, True)[:-1] +\
        '/ranks*.pkl'
    topic_model_files = glob.glob(dataset_file_name)

    command = [
        PYTHON_COMMAND,
        parse_directory_command,
    ]
    command.extend(topic_model_files)
    command.extend([
        '-o',
        csv_file
    ])

    print(' '.join(command))

    unique_id = uuid.uuid4().hex
    log_file_name = Constants.GENERATED_FOLDER + Constants.ITEM_TYPE + '_' + \
        Constants.TOPIC_MODEL_TARGET_REVIEWS + '_' + metric + '_' +\
        unique_id + '.log'
    # Use a context manager so the log file handle is closed once the
    # subprocess finishes.
    with open(log_file_name, "w") as log_file:
        p = subprocess.Popen(
            command, stdout=log_file, cwd=Constants.TOPIC_ENSEMBLE_FOLDER)
        p.wait()

    results = read_csv_first_column_as_key(csv_file, metric)
    results[Constants.TOPIC_MODEL_NUM_TOPICS_FIELD] =\
        Constants.TOPIC_MODEL_NUM_TOPICS
    results[Constants.TOPIC_MODEL_TYPE_FIELD] = Constants.TOPIC_MODEL_TYPE

    return results
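
Note: read_csv_first_column_as_key is another helper that is not shown. From
the way its output is consumed (e.g. the 'term_stability_pairwise_mean'
column used in the plotting example further down), it appears to turn each
CSV row into a '<metric>_<first column>' key. A hypothetical sketch:

import csv


def read_csv_first_column_as_key(csv_file, metric):
    # Hypothetical reconstruction: prefix each row's first-column label
    # with the metric name and map it to the row's value.
    results = {}
    with open(csv_file, 'r') as input_file:
        for row in csv.reader(input_file):
            results[metric + '_' + row[0]] = row[1]
    return results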
Example #18
    def load_document_term_matrix():
        topic_model_corpus_folder = \
            Constants.CACHE_FOLDER + 'topic_models/corpus/'
        corpus_path = Constants.generate_file_name(
            'topic_ensemble_corpus', '', topic_model_corpus_folder,
            None, None, False)[:-1] + '.pkl'

        document_term_matrix, _, _, _ = load_corpus(corpus_path)

        print("Loaded document-term matrix of size %s" % str(document_term_matrix.shape))

        return document_term_matrix
Example #19
    def separate_recsys_topic_model_records(self):

        print('%s: separate_recsys_topic_model_records' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        num_records = len(self.records)
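        # Python 2 semantics: `/` on two ints is integer division, so these
        # slices split the records in half; Python 3 would need `//`.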
        topic_model_records = self.records[:num_records / 2]

        if not Constants.USE_CONTEXT:
            recsys_records = self.records[num_records / 2:]

            file_name = \
                Constants.generate_file_name(
                    'recsys_contextual_records', 'json', Constants.CACHE_FOLDER,
                    None, None, False, True)

            print('Records without context file: %s' % file_name)

            for record in recsys_records:
                record[Constants.CONTEXT_TOPICS_FIELD] = {'na': 1.0}

            ETLUtils.save_json_file(file_name, recsys_records)
            return

        topic_model_creator.train_topic_model(topic_model_records)

        if os.path.exists(Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE):
            print('Recsys topic records have already been generated')
            recsys_records = ETLUtils.load_json_file(
                Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)
        else:
            recsys_records = self.records[num_records / 2:]
            self.find_topic_distribution(recsys_records)
            ETLUtils.save_json_file(
                Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE, recsys_records)

        if os.path.exists(Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE):
            print('Recsys contextual records have already been generated')
            print(Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE)
            recsys_records = ETLUtils.load_json_file(
                Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE)
        else:
            self.update_context_topics(recsys_records)
            ETLUtils.save_json_file(
                Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE,
                recsys_records
            )

        context_transformer = ContextTransformer(recsys_records)
        context_transformer.load_data()
        context_transformer.transform_records()
        context_transformer.export_records()
Example #20
def run_eval_topic_model(metric):

    parse_directory_command = Constants.TOPIC_ENSEMBLE_FOLDER + \
        'eval-' + metric.replace('_', '-') + '.py'

    csv_file = Constants.generate_file_name(metric, 'csv', BASE_FOLDER, None,
                                            None, True, True)

    dataset_file_name = Constants.generate_file_name(
        'topic_model', '', BASE_FOLDER, None, None, True, True)[:-1] +\
        '/ranks*.pkl'
    topic_model_files = glob.glob(dataset_file_name)

    command = [
        PYTHON_COMMAND,
        parse_directory_command,
    ]
    command.extend(topic_model_files)
    command.extend(['-o', csv_file])

    print(' '.join(command))

    unique_id = uuid.uuid4().hex
    log_file_name = Constants.GENERATED_FOLDER + Constants.ITEM_TYPE + '_' + \
        Constants.TOPIC_MODEL_TARGET_REVIEWS + '_' + metric + '_' +\
        unique_id + '.log'
    # Use a context manager so the log file handle is closed once the
    # subprocess finishes.
    with open(log_file_name, "w") as log_file:
        p = subprocess.Popen(command,
                             stdout=log_file,
                             cwd=Constants.TOPIC_ENSEMBLE_FOLDER)
        p.wait()

    results = read_csv_first_column_as_key(csv_file, metric)
    results[Constants.TOPIC_MODEL_NUM_TOPICS_FIELD] =\
        Constants.TOPIC_MODEL_NUM_TOPICS
    results[Constants.TOPIC_MODEL_TYPE_FIELD] = Constants.TOPIC_MODEL_TYPE

    return results
Example #21
    def load_document_term_matrix():
        topic_model_corpus_folder = \
            Constants.CACHE_FOLDER + 'topic_models/corpus/'
        corpus_path = Constants.generate_file_name(
            'topic_ensemble_corpus', '', topic_model_corpus_folder, None, None,
            False)[:-1] + '.pkl'

        document_term_matrix, _, _, _ = load_corpus(corpus_path)

        print("Loaded document-term matrix of size %s" %
              str(document_term_matrix.shape))

        return document_term_matrix
Example #22
def create_topic_model_with_context_records():

    processed_records_file = Constants.generate_file_name(
        'classified_processed_reviews', 'json', Constants.CACHE_FOLDER, None,
        None, False, True)
    records = ETLUtils.load_json_file(processed_records_file)
    print('records length: %d' % len(records))

    context_records = ETLUtils.filter_records(
        records, 'context_type', ['context'])
    print('context records length: %d' % len(context_records))
    context_specific_records = ETLUtils.filter_records(
        context_records, 'predicted_class', ['specific'])
    print('context specific records length: %d' %
          len(context_specific_records))

    for i in range(len(context_specific_records)):
        # print('%d:\t%s' % (i, context_records[i]['text']))
        print('%d:\t%s' % (i, context_specific_records[i]['bow']))

    for i in range(1, len(context_records)+1):

        Constants.update_properties({Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: i})
        context_extractor = \
            topic_model_creator.create_topic_model(records, None, None)

        topic_data = []

        for topic in range(Constants.TOPIC_MODEL_NUM_TOPICS):
            result = {}
            result['topic_id'] = topic
            result.update(split_topic(context_extractor.print_topic_model(
                num_terms=Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)[topic]))
            result['ratio'] = context_extractor.topic_ratio_map[topic]
            result['weighted_frequency'] = \
                context_extractor.topic_weighted_frequency_map[topic]
            topic_data.append(result)

        file_name = Constants.generate_file_name(
            'manual_topic_model', 'xlsx', Constants.DATASET_FOLDER, None,
            None, True)
        generate_excel_file(topic_data, file_name)
Example #23
def plot_ats_score():
    # metric = 'term_difference'
    metric = 'term_stability_pairwise'

    csv_file_name = Constants.generate_file_name(metric, 'csv',
                                                 Constants.RESULTS_FOLDER,
                                                 None, None, False)
    json_file_name = Constants.generate_file_name(metric, 'json',
                                                  Constants.RESULTS_FOLDER,
                                                  None, None, False)

    data_frame = pandas.read_csv(csv_file_name)
    stability_column = 'term_stability_pairwise_mean'
    topic_model_column = 'Topic modeling algorithm'
    num_topics_field = Constants.TOPIC_MODEL_NUM_TOPICS_FIELD

    data_frame.rename(columns={'topic_model_type': topic_model_column},
                      inplace=True)
    data_frame[topic_model_column] = data_frame[topic_model_column].map({
        'lda':
        'LDA',
        'nmf':
        'NMF',
        'ensemble':
        'Ensemble'
    })

    g = seaborn.barplot(x=num_topics_field,
                        y=stability_column,
                        hue=topic_model_column,
                        data=data_frame)
    g.set(xlabel='Number of topics', ylabel='ATS')
    plt.ylim(0, 1.18)
    # g.ylim(10, 40)

    output_folder = Constants.RESULTS_FOLDER + 'pdf/'
    file_name = output_folder + Constants.ITEM_TYPE + '_ats.pdf'
    g.figure.savefig(file_name)
Example #24
def main():
    print('%s: Making predictions with LibFM' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    prediction_type_map = {
        'user_test': 'rating',
        'test_items': 'rating',
        'rel_plus_n': 'ranking'
    }
    prediction_type = prediction_type_map[Constants.RIVAL_EVALUATION_STRATEGY]
    use_cache = True

    libfm_ratings_fold_folder = Constants.generate_file_name(
        'recsys_formatted_context_records',
        '',
        Constants.CACHE_FOLDER + 'rival/',
        None,
        None,
        True,
        True,
        uses_carskit=False,
        normalize_topics=True,
        format_context=True)[:-1] + '/fold_%d/'

    for fold in range(Constants.CROSS_VALIDATION_NUM_FOLDS):

        ratings_fold_folder = libfm_ratings_fold_folder % fold
        # ratings_fold_folder = Constants.CACHE_FOLDER + 'rival/contextaa/fold_%d/' % fold
        train_file = ratings_fold_folder + 'libfm_train.libfm'
        predictions_file = ratings_fold_folder + 'libfm_predictions_' + \
            prediction_type + '.libfm'
        fm_num_factors = Constants.FM_NUM_FACTORS
        results_file = ratings_fold_folder + 'libfm_results_' + \
            prediction_type + '_fmfactors-' + str(fm_num_factors) + '.txt'

        if use_cache and os.path.exists(results_file):
            print("Fold %d file already exists ('%s') " % (fold, results_file))
            continue

        # predictions_file = ratings_fold_folder + 'libfm_test.libfm'
        # results_file = ratings_fold_folder + 'libfm_predictions.txt'
        log_file = ratings_fold_folder + 'libfm_log.txt'
        save_file = ratings_fold_folder + 'libfm_model.txt'

        if not os.path.exists(ratings_fold_folder):
            os.makedirs(ratings_fold_folder)

        run_libfm(train_file, predictions_file, results_file, log_file,
                  save_file)
Example #25
    def train_topic_model(self, cycle_index, fold_index):

        context_extractor = topic_model_creator.create_topic_model(
            self.train_records, cycle_index, fold_index)
        self.context_rich_topics = context_extractor.context_rich_topics

        topics_file_path = Constants.generate_file_name(
            'context_topics', 'json', Constants.CACHE_FOLDER, cycle_index,
            fold_index, True)
        ETLUtils.save_json_file(topics_file_path,
                                [dict(self.context_rich_topics)])
        print('Trained Context Extractor: %s' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        return context_extractor
Example #26
    def load_context_reviews(self, cycle_index, fold_index):

        train_records_file_path = Constants.generate_file_name(
            'context_train_records', 'json', Constants.CACHE_FOLDER,
            cycle_index, fold_index, True)
        important_records_file_path = Constants.generate_file_name(
            'context_important_records', 'json', Constants.CACHE_FOLDER,
            cycle_index, fold_index, True)

        self.train_records = ETLUtils.load_json_file(train_records_file_path)
        self.important_records = \
            ETLUtils.load_json_file(important_records_file_path)
        self.load_cache_context_topics(cycle_index, fold_index)

        self.context_topics_map = {}
        for record in self.important_records:
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
                record[Constants.CONTEXT_TOPICS_FIELD]

        # self.train_records = self.filter_context_words(self.train_records)
        # self.print_context_topics(self.important_records)

        self.important_records = None
        gc.collect()
Example #27
def analyze_results():
    json_file = Constants.generate_file_name(
        'carskit_results', 'json', OUTPUT_FOLDER, None, None, False)
    records = ETLUtils.load_json_file(json_file)

    data_frame = pandas.DataFrame(records)
    print(sorted(list(data_frame.columns.values)))
    cols = [
        'ck_rec10', 'ck_pre10', 'ck_algorithm', 'carskit_nominal_format',
        'topic_model_num_topics', 'topic_model_normalize']
    data_frame = data_frame[cols]
    data_frame = data_frame.sort_values(['ck_rec10'])
    print(data_frame)

    data_frame.to_csv(
        '/Users/fpena/tmp/' + Constants.ITEM_TYPE + '_carskit.csv')
Example #28
def get_topic_ensemble_ranks_file_paths():

    num_models = Constants.TOPIC_MODEL_STABILITY_ITERATIONS
    random_seeds = range(1, num_models + 1)

    suffix = 'ranks_ensemble_k%02d.pkl' % Constants.TOPIC_MODEL_NUM_TOPICS

    file_paths = []

    for seed in random_seeds:
        prefix = 'topic_model_seed-' + str(seed)
        topic_model_folder = Constants.generate_file_name(
            prefix, '', Constants.ENSEMBLE_FOLDER, None, None, True, True)[:-1]
        topic_model_file = topic_model_folder + '/' + suffix
        file_paths.append(topic_model_file)

    return file_paths
Example #29
def save_results(results):

    """
    Takes the results given by the run_carskit function, extends them with
    the Constants.get_properties() dictionary and saves them to a JSON file.

    :type results: list[dict]
    :param results: the list of result dictionaries to extend and save
    """
    properties = Constants.get_properties_copy()

    json_file = Constants.generate_file_name('carskit_results', 'json',
                                             OUTPUT_FOLDER, None, None, False)

    for result in results:
        result.update(properties)
        write_results_to_json(json_file, result)
Example #30
    def load_cache_context_topics(self, cycle_index, fold_index):

        print('load cache context topics: %s' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        topics_file_path = Constants.generate_file_name(
            'context_topics', 'json', Constants.CACHE_FOLDER, cycle_index,
            fold_index, True)

        self.context_rich_topics = sorted(
            ETLUtils.load_json_file(topics_file_path)[0].items(),
            key=operator.itemgetter(1),
            reverse=True)

        self.context_topics_map = {}
        for record in self.important_records:
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
                record[Constants.CONTEXT_TOPICS_FIELD]
Example #31
def save_results(results):

    """
    Takes the results given by the run_carskit function, extends them with
    the Constants.get_properties() dictionary and saves them to a JSON file.

    :type results: list[dict]
    :param results: the list of result dictionaries to extend and save
    """
    properties = Constants.get_properties_copy()

    json_file = Constants.generate_file_name(
        'carskit_results', 'json', OUTPUT_FOLDER, None, None, False)

    for result in results:
        result.update(properties)
        write_results_to_json(json_file, result)
Example #32
def create_topic_model(records, cycle_index, fold_index, check_exists=True):

    print('%s: Create topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    topic_model_file_path = \
        Constants.generate_file_name(
            'topic_model', 'pkl', Constants.CACHE_FOLDER,
            cycle_index, fold_index, True)

    print(topic_model_file_path)

    if check_exists and os.path.exists(topic_model_file_path):
        print('WARNING: Topic model already exists')
        return load_topic_model(cycle_index, fold_index)

    topic_model = train_context_extractor(records)

    with open(topic_model_file_path, 'wb') as write_file:
        pickle.dump(topic_model, write_file, pickle.HIGHEST_PROTOCOL)

    return topic_model
Example #33
    def export_records(self):
        print('%s: exporting transformed records' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        records_to_export = []
        desired_fields = [
            Constants.USER_INTEGER_ID_FIELD,
            Constants.ITEM_INTEGER_ID_FIELD,
            Constants.RATING_FIELD,
            Constants.CONTEXT_FIELD,
        ]

        for record in self.records:
            new_record = {field: record[field] for field in desired_fields}
            records_to_export.append(new_record)

        file_name = Constants.generate_file_name(
            'recsys_formatted_context_records', 'json', Constants.CACHE_FOLDER,
            None, None, True, True, uses_carskit=False, normalize_topics=True,
            format_context=True)
        ETLUtils.save_json_file(file_name, records_to_export)
Example #34
def train_topic_model(records):
    print('%s: train topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if Constants.TOPIC_MODEL_TYPE == 'lda':

        topic_model_file_path = \
            Constants.generate_file_name(
                'topic_model', 'pkl', Constants.CACHE_FOLDER, None, None, True)
        if os.path.exists(topic_model_file_path):
            print('WARNING: Topic model already exists')
            return

        corpus = \
            [record[Constants.CORPUS_FIELD] for record in records]
        dictionary = corpora.Dictionary.load(Constants.DICTIONARY_FILE)
        topic_model = ldamodel.LdaModel(
            corpus, id2word=dictionary,
            num_topics=Constants.TOPIC_MODEL_NUM_TOPICS,
            passes=Constants.TOPIC_MODEL_PASSES,
            iterations=Constants.TOPIC_MODEL_ITERATIONS)

        with open(topic_model_file_path, 'wb') as write_file:
            pickle.dump(topic_model, write_file, pickle.HIGHEST_PROTOCOL)

    elif Constants.TOPIC_MODEL_TYPE == 'ensemble':
        file_path = Constants.ENSEMBLED_RESULTS_FOLDER + \
                    "factors_final_k%02d.pkl" % Constants.TOPIC_MODEL_NUM_TOPICS

        if os.path.exists(file_path):
            print('Ensemble topic model already exists')
            return

        export_to_text(records)
        topic_ensemble_caller.run_local_parse_directory()
        topic_ensemble_caller.run_generate_kfold()
        topic_ensemble_caller.run_combine_nmf()

    else:
        raise ValueError('Unrecognized topic modeling algorithm: \'%s\'' %
                         Constants.TOPIC_MODEL_TYPE)
Example #35
    def load_trained_data(self):

        file_path = Constants.ENSEMBLED_RESULTS_FOLDER + \
            "factors_final_k%02d.pkl" % self.num_topics
        W, H, doc_ids, terms = load_nmf_factors(file_path)
        self.topic_term_matrix = H
        self.document_topic_matrix = W
        self.terms = terms

        topic_model_corpus_folder = \
            Constants.CACHE_FOLDER + 'topic_models/corpus/'
        tfidf_file_path = Constants.generate_file_name(
            'topic_ensemble_corpus', '', topic_model_corpus_folder, None, None,
            False)[:-1] + '_tfidf.pkl'

        self.tfidf_vectorizer = load_tfidf(tfidf_file_path)

        # print('tfidf vectorizer', self.tfidf_vectorizer)

        print "Loaded factor W of size %s and factor H of size %s" % (str(
            self.document_topic_matrix.shape), str(
                self.topic_term_matrix.shape))
Example #36
    def load_trained_data(self):

        file_path = Constants.ENSEMBLED_RESULTS_FOLDER + \
            "factors_final_k%02d.pkl" % self.num_topics
        W, H, doc_ids, terms = load_nmf_factors(file_path)
        self.topic_term_matrix = H
        self.document_topic_matrix = W
        self.terms = terms

        topic_model_corpus_folder = \
            Constants.CACHE_FOLDER + 'topic_models/corpus/'
        tfidf_file_path = Constants.generate_file_name(
            'topic_ensemble_corpus', '', topic_model_corpus_folder,
            None, None, False)[:-1] + '_tfidf.pkl'

        self.tfidf_vectorizer = load_tfidf(tfidf_file_path)

        # print('tfidf vectorizer', self.tfidf_vectorizer)

        print "Loaded factor W of size %s and factor H of size %s" % (
            str(self.document_topic_matrix.shape),
            str(self.topic_term_matrix.shape)
        )
Example #37
    def find_reviews_topics(self, context_extractor, cycle_index, fold_index):
        print('find topics: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        train_records_file_path = Constants.generate_file_name(
            'context_train_records', 'json', Constants.CACHE_FOLDER,
            cycle_index, fold_index, Constants.USE_CONTEXT)

        if os.path.exists(train_records_file_path):
            self.train_records = \
                ETLUtils.load_json_file(train_records_file_path)
        else:
            context_extractor.find_contextual_topics(self.train_records)
            ETLUtils.save_json_file(train_records_file_path,
                                    self.train_records)
        context_extractor.find_contextual_topics(
            self.important_records, Constants.TEXT_SAMPLING_PROPORTION)

        self.context_topics_map = {}
        for record in self.important_records:
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
                record[Constants.CONTEXT_TOPICS_FIELD]

        self.important_records = None
        gc.collect()
Example #38
def generate_excel_file(records, file_name=None):
    my_context_words = []
    if 'hotel' in Constants.ITEM_TYPE:
        for values in grouped_hotel_context_words.values():
            my_context_words.extend(values)
    elif 'restaurant' in Constants.ITEM_TYPE:
        for values in grouped_restaurant_context_words.values():
            my_context_words.extend(values)

    if file_name is None:
        file_name = Constants.generate_file_name(
            'topic_model', 'xlsx', Constants.RESULTS_FOLDER, None, None, True)
    workbook = xlsxwriter.Workbook(file_name)
    worksheet7 = workbook.add_worksheet()

    yellow_format = workbook.add_format()
    yellow_format.set_pattern(1)  # This is optional when using a solid fill.
    yellow_format.set_bg_color('yellow')

    cyan_format = workbook.add_format()
    cyan_format.set_pattern(1)  # This is optional when using a solid fill.
    cyan_format.set_bg_color('cyan')

    green_format = workbook.add_format()
    green_format.set_pattern(1)  # This is optional when using a solid fill.
    green_format.set_bg_color('green')

    headers = [
        'topic_id',
        'ratio',
        'probability_score',
        'weighted_frequency'
    ]
    num_headers = len(headers)
    for i in range(Constants.TOPIC_MODEL_STABILITY_NUM_TERMS):
        headers.append('word' + str(i))

    data = [[record[column] for column in headers] for record in records]
    headers = [{'header': header} for header in headers]
    num_topics = Constants.TOPIC_MODEL_NUM_TOPICS

    for row_index, row_data in enumerate(data):
        for column_index, cell_value in enumerate(row_data[:num_headers]):
            worksheet7.write(row_index + 2, column_index + 1, cell_value)

    # Add words
    for row_index, row_data in enumerate(data):
        for column_index, cell_value in enumerate(row_data[num_headers:]):
            word = cell_value.split('*')[1]
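            # cell_value is a Python 2 byte string such as '0.015*word';
            # decode('utf-8') below converts it to unicode for xlsxwriter.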
            if word in my_context_words:
                worksheet7.write(
                    row_index + 2, column_index + num_headers + 1,
                    cell_value.decode('utf-8'), cyan_format
                )
            else:
                worksheet7.write(
                    row_index + 2, column_index + num_headers + 1,
                    cell_value.decode('utf-8'))

    worksheet7.conditional_format(2, 3, num_topics + 1, 3, {
        'type': 'cell',
        'criteria': '>=',
        'value': 0.1,
        'format': yellow_format})

    worksheet7.add_table(
        1, 1, num_topics + 1,
        num_headers + Constants.TOPIC_MODEL_STABILITY_NUM_TERMS,
        {'columns': headers})

    # Set widths
    worksheet7.set_column(1, 1, 7)
    worksheet7.set_column(3, 3, 7)
    worksheet7.set_column(4, 4, 8)
    worksheet7.set_column(5, 15, 14)
    workbook.close()
Example #39
def get_dataset_file_name():
    return Constants.generate_file_name('topic_ensemble_corpus', '',
                                        CORPUS_FOLDER, None, None, False)[:-1]
Example #40
def main():

    csv_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'csv', Constants.RESULTS_FOLDER, None, None,
        False)
    json_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'json', Constants.RESULTS_FOLDER, None, None,
        False)
    print(csv_file_name)

    # export_lda_topics(0, 0)
    # epsilon_list = [0.001, 0.005, 0.01, 0.03, 0.05, 0.07, 0.1, 0.35, 0.5]
    epsilon_list = [0.05]
    alpha_list = [0.0]
    # num_topics_list =\
    #     [5, 10, 35, 50, 75, 100, 150, 200, 300, 400, 500, 600, 700, 800]
    # num_topics_list = [10, 20, 30, 50, 75, 100, 150, 300]
    # num_topics_list = [150, 300]
    num_topics_list = range(1, 51)
    bow_type_list = ['NN']
    # document_level_list = ['review', 'sentence', 1]
    document_level_list = [1]
    # topic_weighting_methods = ['binary', 'probability']
    topic_weighting_methods = ['probability']
    # review_type_list = ['specific', 'generic', 'all_reviews']
    review_type_list = ['specific']
    # lda_passes_list = [1, 10, 20, 50, 75, 100, 200, 500]
    # lda_passes_list = [1, 10]
    lda_passes_list = [100]
    # lda_iterations_list = [50, 100, 200, 400, 800, 2000]
    # lda_iterations_list = [50, 100, 200, 500]
    lda_iterations_list = [200]
    # topic_model_type_list = ['lda', 'nmf']
    topic_model_type_list = ['nmf']
    num_cycles = len(epsilon_list) * len(alpha_list) * len(num_topics_list) *\
        len(document_level_list) * len(topic_weighting_methods) *\
        len(review_type_list) * len(lda_passes_list) *\
        len(lda_iterations_list) * len(topic_model_type_list) *\
        len(bow_type_list)
    cycle_index = 1
    for epsilon, alpha, num_topics, document_level, topic_weighting_method,\
        review_type, lda_passes, lda_iterations, topic_model_type,\
        bow_type in itertools.product(
            epsilon_list, alpha_list, num_topics_list, document_level_list,
            topic_weighting_methods, review_type_list, lda_passes_list,
            lda_iterations_list, topic_model_type_list, bow_type_list):
        print('\ncycle_index: %d/%d' % (cycle_index, num_cycles))
        new_dict = {
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
            Constants.DOCUMENT_LEVEL_FIELD: document_level,
            Constants.TOPIC_WEIGHTING_METHOD_FIELD: topic_weighting_method,
            Constants.CONTEXT_EXTRACTOR_ALPHA_FIELD: alpha,
            Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD: epsilon,
            Constants.TOPIC_MODEL_REVIEW_TYPE_FIELD: review_type,
            Constants.TOPIC_MODEL_PASSES_FIELD: lda_passes,
            Constants.TOPIC_MODEL_ITERATIONS_FIELD: lda_iterations,
            Constants.TOPIC_MODEL_TYPE_FIELD: topic_model_type,
            Constants.BOW_TYPE_FIELD: bow_type
        }

        print(new_dict)

        Constants.update_properties(new_dict)
        results = Constants.get_properties_copy()
        results.update(analyze_topics(include_stability=True))

        write_results_to_csv(csv_file_name, results)
        write_results_to_json(json_file_name, results)

        cycle_index += 1
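
Note: the cycle count and nested product above generalize to any number of
parameter lists. The same sweep pattern can be written compactly with a
single dict of lists; the field names below are illustrative only:

import itertools

param_lists = {
    'topic_model_num_topics': range(1, 51),
    'topic_model_passes': [100],
    'topic_model_type': ['nmf'],
}

num_cycles = 1
for values in param_lists.values():
    num_cycles *= len(values)

for cycle_index, combination in enumerate(
        itertools.product(*param_lists.values()), start=1):
    new_dict = dict(zip(param_lists.keys(), combination))
    print('cycle_index: %d/%d %s' % (cycle_index, num_cycles, new_dict))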
Example #41
def main():

    csv_file_name = Constants.generate_file_name('topic_model_analysis', 'csv',
                                                 Constants.RESULTS_FOLDER,
                                                 None, None, False)
    json_file_name = Constants.generate_file_name('topic_model_analysis',
                                                  'json',
                                                  Constants.RESULTS_FOLDER,
                                                  None, None, False)
    print(csv_file_name)

    # export_lda_topics(0, 0)
    # epsilon_list = [0.001, 0.005, 0.01, 0.03, 0.05, 0.07, 0.1, 0.35, 0.5]
    epsilon_list = [0.05]
    alpha_list = [0.0]
    # num_topics_list =\
    #     [5, 10, 35, 50, 75, 100, 150, 200, 300, 400, 500, 600, 700, 800]
    # num_topics_list = [10, 20, 30, 50, 75, 100, 150, 300]
    # num_topics_list = [150, 300]
    num_topics_list = range(1, 51)
    bow_type_list = ['NN']
    # document_level_list = ['review', 'sentence', 1]
    document_level_list = [1]
    # topic_weighting_methods = ['binary', 'probability']
    topic_weighting_methods = ['probability']
    # review_type_list = ['specific', 'generic', 'all_reviews']
    review_type_list = ['specific']
    # lda_passes_list = [1, 10, 20, 50, 75, 100, 200, 500]
    # lda_passes_list = [1, 10]
    lda_passes_list = [100]
    # lda_iterations_list = [50, 100, 200, 400, 800, 2000]
    # lda_iterations_list = [50, 100, 200, 500]
    lda_iterations_list = [200]
    # topic_model_type_list = ['lda', 'nmf']
    topic_model_type_list = ['nmf']
    num_cycles = len(epsilon_list) * len(alpha_list) * len(num_topics_list) *\
        len(document_level_list) * len(topic_weighting_methods) *\
        len(review_type_list) * len(lda_passes_list) *\
        len(lda_iterations_list) * len(topic_model_type_list) *\
        len(bow_type_list)
    cycle_index = 1
    for epsilon, alpha, num_topics, document_level, topic_weighting_method,\
        review_type, lda_passes, lda_iterations, topic_model_type,\
        bow_type in itertools.product(
            epsilon_list, alpha_list, num_topics_list, document_level_list,
            topic_weighting_methods, review_type_list, lda_passes_list,
            lda_iterations_list, topic_model_type_list, bow_type_list):
        print('\ncycle_index: %d/%d' % (cycle_index, num_cycles))
        new_dict = {
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
            Constants.DOCUMENT_LEVEL_FIELD: document_level,
            Constants.TOPIC_WEIGHTING_METHOD_FIELD: topic_weighting_method,
            Constants.CONTEXT_EXTRACTOR_ALPHA_FIELD: alpha,
            Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD: epsilon,
            Constants.TOPIC_MODEL_REVIEW_TYPE_FIELD: review_type,
            Constants.TOPIC_MODEL_PASSES_FIELD: lda_passes,
            Constants.TOPIC_MODEL_ITERATIONS_FIELD: lda_iterations,
            Constants.TOPIC_MODEL_TYPE_FIELD: topic_model_type,
            Constants.BOW_TYPE_FIELD: bow_type
        }

        print(new_dict)

        Constants.update_properties(new_dict)
        results = Constants.get_properties_copy()
        results.update(analyze_topics(include_stability=True))

        write_results_to_csv(csv_file_name, results)
        write_results_to_json(json_file_name, results)

        cycle_index += 1
Example #42
import jprops
import pandas

from etl import ETLUtils
from tripadvisor.fourcity import extractor
from utils.constants import Constants

JAVA_COMMAND = 'java'
CARSKIT_JAR = 'CARSKit-v0.3.0.jar'
CARSKIT_ORIGINAL_CONF_FILE = Constants.CARSKIT_FOLDER + 'setting.conf'
# CARSKIT_RATINGS_FOLD_FOLDER = Constants.generate_file_name(
#         'recsys_contextual_records', '', Constants.CACHE_FOLDER + 'rival/',
#         None, None, True, True, normalize_topics=True)[:-1] + '/fold_%d/'
CARSKIT_RATINGS_FOLD_FOLDER = Constants.generate_file_name(
        'recsys_formatted_context_records', '', Constants.CACHE_FOLDER + 'rival/',
        None, None, True, True, uses_carskit=False, normalize_topics=True,
        format_context=True)[:-1] + '/fold_%d/'
CARSKIT_MODIFIED_CONF_FILE = CARSKIT_RATINGS_FOLD_FOLDER + '%s.conf'
OUTPUT_FOLDER = Constants.DATASET_FOLDER + 'carskit_results/'


def run_carskit(fold):

    jar_file = Constants.CARSKIT_FOLDER + 'jar/' + CARSKIT_JAR

    command = [
        JAVA_COMMAND,
        '-jar',
        jar_file,
        '-c',
        CARSKIT_MODIFIED_CONF_FILE % (fold, Constants.CARSKIT_RECOMMENDERS),
Example #43
def generate_excel_file(records, file_name=None):
    my_context_words = []
    if 'hotel' in Constants.ITEM_TYPE:
        for values in grouped_hotel_context_words.values():
            my_context_words.extend(values)
    elif 'restaurant' in Constants.ITEM_TYPE:
        for values in grouped_restaurant_context_words.values():
            my_context_words.extend(values)

    if file_name is None:
        file_name = Constants.generate_file_name('topic_model', 'xlsx',
                                                 Constants.RESULTS_FOLDER,
                                                 None, None, True)
    workbook = xlsxwriter.Workbook(file_name)
    worksheet7 = workbook.add_worksheet()

    yellow_format = workbook.add_format()
    yellow_format.set_pattern(1)  # This is optional when using a solid fill.
    yellow_format.set_bg_color('yellow')

    cyan_format = workbook.add_format()
    cyan_format.set_pattern(1)  # This is optional when using a solid fill.
    cyan_format.set_bg_color('cyan')

    green_format = workbook.add_format()
    green_format.set_pattern(1)  # This is optional when using a solid fill.
    green_format.set_bg_color('green')

    headers = ['topic_id', 'ratio', 'probability_score', 'weighted_frequency']
    num_headers = len(headers)
    for i in range(Constants.TOPIC_MODEL_STABILITY_NUM_TERMS):
        headers.append('word' + str(i))

    data = [[record[column] for column in headers] for record in records]
    headers = [{'header': header} for header in headers]
    num_topics = Constants.TOPIC_MODEL_NUM_TOPICS

    for row_index, row_data in enumerate(data):
        for column_index, cell_value in enumerate(row_data[:num_headers]):
            worksheet7.write(row_index + 2, column_index + 1, cell_value)

    # Add words
    for row_index, row_data in enumerate(data):
        for column_index, cell_value in enumerate(row_data[num_headers:]):
            word = cell_value.split('*')[1]
            if word in my_context_words:
                worksheet7.write(row_index + 2, column_index + num_headers + 1,
                                 cell_value.decode('utf-8'), cyan_format)
            else:
                worksheet7.write(row_index + 2, column_index + num_headers + 1,
                                 cell_value.decode('utf-8'))

    worksheet7.conditional_format(2, 3, num_topics + 1, 3, {
        'type': 'cell',
        'criteria': '>=',
        'value': 0.1,
        'format': yellow_format
    })

    worksheet7.add_table(
        1, 1, num_topics + 1,
        num_headers + Constants.TOPIC_MODEL_STABILITY_NUM_TERMS,
        {'columns': headers})

    # Set widths
    worksheet7.set_column(1, 1, 7)
    worksheet7.set_column(3, 3, 7)
    worksheet7.set_column(4, 4, 8)
    worksheet7.set_column(5, 15, 14)
    workbook.close()
Example #44
def get_dataset_file_name():
    return Constants.generate_file_name(
        'topic_ensemble_corpus', '', CORPUS_FOLDER, None, None, False)[:-1]
Example #45
def full_cycle():

    plant_random_seeds()
    my_records = load_records()
    preprocess_records(my_records)
    x_matrix, y_vector = transform(my_records)
    count_specific_generic(my_records)

    # Error estimation
    error_estimation_results = []
    best_classifier = None
    best_score = 0.0
    for classifier, params in PARAM_GRID_MAP.items():
        # print('Classifier: %s' % classifier)
        cv = StratifiedKFold(Constants.CROSS_VALIDATION_NUM_FOLDS)
        score = error_estimation(
            x_matrix, y_vector, params, cv, SCORE_METRIC).mean()
        error_estimation_results.append(
            {
                'classifier': classifier,
                'accuracy': score,
                Constants.BUSINESS_TYPE_FIELD: Constants.ITEM_TYPE
            }
        )
        print('%s score: %f' % (classifier, score))

        if score > best_score:
            best_score = score
            best_classifier = classifier

    # Model selection
    cv = StratifiedKFold(Constants.CROSS_VALIDATION_NUM_FOLDS)
    grid_search_cv = model_selection(
        x_matrix, y_vector, PARAM_GRID_MAP[best_classifier], cv, SCORE_METRIC)
    # best_model = grid_search_cv.best_estimator_.get_params()['classifier']
    # features_importance = best_model.coef_
    print('%s: %f' % (SCORE_METRIC, grid_search_cv.best_score_))
    print('best params', grid_search_cv.best_params_)

    # for key, value in grid_search_cv.best_params_.items():
    #     print(key, value)

    # print('best estimator', grid_search_cv.best_estimator_)
    # print('features importance', features_importance)

    # csv_file_name = Constants.generate_file_name(
    #     'classifier_results', 'csv', Constants.RESULTS_FOLDER, None,
    #     None, False)
    # json_file_name = Constants.generate_file_name(
    #     'classifier_results', 'json', Constants.RESULTS_FOLDER, None,
    #     None, False)
    csv_file_name2 = Constants.RESULTS_FOLDER + 'classifier_results.csv'
    json_file_name2 = Constants.RESULTS_FOLDER + 'classifier_results.json'

    # results = get_scores(final_grid_search_cv.cv_results_)
    # csv_file = '/Users/fpena/tmp/' + Constants.ITEM_TYPE + '_new_reviews_classifier_results.csv'
    # ETLUtils.save_csv_file(
    #     csv_file_name, error_estimation_results,
    #     error_estimation_results[0].keys())
    # ETLUtils.save_json_file(json_file_name, error_estimation_results)

    for result in error_estimation_results:
        ETLUtils.write_row_to_csv(csv_file_name2, result)
        ETLUtils.write_row_to_json(json_file_name2, result)
    #
    # print(csv_file)

    best_hyperparams_file_name = Constants.generate_file_name(
        'best_hyperparameters', 'json', Constants.CACHE_FOLDER, None,
        None, False)
    save_parameters(best_hyperparams_file_name, grid_search_cv.best_params_)