Code example #1
File: topic_model_analyzer.py Project: swarnamd/yelp
def cli_main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-t', '--numtopics', metavar='int', type=int,
        nargs=1, help='The number of topics of the topic model')

    args = parser.parse_args()
    num_topics = args.numtopics[0] if args.numtopics is not None else None

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})

    results = Constants.get_properties_copy()
    results.update(analyze_topics(include_stability=True))

    csv_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'csv', Constants.RESULTS_FOLDER, None, None,
        False)
    json_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'json', Constants.RESULTS_FOLDER, None, None,
        False)

    write_results_to_csv(csv_file_name, results)
    write_results_to_json(json_file_name, results)
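
Across this page, Constants.generate_file_name is consistently called with a name prefix, a file extension, a destination folder, optional cycle and fold indices, and one or more boolean flags (plus occasional keyword arguments such as uses_carskit or normalize_topics). The helper itself is never shown here, so the following is only a minimal sketch of a compatible implementation; the flag names and the suffix format are assumptions, not the project's actual code. It does reproduce one observable behavior: a dot is appended even when the extension is empty, which is why several examples below strip the trailing dot with [:-1].

def generate_file_name(prefix, extension, folder, cycle_index, fold_index,
                       uses_context, uses_rival=False, **kwargs):
    # Sketch only: the real method presumably encodes further properties
    # (item type, review type, number of topics, ...) into the name, and
    # the two boolean flags are ignored here.
    name = prefix
    if cycle_index is not None:
        name += '_cycle-%d' % cycle_index
    if fold_index is not None:
        name += '_fold-%d' % fold_index
    for key, value in sorted(kwargs.items()):
        name += '_%s-%s' % (key, value)
    # A dot is always appended, so an empty extension leaves a trailing
    # dot that callers remove with [:-1].
    name += '.' + extension
    return folder + name
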
Code example #2
File: topic_model_analyzer.py Project: swarnamd/yelp
def manual_main():

    csv_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'csv', Constants.RESULTS_FOLDER, None, None,
        False)
    json_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'json', Constants.RESULTS_FOLDER, None, None,
        False)
    print(json_file_name)
    print(csv_file_name)

    num_topics_list = [Constants.TOPIC_MODEL_NUM_TOPICS]
    num_cycles = len(num_topics_list)
    cycle_index = 1
    for num_topics in num_topics_list:
        print('\ncycle_index: %d/%d' % (cycle_index, num_cycles))
        new_dict = {
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
        }

        print(new_dict)

        Constants.update_properties(new_dict)
        results = Constants.get_properties_copy()
        results.update(analyze_topics(include_stability=False))

        write_results_to_csv(csv_file_name, results)
        write_results_to_json(json_file_name, results)

        cycle_index += 1
Code example #3
    def count_frequencies(self):
        """
        Counts the number of reviews each user and item have and stores the
        results in two separate files, one for the users and another one for the
        items. Note that the integer IDs are used and not the original user and
        item IDs
        """
        print('%s: count frequencies' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        user_frequency_map = ETLUtils.count_frequency(
            self.records, Constants.USER_INTEGER_ID_FIELD)
        item_frequency_map = ETLUtils.count_frequency(
            self.records, Constants.ITEM_INTEGER_ID_FIELD)

        user_frequency_file = Constants.generate_file_name(
            'user_frequency_map', 'json', Constants.CACHE_FOLDER, None, None,
            False
        )
        item_frequency_file = Constants.generate_file_name(
            'item_frequency_map', 'json', Constants.CACHE_FOLDER, None, None,
            False
        )

        ETLUtils.save_json_file(user_frequency_file, [user_frequency_map])
        ETLUtils.save_json_file(item_frequency_file, [item_frequency_map])
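
ETLUtils.count_frequency is not defined on this page. Given that its result is serialized as a single JSON dictionary, a plausible implementation (an assumption, not the project's code) is:

from collections import Counter


def count_frequency(records, field):
    # Map each distinct value of `field` to the number of records
    # that carry that value.
    return dict(Counter(record[field] for record in records))
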
Code example #4
File: context_transformer.py Project: swarnamd/yelp
    def export_records(self):
        print('%s: exporting transformed records' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        records_to_export = []
        desired_fields = [
            Constants.USER_INTEGER_ID_FIELD,
            Constants.ITEM_INTEGER_ID_FIELD,
            Constants.RATING_FIELD,
            Constants.CONTEXT_FIELD,
        ]

        for record in self.records:
            new_record = {field: record[field] for field in desired_fields}
            records_to_export.append(new_record)

        file_name = Constants.generate_file_name(
            'recsys_formatted_context_records',
            'json',
            Constants.CACHE_FOLDER,
            None,
            None,
            True,
            True,
            uses_carskit=False,
            normalize_topics=True,
            format_context=True)
        ETLUtils.save_json_file(file_name, records_to_export)
Code example #5
def load_pipeline():

    best_hyperparams_file_name = Constants.generate_file_name(
        'best_hyperparameters', 'json', Constants.CACHE_FOLDER, None,
        None, False)

    if not os.path.exists(best_hyperparams_file_name):
        print('Best hyperparameters file not found, running full cycle')
        full_cycle()

    with open(best_hyperparams_file_name, 'r') as json_file:
        file_contents = json_file.read()
        parameters = json.loads(file_contents)

        print(parameters)

        classifiers = {
            'logisticregression': LogisticRegression(),
            'svc': SVC(),
            'kneighborsclassifier': KNeighborsClassifier(),
            'decisiontreeclassifier': DecisionTreeClassifier(),
            'nusvc': NuSVC(),
            'randomforestclassifier': RandomForestClassifier()
        }

        classifier = classifiers[parameters['classifier'].lower()]
        # print(classifier)
        classifier_params = get_classifier_params(parameters)
        classifier.set_params(**classifier_params)
        print(classifier)

        resampler = sampler_factory.create_sampler(
            parameters['resampler'], Constants.DOCUMENT_CLASSIFIER_SEED)

        return Pipeline([('resampler', resampler), ('classifier', classifier)])
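
Because the returned pipeline contains a resampling step, Pipeline here is presumably imbalanced-learn's rather than scikit-learn's, since the latter does not accept samplers as intermediate steps. A hypothetical usage, with x_matrix and y_vector produced by transform(my_records) as in the full_cycle examples further down:

pipeline = load_pipeline()
pipeline.fit(x_matrix, y_vector)
predictions = pipeline.predict(x_matrix)
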
Code example #6
def full_cycle(metric):
    csv_file_name = Constants.generate_file_name(
        metric, 'csv', Constants.RESULTS_FOLDER, None,
        None, False)
    json_file_name = Constants.generate_file_name(
        metric, 'json', Constants.RESULTS_FOLDER, None,
        None, False)
    print(json_file_name)
    print(csv_file_name)

    properties = Constants.get_properties_copy()
    results = evaluate_topic_model(metric)
    print(results)
    results.update(properties)

    ETLUtils.write_row_to_csv(csv_file_name, results)
    ETLUtils.write_row_to_json(json_file_name, results)
Code example #7
def get_topic_model_prefix(folder='', seed=None):

    prefix = 'topic_model'
    if seed is not None:
        prefix += '_seed-' + str(seed)

    return Constants.generate_file_name(
        prefix, '', folder, None, None, True, True)[:-1]
Code example #8
def load_topic_model(cycle_index, fold_index):
    file_path = \
        Constants.generate_file_name(
            'topic_model', 'pkl', Constants.CACHE_FOLDER,
            cycle_index, fold_index, True)
    print(file_path)
    with open(file_path, 'rb') as read_file:
        topic_model = pickle.load(read_file)
    return topic_model
Code example #9
File: classifier_evaluator.py Project: swarnamd/yelp
def full_cycle():

    plant_random_seeds()
    my_records = load_records()
    preprocess_records(my_records)
    x_matrix, y_vector = transform(my_records)
    count_specific_generic(my_records)

    # Error estimation
    best_classifier = None
    best_score = 0.0
    for classifier, params in PARAM_GRID_MAP.items():
        # print('Classifier: %s' % classifier)
        cv = StratifiedKFold(Constants.CROSS_VALIDATION_NUM_FOLDS)
        score = error_estimation(x_matrix, y_vector, params, cv,
                                 SCORE_METRIC).mean()
        print('%s score: %f' % (classifier, score))

        if score > best_score:
            best_score = score
            best_classifier = classifier

    # Model selection
    cv = StratifiedKFold(Constants.CROSS_VALIDATION_NUM_FOLDS)
    grid_search_cv = model_selection(x_matrix, y_vector,
                                     PARAM_GRID_MAP[best_classifier], cv,
                                     SCORE_METRIC)
    # best_model = grid_search_cv.best_estimator_.get_params()['classifier']
    # features_importance = best_model.coef_
    print('%s: %f' % (SCORE_METRIC, grid_search_cv.best_score_))
    print('best params', grid_search_cv.best_params_)

    # for key, value in grid_search_cv.best_params_.items():
    #     print(key, value)

    # print('best estimator', grid_search_cv.best_estimator_)
    # print('features importance', features_importance)

    # csv_file_name = Constants.generate_file_name(
    #     'classifier_results', 'csv', Constants.RESULTS_FOLDER, None,
    #     None, False)
    # json_file_name = Constants.generate_file_name(
    #     'classifier_results', 'json', Constants.RESULTS_FOLDER, None,
    #     None, False)

    # results = get_scores(final_grid_search_cv.cv_results_)
    # csv_file = '/Users/fpena/tmp/' + Constants.ITEM_TYPE + '_new_reviews_classifier_results.csv'
    # ETLUtils.save_csv_file(csv_file, results, results[0].keys())
    #
    # print(csv_file)

    best_hyperparams_file_name = Constants.generate_file_name(
        'best_hyperparameters', 'json', Constants.CACHE_FOLDER, None, None,
        False)
    save_parameters(best_hyperparams_file_name, grid_search_cv.best_params_)
Code example #10
def cycle_eval_topic_model(metric, num_topics_list):

    csv_file_name = Constants.generate_file_name(metric, 'csv',
                                                 Constants.RESULTS_FOLDER,
                                                 None, None, False)

    for topic in num_topics_list:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic})
        results = run_eval_topic_model(metric)
        topic_model_analyzer.write_results_to_csv(csv_file_name, results)
Code example #11
def run_eval_topic_model(metric):

    parse_directory_command = Constants.TOPIC_ENSEMBLE_FOLDER + \
        'eval-' + metric.replace('_', '-') + '.py'

    csv_file = Constants.generate_file_name(
        metric, 'csv', BASE_FOLDER, None, None, True, True)

    dataset_file_name = Constants.generate_file_name(
        'topic_model', '', BASE_FOLDER, None, None, True, True)[:-1] +\
        '/ranks*.pkl'
    topic_model_files = glob.glob(dataset_file_name)

    command = [
        PYTHON_COMMAND,
        parse_directory_command,
    ]
    command.extend(topic_model_files)
    command.extend([
        '-o',
        csv_file
    ])

    print(' '.join(command))

    unique_id = uuid.uuid4().hex
    log_file_name = Constants.GENERATED_FOLDER + Constants.ITEM_TYPE + '_' + \
        Constants.TOPIC_MODEL_TARGET_REVIEWS + '_' + metric + '_' +\
        unique_id + '.log'
    #
    log_file = open(log_file_name, "w")
    p = subprocess.Popen(
        command, stdout=log_file, cwd=Constants.TOPIC_ENSEMBLE_FOLDER)
    p.wait()

    results = read_csv_first_column_as_key(csv_file, metric)
    results[Constants.TOPIC_MODEL_NUM_TOPICS_FIELD] =\
        Constants.TOPIC_MODEL_NUM_TOPICS
    results[Constants.TOPIC_MODEL_TYPE_FIELD] = Constants.TOPIC_MODEL_TYPE

    return results
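
read_csv_first_column_as_key is also not shown. Judging by its name, by how its result is used as a dictionary, and by column names such as term_stability_pairwise_mean appearing later on this page, a minimal compatible sketch could be the following; the exact layout of the topic-ensemble output file is an assumption:

import csv


def read_csv_first_column_as_key(csv_file, metric):
    # Assumed format: no header row, a key (e.g. 'mean') in the first
    # column and the metric value in the second one.
    results = {}
    with open(csv_file) as input_file:
        for row in csv.reader(input_file):
            results['%s_%s' % (metric, row[0])] = row[1]
    return results
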
Code example #12
File: nmf_topic_extractor.py Project: swarnamd/yelp
    def load_document_term_matrix():
        topic_model_corpus_folder = \
            Constants.CACHE_FOLDER + 'topic_models/corpus/'
        corpus_path = Constants.generate_file_name(
            'topic_ensemble_corpus', '', topic_model_corpus_folder,
            None, None, False)[:-1] + '.pkl'

        document_term_matrix, _, _, _ = load_corpus(corpus_path)

        print("Loaded document-term matrix of size %s" % str(document_term_matrix.shape))

        return document_term_matrix
Code example #13
    def separate_recsys_topic_model_records(self):

        print('%s: separate_recsys_topic_model_records' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        num_records = len(self.records)
        topic_model_records = self.records[:num_records // 2]

        if not Constants.USE_CONTEXT:
            recsys_records = self.records[num_records // 2:]

            file_name = \
                Constants.generate_file_name(
                    'recsys_contextual_records', 'json', Constants.CACHE_FOLDER,
                    None, None, False, True)

            print('Records without context file: %s' % file_name)

            for record in recsys_records:
                record[Constants.CONTEXT_TOPICS_FIELD] = {'na': 1.0}

            ETLUtils.save_json_file(file_name, recsys_records)
            return

        topic_model_creator.train_topic_model(topic_model_records)

        if os.path.exists(Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE):
            print('Recsys topic records have already been generated')
            recsys_records = ETLUtils.load_json_file(
                Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)
        else:
            recsys_records = self.records[num_records // 2:]
            self.find_topic_distribution(recsys_records)
            ETLUtils.save_json_file(
                Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE, recsys_records)

        if os.path.exists(Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE):
            print('Recsys contextual records have already been generated')
            print(Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE)
            recsys_records = ETLUtils.load_json_file(
                Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE)
        else:
            self.update_context_topics(recsys_records)
            ETLUtils.save_json_file(
                Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE,
                recsys_records
            )

        context_transformer = ContextTransformer(recsys_records)
        context_transformer.load_data()
        context_transformer.transform_records()
        context_transformer.export_records()
Code example #14
File: main.py Project: swarnamd/yelp
def create_topic_model_with_context_records():

    processed_records_file = Constants.generate_file_name(
        'classified_processed_reviews', 'json', Constants.CACHE_FOLDER, None,
        None, False, True)
    records = ETLUtils.load_json_file(processed_records_file)
    print('records length: %d' % len(records))

    context_records = ETLUtils.filter_records(
        records, 'context_type', ['context'])
    print('context records length: %d' % len(context_records))
    context_specific_records = ETLUtils.filter_records(
        context_records, 'predicted_class', ['specific'])
    print('context specific records length: %d' %
          len(context_specific_records))

    for i in range(len(context_specific_records)):
        # print('%d:\t%s' % (i, context_records[i]['text']))
        print('%d:\t%s' % (i, context_specific_records[i]['bow']))

    for i in range(1, len(context_records)+1):

        Constants.update_properties({Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: i})
        context_extractor = \
            topic_model_creator.create_topic_model(records, None, None)

        topic_data = []

        for topic in range(Constants.TOPIC_MODEL_NUM_TOPICS):
            result = {}
            result['topic_id'] = topic
            result.update(split_topic(context_extractor.print_topic_model(
                num_terms=Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)[topic]))
            result['ratio'] = context_extractor.topic_ratio_map[topic]
            result['weighted_frequency'] = \
                context_extractor.topic_weighted_frequency_map[topic]
            topic_data.append(result)

        file_name = Constants.generate_file_name(
            'manual_topic_model', 'xlsx', Constants.DATASET_FOLDER, None,
            None, True)
        generate_excel_file(topic_data, file_name)
Code example #15
File: thesis_charts.py Project: sarthikadhawan/yelp
def plot_ats_score():
    metric = 'term_difference'
    metric = 'term_stability_pairwise'

    csv_file_name = Constants.generate_file_name(metric, 'csv',
                                                 Constants.RESULTS_FOLDER,
                                                 None, None, False)
    json_file_name = Constants.generate_file_name(metric, 'json',
                                                  Constants.RESULTS_FOLDER,
                                                  None, None, False)

    data_frame = pandas.read_csv(csv_file_name)
    stability_column = 'term_stability_pairwise_mean'
    topic_model_column = 'Topic modeling algorithm'
    num_topics_field = Constants.TOPIC_MODEL_NUM_TOPICS_FIELD

    data_frame.rename(columns={'topic_model_type': topic_model_column},
                      inplace=True)
    data_frame[topic_model_column] = data_frame[topic_model_column].map({
        'lda': 'LDA',
        'nmf': 'NMF',
        'ensemble': 'Ensemble'
    })

    g = seaborn.barplot(x=num_topics_field,
                        y=stability_column,
                        hue=topic_model_column,
                        data=data_frame)
    g.set(xlabel='Number of topics', ylabel='ATS')
    plt.ylim(0, 1.18)
    # g.ylim(10, 40)

    output_folder = Constants.RESULTS_FOLDER + 'pdf/'
    file_name = output_folder + Constants.ITEM_TYPE + '_ats.pdf'
    g.figure.savefig(file_name)
Code example #16
File: libfm_caller.py Project: sarthikadhawan/yelp
def main():
    print('%s: Making predictions with LibFM' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    prediction_type_map = {
        'user_test': 'rating',
        'test_items': 'rating',
        'rel_plus_n': 'ranking'
    }
    prediction_type = prediction_type_map[Constants.RIVAL_EVALUATION_STRATEGY]
    use_cache = True

    libfm_ratings_fold_folder = Constants.generate_file_name(
        'recsys_formatted_context_records',
        '',
        Constants.CACHE_FOLDER + 'rival/',
        None,
        None,
        True,
        True,
        uses_carskit=False,
        normalize_topics=True,
        format_context=True)[:-1] + '/fold_%d/'

    for fold in range(Constants.CROSS_VALIDATION_NUM_FOLDS):

        ratings_fold_folder = libfm_ratings_fold_folder % fold
        # ratings_fold_folder = Constants.CACHE_FOLDER + 'rival/contextaa/fold_%d/' % fold
        train_file = ratings_fold_folder + 'libfm_train.libfm'
        predictions_file = ratings_fold_folder + 'libfm_predictions_' + \
                    prediction_type + '.libfm'
        fm_num_factors = Constants.FM_NUM_FACTORS
        results_file = ratings_fold_folder + 'libfm_results_' + \
            prediction_type + '_fmfactors-' + str(fm_num_factors) + '.txt'

        if use_cache and os.path.exists(results_file):
            print("Fold %d file already exists ('%s') " % (fold, results_file))
            continue

        # predictions_file = ratings_fold_folder + 'libfm_test.libfm'
        # results_file = ratings_fold_folder + 'libfm_predictions.txt'
        log_file = ratings_fold_folder + 'libfm_log.txt'
        save_file = ratings_fold_folder + 'libfm_model.txt'

        if not os.path.exists(ratings_fold_folder):
            os.makedirs(ratings_fold_folder)

        run_libfm(train_file, predictions_file, results_file, log_file,
                  save_file)
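
run_libfm itself is not included on this page. A sketch under the assumption that it shells out to the standard libFM binary with its usual flags (-task, -train, -test, -dim, -out, -rlog, -save_model); the LIBFM_COMMAND constant is hypothetical:

import subprocess

LIBFM_COMMAND = 'libFM'  # assumed path to the libFM binary


def run_libfm(train_file, test_file, results_file, log_file, save_file):
    command = [
        LIBFM_COMMAND,
        '-task', 'r',                                 # rating prediction
        '-train', train_file,
        '-test', test_file,
        '-dim', '1,1,%d' % Constants.FM_NUM_FACTORS,  # bias, 1-way, k factors
        '-out', results_file,
        '-rlog', log_file,
        '-save_model', save_file,
    ]
    subprocess.call(command)
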
Code example #17
    def train_topic_model(self, cycle_index, fold_index):

        context_extractor = topic_model_creator.create_topic_model(
            self.train_records, cycle_index, fold_index)
        self.context_rich_topics = context_extractor.context_rich_topics

        topics_file_path = Constants.generate_file_name(
            'context_topics', 'json', Constants.CACHE_FOLDER, cycle_index,
            fold_index, True)
        ETLUtils.save_json_file(topics_file_path,
                                [dict(self.context_rich_topics)])
        print('Trained Context Extractor: %s' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        return context_extractor
Code example #18
    def load_context_reviews(self, cycle_index, fold_index):

        train_records_file_path = Constants.generate_file_name(
            'context_train_records', 'json', Constants.CACHE_FOLDER,
            cycle_index, fold_index, True)
        important_records_file_path = Constants.generate_file_name(
            'context_important_records', 'json', Constants.CACHE_FOLDER,
            cycle_index, fold_index, True)

        self.train_records = ETLUtils.load_json_file(train_records_file_path)
        self.important_records = \
            ETLUtils.load_json_file(important_records_file_path)
        self.load_cache_context_topics(cycle_index, fold_index)

        self.context_topics_map = {}
        for record in self.important_records:
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
                record[Constants.CONTEXT_TOPICS_FIELD]

        # self.train_records = self.filter_context_words(self.train_records)
        # self.print_context_topics(self.important_records)

        self.important_records = None
        gc.collect()
Code example #19
File: carskit_caller.py Project: melqkiades/yelp
def analyze_results():
    json_file = Constants.generate_file_name(
        'carskit_results', 'json', OUTPUT_FOLDER, None, None, False)
    records = ETLUtils.load_json_file(json_file)

    data_frame = pandas.DataFrame(records)
    print(sorted(list(data_frame.columns.values)))
    cols = [
        'ck_rec10', 'ck_pre10', 'ck_algorithm', 'carskit_nominal_format',
        'topic_model_num_topics', 'topic_model_normalize']
    data_frame = data_frame[cols]
    data_frame = data_frame.sort_values(['ck_rec10'])
    print(data_frame)

    data_frame.to_csv('/Users/fpena/tmp/' + Constants.ITEM_TYPE + '_carskit.csv')
Code example #20
def get_topic_ensemble_ranks_file_paths():

    num_models = Constants.TOPIC_MODEL_STABILITY_ITERATIONS
    random_seeds = range(1, num_models + 1)

    suffix = 'ranks_ensemble_k%02d.pkl' % Constants.TOPIC_MODEL_NUM_TOPICS

    file_paths = []

    for seed in random_seeds:
        prefix = 'topic_model_seed-' + str(seed)
        topic_model_folder = Constants.generate_file_name(
            prefix, '', Constants.ENSEMBLE_FOLDER, None, None, True, True)[:-1]
        topic_model_file = topic_model_folder + '/' + suffix
        file_paths.append(topic_model_file)

    return file_paths
Code example #21
File: carskit_caller.py Project: swarnamd/yelp
def save_results(results):
    """
    Take the results given by the run_carskit function, extend them with
    the Constants.get_properties() dictionary and save them to a JSON file.

    :type results: list[dict]
    :param results: the results to export
    """
    properties = Constants.get_properties_copy()

    json_file = Constants.generate_file_name('carskit_results', 'json',
                                             OUTPUT_FOLDER, None, None, False)

    for result in results:
        result.update(properties)
        write_results_to_json(json_file, result)
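
write_results_to_json is not shown either. Since save_results calls it once per result against the same file, it presumably appends rather than overwrites; a minimal sketch, with the one-record-per-line behavior being an assumption:

import json


def write_results_to_json(file_name, results):
    # Append one JSON object per line so that repeated calls accumulate
    # records in the same file.
    with open(file_name, 'a') as json_file:
        json.dump(results, json_file)
        json_file.write('\n')
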
Code example #22
    def load_cache_context_topics(self, cycle_index, fold_index):

        print('load cache context topics: %s' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        topics_file_path = Constants.generate_file_name(
            'context_topics', 'json', Constants.CACHE_FOLDER, cycle_index,
            fold_index, True)

        self.context_rich_topics = sorted(
            ETLUtils.load_json_file(topics_file_path)[0].items(),
            key=operator.itemgetter(1),
            reverse=True)

        self.context_topics_map = {}
        for record in self.important_records:
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
                record[Constants.CONTEXT_TOPICS_FIELD]
Code example #23
def create_topic_model(records, cycle_index, fold_index, check_exists=True):

    print('%s: Create topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    topic_model_file_path = \
        Constants.generate_file_name(
            'topic_model', 'pkl', Constants.CACHE_FOLDER,
            cycle_index, fold_index, True)

    print(topic_model_file_path)

    if check_exists and os.path.exists(topic_model_file_path):
        print('WARNING: Topic model already exists')
        return load_topic_model(cycle_index, fold_index)

    topic_model = train_context_extractor(records)

    with open(topic_model_file_path, 'wb') as write_file:
        pickle.dump(topic_model, write_file, pickle.HIGHEST_PROTOCOL)

    return topic_model
Code example #24
def train_topic_model(records):
    print('%s: train topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if Constants.TOPIC_MODEL_TYPE == 'lda':

        topic_model_file_path = \
            Constants.generate_file_name(
                'topic_model', 'pkl', Constants.CACHE_FOLDER, None, None, True)
        if os.path.exists(topic_model_file_path):
            print('WARNING: Topic model already exists')
            return

        corpus = \
            [record[Constants.CORPUS_FIELD] for record in records]
        dictionary = corpora.Dictionary.load(Constants.DICTIONARY_FILE)
        topic_model = ldamodel.LdaModel(
            corpus, id2word=dictionary,
            num_topics=Constants.TOPIC_MODEL_NUM_TOPICS,
            passes=Constants.TOPIC_MODEL_PASSES,
            iterations=Constants.TOPIC_MODEL_ITERATIONS)

        with open(topic_model_file_path, 'wb') as write_file:
            pickle.dump(topic_model, write_file, pickle.HIGHEST_PROTOCOL)

    elif Constants.TOPIC_MODEL_TYPE == 'ensemble':
        file_path = Constants.ENSEMBLED_RESULTS_FOLDER + \
                    "factors_final_k%02d.pkl" % Constants.TOPIC_MODEL_NUM_TOPICS

        if os.path.exists(file_path):
            print('Ensemble topic model already exists')
            return

        export_to_text(records)
        topic_ensemble_caller.run_local_parse_directory()
        topic_ensemble_caller.run_generate_kfold()
        topic_ensemble_caller.run_combine_nmf()

    else:
        raise ValueError('Unrecognized topic modeling algorithm: \'%s\'' %
                         Constants.TOPIC_MODEL_TYPE)
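
Once the 'lda' branch has run, the pickled gensim model can be reloaded and inspected. A short usage sketch that rebuilds the file name with the same call used during training (show_topics is part of gensim's LdaModel API):

import pickle

topic_model_file_path = Constants.generate_file_name(
    'topic_model', 'pkl', Constants.CACHE_FOLDER, None, None, True)
with open(topic_model_file_path, 'rb') as read_file:
    topic_model = pickle.load(read_file)

for topic in topic_model.show_topics(
        num_topics=Constants.TOPIC_MODEL_NUM_TOPICS, num_words=10):
    print(topic)
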
Code example #25
File: nmf_topic_extractor.py Project: swarnamd/yelp
    def load_trained_data(self):

        file_path = Constants.ENSEMBLED_RESULTS_FOLDER + \
            "factors_final_k%02d.pkl" % self.num_topics
        W, H, doc_ids, terms = load_nmf_factors(file_path)
        self.topic_term_matrix = H
        self.document_topic_matrix = W
        self.terms = terms

        topic_model_corpus_folder = \
            Constants.CACHE_FOLDER + 'topic_models/corpus/'
        tfidf_file_path = Constants.generate_file_name(
            'topic_ensemble_corpus', '', topic_model_corpus_folder, None, None,
            False)[:-1] + '_tfidf.pkl'

        self.tfidf_vectorizer = load_tfidf(tfidf_file_path)

        # print('tfidf vectorizer', self.tfidf_vectorizer)

        print "Loaded factor W of size %s and factor H of size %s" % (str(
            self.document_topic_matrix.shape), str(
                self.topic_term_matrix.shape))
Code example #26
    def find_reviews_topics(self, context_extractor, cycle_index, fold_index):
        print('find topics: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        train_records_file_path = Constants.generate_file_name(
            'context_train_records', 'json', Constants.CACHE_FOLDER,
            cycle_index, fold_index, Constants.USE_CONTEXT)

        if os.path.exists(train_records_file_path):
            self.train_records = \
                ETLUtils.load_json_file(train_records_file_path)
        else:
            context_extractor.find_contextual_topics(self.train_records)
            ETLUtils.save_json_file(train_records_file_path,
                                    self.train_records)
        context_extractor.find_contextual_topics(
            self.important_records, Constants.TEXT_SAMPLING_PROPORTION)

        self.context_topics_map = {}
        for record in self.important_records:
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
                record[Constants.CONTEXT_TOPICS_FIELD]

        self.important_records = None
        gc.collect()
Code example #27
File: topic_model_analyzer.py Project: swarnamd/yelp
def generate_excel_file(records, file_name=None):
    my_context_words = []
    if 'hotel' in Constants.ITEM_TYPE:
        for values in grouped_hotel_context_words.values():
            my_context_words.extend(values)
    elif 'restaurant' in Constants.ITEM_TYPE:
        for values in grouped_restaurant_context_words.values():
            my_context_words.extend(values)

    if file_name is None:
        file_name = Constants.generate_file_name(
            'topic_model', 'xlsx', Constants.RESULTS_FOLDER, None, None, True)
    workbook = xlsxwriter.Workbook(file_name)
    worksheet7 = workbook.add_worksheet()

    yellow_format = workbook.add_format()
    yellow_format.set_pattern(1)  # This is optional when using a solid fill.
    yellow_format.set_bg_color('yellow')

    cyan_format = workbook.add_format()
    cyan_format.set_pattern(1)  # This is optional when using a solid fill.
    cyan_format.set_bg_color('cyan')

    green_format = workbook.add_format()
    green_format.set_pattern(1)  # This is optional when using a solid fill.
    green_format.set_bg_color('green')

    headers = [
        'topic_id',
        'ratio',
        'probability_score',
        'weighted_frequency'
    ]
    num_headers = len(headers)
    for i in range(Constants.TOPIC_MODEL_STABILITY_NUM_TERMS):
        headers.append('word' + str(i))

    data = [[record[column] for column in headers] for record in records]
    headers = [{'header': header} for header in headers]
    num_topics = Constants.TOPIC_MODEL_NUM_TOPICS

    for row_index, row_data in enumerate(data):
        for column_index, cell_value in enumerate(row_data[:num_headers]):
            worksheet7.write(row_index + 2, column_index + 1, cell_value)

    # Add words
    for row_index, row_data in enumerate(data):
        for column_index, cell_value in enumerate(row_data[num_headers:]):
            word = cell_value.split('*')[1]
            if word in my_context_words:
                worksheet7.write(
                    row_index + 2, column_index + num_headers + 1,
                    cell_value.decode('utf-8'), cyan_format
                )
            else:
                worksheet7.write(
                    row_index + 2, column_index + num_headers + 1,
                    cell_value.decode('utf-8'))

    worksheet7.conditional_format(2, 3, num_topics + 1, 3, {
        'type': 'cell',
        'criteria': '>=',
        'value': 0.1,
        'format': yellow_format})

    worksheet7.add_table(
        1, 1, num_topics + 1,
        num_headers + Constants.TOPIC_MODEL_STABILITY_NUM_TERMS,
        {'columns': headers})

    # Set widths
    worksheet7.set_column(1, 1, 7)
    worksheet7.set_column(3, 3, 7)
    worksheet7.set_column(4, 4, 8)
    worksheet7.set_column(5, 15, 14)
    workbook.close()
Code example #28
def get_dataset_file_name():
    return Constants.generate_file_name('topic_ensemble_corpus', '',
                                        CORPUS_FOLDER, None, None, False)[:-1]
Code example #29
File: topic_model_analyzer.py Project: swarnamd/yelp
def main():

    csv_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'csv', Constants.RESULTS_FOLDER, None, None,
        False)
    json_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'json', Constants.RESULTS_FOLDER, None, None,
        False)
    print(csv_file_name)

    # export_lda_topics(0, 0)
    # epsilon_list = [0.001, 0.005, 0.01, 0.03, 0.05, 0.07, 0.1, 0.35, 0.5]
    epsilon_list = [0.05]
    alpha_list = [0.0]
    # num_topics_list =\
    #     [5, 10, 35, 50, 75, 100, 150, 200, 300, 400, 500, 600, 700, 800]
    # num_topics_list = [10, 20, 30, 50, 75, 100, 150, 300]
    # num_topics_list = [150, 300]
    num_topics_list = range(1, 51)
    bow_type_list = ['NN']
    # document_level_list = ['review', 'sentence', 1]
    document_level_list = [1]
    # topic_weighting_methods = ['binary', 'probability']
    topic_weighting_methods = ['probability']
    # review_type_list = ['specific', 'generic', 'all_reviews']
    review_type_list = ['specific']
    # lda_passes_list = [1, 10, 20, 50, 75, 100, 200, 500]
    # lda_passes_list = [1, 10]
    lda_passes_list = [100]
    # lda_iterations_list = [50, 100, 200, 400, 800, 2000]
    # lda_iterations_list = [50, 100, 200, 500]
    lda_iterations_list = [200]
    # topic_model_type_list = ['lda', 'nmf']
    topic_model_type_list = ['nmf']
    num_cycles = len(epsilon_list) * len(alpha_list) * len(num_topics_list) *\
        len(document_level_list) * len(topic_weighting_methods) *\
        len(review_type_list) * len(lda_passes_list) *\
        len(lda_iterations_list) * len(topic_model_type_list) *\
        len(bow_type_list)
    cycle_index = 1
    for epsilon, alpha, num_topics, document_level, topic_weighting_method,\
        review_type, lda_passes, lda_iterations, topic_model_type,\
        bow_type in itertools.product(
            epsilon_list, alpha_list, num_topics_list, document_level_list,
            topic_weighting_methods, review_type_list, lda_passes_list,
            lda_iterations_list, topic_model_type_list, bow_type_list):
        print('\ncycle_index: %d/%d' % (cycle_index, num_cycles))
        new_dict = {
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
            Constants.DOCUMENT_LEVEL_FIELD: document_level,
            Constants.TOPIC_WEIGHTING_METHOD_FIELD: topic_weighting_method,
            Constants.CONTEXT_EXTRACTOR_ALPHA_FIELD: alpha,
            Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD: epsilon,
            Constants.TOPIC_MODEL_REVIEW_TYPE_FIELD: review_type,
            Constants.TOPIC_MODEL_PASSES_FIELD: lda_passes,
            Constants.TOPIC_MODEL_ITERATIONS_FIELD: lda_iterations,
            Constants.TOPIC_MODEL_TYPE_FIELD: topic_model_type,
            Constants.BOW_TYPE_FIELD: bow_type
        }

        print(new_dict)

        Constants.update_properties(new_dict)
        results = Constants.get_properties_copy()
        results.update(analyze_topics(include_stability=True))

        write_results_to_csv(csv_file_name, results)
        write_results_to_json(json_file_name, results)

        cycle_index += 1
Code example #30
import jprops
import pandas

from etl import ETLUtils
from tripadvisor.fourcity import extractor
from utils.constants import Constants

JAVA_COMMAND = 'java'
CARSKIT_JAR = 'CARSKit-v0.3.0.jar'
CARSKIT_ORIGINAL_CONF_FILE = Constants.CARSKIT_FOLDER + 'setting.conf'
# CARSKIT_RATINGS_FOLD_FOLDER = Constants.generate_file_name(
#         'recsys_contextual_records', '', Constants.CACHE_FOLDER + 'rival/',
#         None, None, True, True, normalize_topics=True)[:-1] + '/fold_%d/'
CARSKIT_RATINGS_FOLD_FOLDER = Constants.generate_file_name(
        'recsys_formatted_context_records', '', Constants.CACHE_FOLDER + 'rival/',
        None, None, True, True, uses_carskit=False, normalize_topics=True,
        format_context=True)[:-1] + '/fold_%d/'
CARSKIT_MODIFIED_CONF_FILE = CARSKIT_RATINGS_FOLD_FOLDER + '%s.conf'
OUTPUT_FOLDER = Constants.DATASET_FOLDER + 'carskit_results/'


def run_carskit(fold):

    jar_file = Constants.CARSKIT_FOLDER + 'jar/' + CARSKIT_JAR

    command = [
        JAVA_COMMAND,
        '-jar',
        jar_file,
        '-c',
        CARSKIT_MODIFIED_CONF_FILE % (fold, Constants.CARSKIT_RECOMMENDERS),
Code example #31
def full_cycle():

    plant_random_seeds()
    my_records = load_records()
    preprocess_records(my_records)
    x_matrix, y_vector = transform(my_records)
    count_specific_generic(my_records)

    # Error estimation
    error_estimation_results = []
    best_classifier = None
    best_score = 0.0
    for classifier, params in PARAM_GRID_MAP.items():
        # print('Classifier: %s' % classifier)
        cv = StratifiedKFold(Constants.CROSS_VALIDATION_NUM_FOLDS)
        score = error_estimation(
            x_matrix, y_vector, params, cv, SCORE_METRIC).mean()
        error_estimation_results.append(
            {
                'classifier': classifier,
                'accuracy': score,
                Constants.BUSINESS_TYPE_FIELD: Constants.ITEM_TYPE
            }
        )
        print('%s score: %f' % (classifier, score))

        if score > best_score:
            best_score = score
            best_classifier = classifier

    # Model selection
    cv = StratifiedKFold(Constants.CROSS_VALIDATION_NUM_FOLDS)
    grid_search_cv = model_selection(
        x_matrix, y_vector, PARAM_GRID_MAP[best_classifier], cv, SCORE_METRIC)
    # best_model = grid_search_cv.best_estimator_.get_params()['classifier']
    # features_importance = best_model.coef_
    print('%s: %f' % (SCORE_METRIC, grid_search_cv.best_score_))
    print('best params', grid_search_cv.best_params_)

    # for key, value in grid_search_cv.best_params_.items():
    #     print(key, value)

    # print('best estimator', grid_search_cv.best_estimator_)
    # print('features importance', features_importance)

    # csv_file_name = Constants.generate_file_name(
    #     'classifier_results', 'csv', Constants.RESULTS_FOLDER, None,
    #     None, False)
    # json_file_name = Constants.generate_file_name(
    #     'classifier_results', 'json', Constants.RESULTS_FOLDER, None,
    #     None, False)
    csv_file_name2 = Constants.RESULTS_FOLDER + 'classifier_results.csv'
    json_file_name2 = Constants.RESULTS_FOLDER + 'classifier_results.json'

    # results = get_scores(final_grid_search_cv.cv_results_)
    # csv_file = '/Users/fpena/tmp/' + Constants.ITEM_TYPE + '_new_reviews_classifier_results.csv'
    # ETLUtils.save_csv_file(
    #     csv_file_name, error_estimation_results,
    #     error_estimation_results[0].keys())
    # ETLUtils.save_json_file(json_file_name, error_estimation_results)
    # print(csv_file)

    for result in error_estimation_results:
        ETLUtils.write_row_to_csv(csv_file_name2, result)
        ETLUtils.write_row_to_json(json_file_name2, result)

    best_hyperparams_file_name = Constants.generate_file_name(
        'best_hyperparameters', 'json', Constants.CACHE_FOLDER, None,
        None, False)
    save_parameters(best_hyperparams_file_name, grid_search_cv.best_params_)