Example #1
def cli_main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-t', '--numtopics', metavar='int', type=int,
        nargs=1, help='The number of topics of the topic model')

    args = parser.parse_args()
    num_topics = args.numtopics[0] if args.numtopics is not None else None

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})

    results = Constants.get_properties_copy()
    results.update(analyze_topics(include_stability=True))

    csv_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'csv', Constants.RESULTS_FOLDER, None, None,
        False)
    json_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'json', Constants.RESULTS_FOLDER, None, None,
        False)

    write_results_to_csv(csv_file_name, results)
    write_results_to_json(json_file_name, results)
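
Note: the Constants.generate_file_name implementation itself is not shown on
this page. Judging only from these call sites, it takes a base name, an
extension, a folder, optional cycle and fold indices, and one or more boolean
flags; several callers pass an empty extension and strip a trailing dot with
[:-1]. The following is a minimal sketch of a compatible function under those
assumptions, not the project's actual code:

import os


def generate_file_name(name, extension, folder, cycle_index, fold_index,
                       uses_context, *args, **kwargs):
    # Hypothetical reconstruction: join the base name with the optional
    # cycle/fold indices and append the extension. The real implementation
    # presumably also encodes flags such as uses_carskit, normalize_topics
    # and format_context into the name.
    parts = [name]
    if cycle_index is not None:
        parts.append('cycle-%d' % cycle_index)
    if fold_index is not None:
        parts.append('fold-%d' % fold_index)
    return os.path.join(folder, '_'.join(parts) + '.' + extension)


print(generate_file_name(
    'topic_model_analysis', 'csv', '/tmp/results', None, None, False))
# /tmp/results/topic_model_analysis.csv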
Example #2
def manual_main():

    csv_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'csv', Constants.RESULTS_FOLDER, None, None,
        False)
    json_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'json', Constants.RESULTS_FOLDER, None, None,
        False)
    print(json_file_name)
    print(csv_file_name)

    num_topics_list = [Constants.TOPIC_MODEL_NUM_TOPICS]
    num_cycles = len(num_topics_list)
    cycle_index = 1
    for num_topics in num_topics_list:
        print('\ncycle_index: %d/%d' % (cycle_index, num_cycles))
        new_dict = {
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
        }

        print(new_dict)

        Constants.update_properties(new_dict)
        results = Constants.get_properties_copy()
        results.update(analyze_topics(include_stability=False))

        write_results_to_csv(csv_file_name, results)
        write_results_to_json(json_file_name, results)

        cycle_index += 1
Example #3
def manual_main():

    csv_file_name = Constants.generate_file_name('topic_model_analysis', 'csv',
                                                 Constants.RESULTS_FOLDER,
                                                 None, None, False)
    json_file_name = Constants.generate_file_name('topic_model_analysis',
                                                  'json',
                                                  Constants.RESULTS_FOLDER,
                                                  None, None, False)
    print(json_file_name)
    print(csv_file_name)

    num_topics_list = [Constants.TOPIC_MODEL_NUM_TOPICS]
    num_cycles = len(num_topics_list)
    cycle_index = 1
    for num_topics in num_topics_list:
        print('\ncycle_index: %d/%d' % (cycle_index, num_cycles))
        new_dict = {
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
        }

        print(new_dict)

        Constants.update_properties(new_dict)
        results = Constants.get_properties_copy()
        results.update(analyze_topics(include_stability=False))

        write_results_to_csv(csv_file_name, results)
        write_results_to_json(json_file_name, results)

        cycle_index += 1
Example #4
def cli_main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-t',
                        '--numtopics',
                        metavar='int',
                        type=int,
                        nargs=1,
                        help='The number of topics of the topic model')

    args = parser.parse_args()
    num_topics = args.numtopics[0] if args.numtopics is not None else None

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})

    results = Constants.get_properties_copy()
    results.update(analyze_topics(include_stability=True))

    csv_file_name = Constants.generate_file_name('topic_model_analysis', 'csv',
                                                 Constants.RESULTS_FOLDER,
                                                 None, None, False)
    json_file_name = Constants.generate_file_name('topic_model_analysis',
                                                  'json',
                                                  Constants.RESULTS_FOLDER,
                                                  None, None, False)

    write_results_to_csv(csv_file_name, results)
    write_results_to_json(json_file_name, results)
Example #5
    def count_frequencies(self):
        """
        Counts the number of reviews each user and item have and stores the
        results in two separate files, one for the users and another one for the
        items. Note that the integer IDs are used and not the original user and
        item IDs
        """
        print('%s: count frequencies' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        user_frequency_map = ETLUtils.count_frequency(
            self.records, Constants.USER_INTEGER_ID_FIELD)
        item_frequency_map = ETLUtils.count_frequency(
            self.records, Constants.ITEM_INTEGER_ID_FIELD)

        user_frequency_file = Constants.generate_file_name(
            'user_frequency_map', 'json', Constants.CACHE_FOLDER, None, None,
            False
        )
        item_frequency_file = Constants.generate_file_name(
            'item_frequency_map', 'json', Constants.CACHE_FOLDER, None, None,
            False
        )

        ETLUtils.save_json_file(user_frequency_file, [user_frequency_map])
        ETLUtils.save_json_file(item_frequency_file, [item_frequency_map])
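
Note: ETLUtils.count_frequency is not shown on this page. From its usage it
plausibly tallies how many records share each value of a field; a
hypothetical sketch with collections.Counter:

from collections import Counter


def count_frequency(records, field):
    # Hypothetical reconstruction: map each value of `field` to the number
    # of records that carry it, e.g. reviews per user_integer_id.
    return dict(Counter(record[field] for record in records))


records = [{'user_integer_id': 1}, {'user_integer_id': 1},
           {'user_integer_id': 2}]
print(count_frequency(records, 'user_integer_id'))  # {1: 2, 2: 1}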
Example #6
    def export_records(self):
        print('%s: exporting transformed records' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        records_to_export = []
        desired_fields = [
            Constants.USER_INTEGER_ID_FIELD,
            Constants.ITEM_INTEGER_ID_FIELD,
            Constants.RATING_FIELD,
            Constants.CONTEXT_FIELD,
        ]

        for record in self.records:
            new_record = {field: record[field] for field in desired_fields}
            records_to_export.append(new_record)

        file_name = Constants.generate_file_name(
            'recsys_formatted_context_records',
            'json',
            Constants.CACHE_FOLDER,
            None,
            None,
            True,
            True,
            uses_carskit=False,
            normalize_topics=True,
            format_context=True)
        ETLUtils.save_json_file(file_name, records_to_export)
Example #7
def load_pipeline():

    best_hyperparams_file_name = Constants.generate_file_name(
        'best_hyperparameters', 'json', Constants.CACHE_FOLDER, None,
        None, False)

    if not os.path.exists(best_hyperparams_file_name):
        print('Best hyperparameters file does not exist, running full cycle')
        full_cycle()

    with open(best_hyperparams_file_name, 'r') as json_file:
        file_contents = json_file.read()
        parameters = json.loads(file_contents)

        print(parameters)

        classifiers = {
            'logisticregression': LogisticRegression(),
            'svc': SVC(),
            'kneighborsclassifier': KNeighborsClassifier(),
            'decisiontreeclassifier': DecisionTreeClassifier(),
            'nusvc': NuSVC(),
            'randomforestclassifier': RandomForestClassifier()
        }

        classifier = classifiers[parameters['classifier'].lower()]
        # print(classifier)
        classifier_params = get_classifier_params(parameters)
        classifier.set_params(**classifier_params)
        print(classifier)

        resampler = sampler_factory.create_sampler(
            parameters['resampler'], Constants.DOCUMENT_CLASSIFIER_SEED)

        return Pipeline([('resampler', resampler), ('classifier', classifier)])
Example #8
def load_pipeline():

    best_hyperparams_file_name = Constants.generate_file_name(
        'best_hyperparameters', 'json', Constants.CACHE_FOLDER, None, None,
        False)

    if not os.path.exists(best_hyperparams_file_name):
        print('Best hyperparameters file does not exist, running full cycle')
        full_cycle()

    with open(best_hyperparams_file_name, 'r') as json_file:
        file_contents = json_file.read()
        parameters = json.loads(file_contents)

        print(parameters)

        classifiers = {
            'logisticregression': LogisticRegression(),
            'svc': SVC(),
            'kneighborsclassifier': KNeighborsClassifier(),
            'decisiontreeclassifier': DecisionTreeClassifier(),
            'nusvc': NuSVC(),
            'randomforestclassifier': RandomForestClassifier()
        }

        classifier = classifiers[parameters['classifier'].lower()]
        # print(classifier)
        classifier_params = get_classifier_params(parameters)
        classifier.set_params(**classifier_params)
        print(classifier)

        resampler = sampler_factory.create_sampler(
            parameters['resampler'], Constants.DOCUMENT_CLASSIFIER_SEED)

        return Pipeline([('resampler', resampler), ('classifier', classifier)])
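
Note: because the pipeline contains a resampler step, Pipeline here is
presumably imblearn.pipeline.Pipeline rather than scikit-learn's, which does
not support resampling steps. A self-contained sketch of the same
resampler-plus-classifier pattern on synthetic data, under that assumption:

import numpy as np
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Synthetic, imbalanced toy data standing in for the document features.
x_matrix = np.random.rand(100, 5)
y_vector = np.array([0] * 90 + [1] * 10)

pipeline = Pipeline([
    ('resampler', RandomOverSampler(random_state=0)),
    ('classifier', LogisticRegression()),
])
pipeline.fit(x_matrix, y_vector)
print(pipeline.predict(x_matrix[:5]))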
Example #9
def full_cycle(metric):
    csv_file_name = Constants.generate_file_name(
        metric, 'csv', Constants.RESULTS_FOLDER, None,
        None, False)
    json_file_name = Constants.generate_file_name(
        metric, 'json', Constants.RESULTS_FOLDER, None,
        None, False)
    print(json_file_name)
    print(csv_file_name)

    properties = Constants.get_properties_copy()
    results = evaluate_topic_model(metric)
    print(results)
    results.update(properties)

    ETLUtils.write_row_to_csv(csv_file_name, results)
    ETLUtils.write_row_to_json(json_file_name, results)
Example #10
def full_cycle(metric):
    csv_file_name = Constants.generate_file_name(metric, 'csv',
                                                 Constants.RESULTS_FOLDER,
                                                 None, None, False)
    json_file_name = Constants.generate_file_name(metric, 'json',
                                                  Constants.RESULTS_FOLDER,
                                                  None, None, False)
    print(json_file_name)
    print(csv_file_name)

    properties = Constants.get_properties_copy()
    results = evaluate_topic_model(metric)
    print(results)
    results.update(properties)

    ETLUtils.write_row_to_csv(csv_file_name, results)
    ETLUtils.write_row_to_json(json_file_name, results)
Example #11
def get_topic_model_prefix(folder='', seed=None):

    prefix = 'topic_model'
    if seed is not None:
        prefix += '_seed-' + str(seed)

    return Constants.generate_file_name(
        prefix, '', folder, None, None, True, True)[:-1]
Example #12
def get_topic_model_prefix(folder='', seed=None):

    prefix = 'topic_model'
    if seed is not None:
        prefix += '_seed-' + str(seed)

    return Constants.generate_file_name(prefix, '', folder, None, None, True,
                                        True)[:-1]
Example #13
def load_topic_model(cycle_index, fold_index):
    file_path = \
        Constants.generate_file_name(
            'topic_model', 'pkl', Constants.CACHE_FOLDER,
            cycle_index, fold_index, True)
    print(file_path)
    with open(file_path, 'rb') as read_file:
        topic_model = pickle.load(read_file)
    return topic_model
Example #14
def full_cycle():

    plant_random_seeds()
    my_records = load_records()
    preprocess_records(my_records)
    x_matrix, y_vector = transform(my_records)
    count_specific_generic(my_records)

    # Error estimation
    best_classifier = None
    best_score = 0.0
    for classifier, params in PARAM_GRID_MAP.items():
        # print('Classifier: %s' % classifier)
        cv = StratifiedKFold(Constants.CROSS_VALIDATION_NUM_FOLDS)
        score = error_estimation(x_matrix, y_vector, params, cv,
                                 SCORE_METRIC).mean()
        print('%s score: %f' % (classifier, score))

        if score > best_score:
            best_score = score
            best_classifier = classifier

    # Model selection
    cv = StratifiedKFold(Constants.CROSS_VALIDATION_NUM_FOLDS)
    grid_search_cv = model_selection(x_matrix, y_vector,
                                     PARAM_GRID_MAP[best_classifier], cv,
                                     SCORE_METRIC)
    # best_model = grid_search_cv.best_estimator_.get_params()['classifier']
    # features_importance = best_model.coef_
    print('%s: %f' % (SCORE_METRIC, grid_search_cv.best_score_))
    print('best params', grid_search_cv.best_params_)

    # for key, value in grid_search_cv.best_params_.items():
    #     print(key, value)

    # print('best estimator', grid_search_cv.best_estimator_)
    # print('features importance', features_importance)

    # csv_file_name = Constants.generate_file_name(
    #     'classifier_results', 'csv', Constants.RESULTS_FOLDER, None,
    #     None, False)
    # json_file_name = Constants.generate_file_name(
    #     'classifier_results', 'json', Constants.RESULTS_FOLDER, None,
    #     None, False)

    # results = get_scores(final_grid_search_cv.cv_results_)
    # csv_file = '/Users/fpena/tmp/' + Constants.ITEM_TYPE + '_new_reviews_classifier_results.csv'
    # ETLUtils.save_csv_file(csv_file, results, results[0].keys())
    #
    # print(csv_file)

    best_hyperparams_file_name = Constants.generate_file_name(
        'best_hyperparameters', 'json', Constants.CACHE_FOLDER, None, None,
        False)
    save_parameters(best_hyperparams_file_name, grid_search_cv.best_params_)
Example #15
def cycle_eval_topic_model(metric, num_topics_list):

    csv_file_name = Constants.generate_file_name(metric, 'csv',
                                                 Constants.RESULTS_FOLDER,
                                                 None, None, False)

    for topic in num_topics_list:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic})
        results = run_eval_topic_model(metric)
        topic_model_analyzer.write_results_to_csv(csv_file_name, results)
Example #16
def cycle_eval_topic_model(metric, num_topics_list):

    csv_file_name = Constants.generate_file_name(
        metric, 'csv', Constants.RESULTS_FOLDER, None, None,
        False)

    for topic in num_topics_list:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic})
        results = run_eval_topic_model(metric)
        topic_model_analyzer.write_results_to_csv(csv_file_name, results)
Example #17
def run_eval_topic_model(metric):

    parse_directory_command = Constants.TOPIC_ENSEMBLE_FOLDER + \
        'eval-' + metric.replace('_', '-') + '.py'

    csv_file = Constants.generate_file_name(
        metric, 'csv', BASE_FOLDER, None, None, True, True)

    dataset_file_name = Constants.generate_file_name(
        'topic_model', '', BASE_FOLDER, None, None, True, True)[:-1] +\
        '/ranks*.pkl'
    topic_model_files = glob.glob(dataset_file_name)

    command = [
        PYTHON_COMMAND,
        parse_directory_command,
    ]
    command.extend(topic_model_files)
    command.extend([
        '-o',
        csv_file
    ])

    print(' '.join(command))

    unique_id = uuid.uuid4().hex
    log_file_name = Constants.GENERATED_FOLDER + Constants.ITEM_TYPE + '_' + \
        Constants.TOPIC_MODEL_TARGET_REVIEWS + '_' + metric + '_' +\
        unique_id + '.log'
    # Use a context manager so the log file handle is closed once the
    # subprocess finishes.
    with open(log_file_name, "w") as log_file:
        p = subprocess.Popen(
            command, stdout=log_file, cwd=Constants.TOPIC_ENSEMBLE_FOLDER)
        p.wait()

    results = read_csv_first_column_as_key(csv_file, metric)
    results[Constants.TOPIC_MODEL_NUM_TOPICS_FIELD] =\
        Constants.TOPIC_MODEL_NUM_TOPICS
    results[Constants.TOPIC_MODEL_TYPE_FIELD] = Constants.TOPIC_MODEL_TYPE

    return results
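
Note: read_csv_first_column_as_key is another helper that is not shown. From
the way its output is consumed (e.g. the 'term_stability_pairwise_mean'
column used in the plotting example further down), it appears to turn each
CSV row into a '<metric>_<first column>' key. A hypothetical sketch:

import csv


def read_csv_first_column_as_key(csv_file, metric):
    # Hypothetical reconstruction: prefix each row's first-column label
    # with the metric name and map it to the row's value.
    results = {}
    with open(csv_file, 'r') as input_file:
        for row in csv.reader(input_file):
            results[metric + '_' + row[0]] = row[1]
    return results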
Example #18
    def load_document_term_matrix():
        topic_model_corpus_folder = \
            Constants.CACHE_FOLDER + 'topic_models/corpus/'
        corpus_path = Constants.generate_file_name(
            'topic_ensemble_corpus', '', topic_model_corpus_folder,
            None, None, False)[:-1] + '.pkl'

        document_term_matrix, _, _, _ = load_corpus(corpus_path)

        print("Loaded document-term matrix of size %s" % str(document_term_matrix.shape))

        return document_term_matrix
Example #19
    def separate_recsys_topic_model_records(self):

        print('%s: separate_recsys_topic_model_records' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        num_records = len(self.records)
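        # Python 2 semantics: `/` on two ints is integer division, so these
        # slices split the records in half; Python 3 would need `//`.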
        topic_model_records = self.records[:num_records / 2]

        if not Constants.USE_CONTEXT:
            recsys_records = self.records[num_records / 2:]

            file_name = \
                Constants.generate_file_name(
                    'recsys_contextual_records', 'json', Constants.CACHE_FOLDER,
                    None, None, False, True)

            print('Records without context file: %s' % file_name)

            for record in recsys_records:
                record[Constants.CONTEXT_TOPICS_FIELD] = {'na': 1.0}

            ETLUtils.save_json_file(file_name, recsys_records)
            return

        topic_model_creator.train_topic_model(topic_model_records)

        if os.path.exists(Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE):
            print('Recsys topic records have already been generated')
            recsys_records = ETLUtils.load_json_file(
                Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)
        else:
            recsys_records = self.records[num_records / 2:]
            self.find_topic_distribution(recsys_records)
            ETLUtils.save_json_file(
                Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE, recsys_records)

        if os.path.exists(Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE):
            print('Recsys contextual records have already been generated')
            print(Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE)
            recsys_records = ETLUtils.load_json_file(
                Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE)
        else:
            self.update_context_topics(recsys_records)
            ETLUtils.save_json_file(
                Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE,
                recsys_records
            )

        context_transformer = ContextTransformer(recsys_records)
        context_transformer.load_data()
        context_transformer.transform_records()
        context_transformer.export_records()
Example #20
def run_eval_topic_model(metric):

    parse_directory_command = Constants.TOPIC_ENSEMBLE_FOLDER + \
        'eval-' + metric.replace('_', '-') + '.py'

    csv_file = Constants.generate_file_name(metric, 'csv', BASE_FOLDER, None,
                                            None, True, True)

    dataset_file_name = Constants.generate_file_name(
        'topic_model', '', BASE_FOLDER, None, None, True, True)[:-1] +\
        '/ranks*.pkl'
    topic_model_files = glob.glob(dataset_file_name)

    command = [
        PYTHON_COMMAND,
        parse_directory_command,
    ]
    command.extend(topic_model_files)
    command.extend(['-o', csv_file])

    print(' '.join(command))

    unique_id = uuid.uuid4().hex
    log_file_name = Constants.GENERATED_FOLDER + Constants.ITEM_TYPE + '_' + \
        Constants.TOPIC_MODEL_TARGET_REVIEWS + '_' + metric + '_' +\
        unique_id + '.log'
    # Use a context manager so the log file handle is closed once the
    # subprocess finishes.
    with open(log_file_name, "w") as log_file:
        p = subprocess.Popen(command,
                             stdout=log_file,
                             cwd=Constants.TOPIC_ENSEMBLE_FOLDER)
        p.wait()

    results = read_csv_first_column_as_key(csv_file, metric)
    results[Constants.TOPIC_MODEL_NUM_TOPICS_FIELD] =\
        Constants.TOPIC_MODEL_NUM_TOPICS
    results[Constants.TOPIC_MODEL_TYPE_FIELD] = Constants.TOPIC_MODEL_TYPE

    return results
Example #21
    def load_document_term_matrix():
        topic_model_corpus_folder = \
            Constants.CACHE_FOLDER + 'topic_models/corpus/'
        corpus_path = Constants.generate_file_name(
            'topic_ensemble_corpus', '', topic_model_corpus_folder, None, None,
            False)[:-1] + '.pkl'

        document_term_matrix, _, _, _ = load_corpus(corpus_path)

        print("Loaded document-term matrix of size %s" %
              str(document_term_matrix.shape))

        return document_term_matrix
Example #22
def create_topic_model_with_context_records():

    processed_records_file = Constants.generate_file_name(
        'classified_processed_reviews', 'json', Constants.CACHE_FOLDER, None,
        None, False, True)
    records = ETLUtils.load_json_file(processed_records_file)
    print('records length: %d' % len(records))

    context_records = ETLUtils.filter_records(
        records, 'context_type', ['context'])
    print('context records length: %d' % len(context_records))
    context_specific_records = ETLUtils.filter_records(
        context_records, 'predicted_class', ['specific'])
    print('context specific records length: %d' %
          len(context_specific_records))

    for i in range(len(context_specific_records)):
        # print('%d:\t%s' % (i, context_records[i]['text']))
        print('%d:\t%s' % (i, context_specific_records[i]['bow']))

    for i in range(1, len(context_records)+1):

        Constants.update_properties({Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: i})
        context_extractor = \
            topic_model_creator.create_topic_model(records, None, None)

        topic_data = []

        for topic in range(Constants.TOPIC_MODEL_NUM_TOPICS):
            result = {}
            result['topic_id'] = topic
            result.update(split_topic(context_extractor.print_topic_model(
                num_terms=Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)[topic]))
            result['ratio'] = context_extractor.topic_ratio_map[topic]
            result['weighted_frequency'] = \
                context_extractor.topic_weighted_frequency_map[topic]
            topic_data.append(result)

        file_name = Constants.generate_file_name(
            'manual_topic_model', 'xlsx', Constants.DATASET_FOLDER, None,
            None, True)
        generate_excel_file(topic_data, file_name)
Example #23
def plot_ats_score():
    # metric = 'term_difference'
    metric = 'term_stability_pairwise'

    csv_file_name = Constants.generate_file_name(metric, 'csv',
                                                 Constants.RESULTS_FOLDER,
                                                 None, None, False)
    json_file_name = Constants.generate_file_name(metric, 'json',
                                                  Constants.RESULTS_FOLDER,
                                                  None, None, False)

    data_frame = pandas.read_csv(csv_file_name)
    stability_column = 'term_stability_pairwise_mean'
    topic_model_column = 'Topic modeling algorithm'
    num_topics_field = Constants.TOPIC_MODEL_NUM_TOPICS_FIELD

    data_frame.rename(columns={'topic_model_type': topic_model_column},
                      inplace=True)
    data_frame[topic_model_column] = data_frame[topic_model_column].map({
        'lda':
        'LDA',
        'nmf':
        'NMF',
        'ensemble':
        'Ensemble'
    })

    g = seaborn.barplot(x=num_topics_field,
                        y=stability_column,
                        hue=topic_model_column,
                        data=data_frame)
    g.set(xlabel='Number of topics', ylabel='ATS')
    plt.ylim(0, 1.18)
    # g.ylim(10, 40)

    output_folder = Constants.RESULTS_FOLDER + 'pdf/'
    file_name = output_folder + Constants.ITEM_TYPE + '_ats.pdf'
    g.figure.savefig(file_name)
Example #24
def main():
    print('%s: Making predictions with LibFM' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    prediction_type_map = {
        'user_test': 'rating',
        'test_items': 'rating',
        'rel_plus_n': 'ranking'
    }
    prediction_type = prediction_type_map[Constants.RIVAL_EVALUATION_STRATEGY]
    use_cache = True

    libfm_ratings_fold_folder = Constants.generate_file_name(
        'recsys_formatted_context_records',
        '',
        Constants.CACHE_FOLDER + 'rival/',
        None,
        None,
        True,
        True,
        uses_carskit=False,
        normalize_topics=True,
        format_context=True)[:-1] + '/fold_%d/'

    for fold in range(Constants.CROSS_VALIDATION_NUM_FOLDS):

        ratings_fold_folder = libfm_ratings_fold_folder % fold
        # ratings_fold_folder = Constants.CACHE_FOLDER + 'rival/contextaa/fold_%d/' % fold
        train_file = ratings_fold_folder + 'libfm_train.libfm'
        predictions_file = ratings_fold_folder + 'libfm_predictions_' + \
            prediction_type + '.libfm'
        fm_num_factors = Constants.FM_NUM_FACTORS
        results_file = ratings_fold_folder + 'libfm_results_' + \
            prediction_type + '_fmfactors-' + str(fm_num_factors) + '.txt'

        if use_cache and os.path.exists(results_file):
            print("Fold %d file already exists ('%s') " % (fold, results_file))
            continue

        # predictions_file = ratings_fold_folder + 'libfm_test.libfm'
        # results_file = ratings_fold_folder + 'libfm_predictions.txt'
        log_file = ratings_fold_folder + 'libfm_log.txt'
        save_file = ratings_fold_folder + 'libfm_model.txt'

        if not os.path.exists(ratings_fold_folder):
            os.makedirs(ratings_fold_folder)

        run_libfm(train_file, predictions_file, results_file, log_file,
                  save_file)
Example #25
    def train_topic_model(self, cycle_index, fold_index):

        context_extractor = topic_model_creator.create_topic_model(
            self.train_records, cycle_index, fold_index)
        self.context_rich_topics = context_extractor.context_rich_topics

        topics_file_path = Constants.generate_file_name(
            'context_topics', 'json', Constants.CACHE_FOLDER, cycle_index,
            fold_index, True)
        ETLUtils.save_json_file(topics_file_path,
                                [dict(self.context_rich_topics)])
        print('Trained Context Extractor: %s' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        return context_extractor
Example #26
    def load_context_reviews(self, cycle_index, fold_index):

        train_records_file_path = Constants.generate_file_name(
            'context_train_records', 'json', Constants.CACHE_FOLDER,
            cycle_index, fold_index, True)
        important_records_file_path = Constants.generate_file_name(
            'context_important_records', 'json', Constants.CACHE_FOLDER,
            cycle_index, fold_index, True)

        self.train_records = ETLUtils.load_json_file(train_records_file_path)
        self.important_records = \
            ETLUtils.load_json_file(important_records_file_path)
        self.load_cache_context_topics(cycle_index, fold_index)

        self.context_topics_map = {}
        for record in self.important_records:
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
                record[Constants.CONTEXT_TOPICS_FIELD]

        # self.train_records = self.filter_context_words(self.train_records)
        # self.print_context_topics(self.important_records)

        self.important_records = None
        gc.collect()
Example #27
def analyze_results():
    json_file = Constants.generate_file_name(
        'carskit_results', 'json', OUTPUT_FOLDER, None, None, False)
    records = ETLUtils.load_json_file(json_file)

    data_frame = pandas.DataFrame(records)
    print(sorted(list(data_frame.columns.values)))
    cols = [
        'ck_rec10', 'ck_pre10', 'ck_algorithm', 'carskit_nominal_format',
        'topic_model_num_topics', 'topic_model_normalize']
    data_frame = data_frame[cols]
    data_frame = data_frame.sort_values(['ck_rec10'])
    print(data_frame)

    data_frame.to_csv(
        '/Users/fpena/tmp/' + Constants.ITEM_TYPE + '_carskit.csv')
Example #28
def get_topic_ensemble_ranks_file_paths():

    num_models = Constants.TOPIC_MODEL_STABILITY_ITERATIONS
    random_seeds = range(1, num_models + 1)

    suffix = 'ranks_ensemble_k%02d.pkl' % Constants.TOPIC_MODEL_NUM_TOPICS

    file_paths = []

    for seed in random_seeds:
        prefix = 'topic_model_seed-' + str(seed)
        topic_model_folder = Constants.generate_file_name(
            prefix, '', Constants.ENSEMBLE_FOLDER, None, None, True, True)[:-1]
        topic_model_file = topic_model_folder + '/' + suffix
        file_paths.append(topic_model_file)

    return file_paths
Example #29
def save_results(results):

    """
    Takes the results given by the run_carskit function, extends them with
    the Constants.get_properties() dictionary and saves them to a JSON file.

    :type results: list[dict]
    :param results: the list of result dictionaries to extend and save
    """
    properties = Constants.get_properties_copy()

    json_file = Constants.generate_file_name('carskit_results', 'json',
                                             OUTPUT_FOLDER, None, None, False)

    for result in results:
        result.update(properties)
        write_results_to_json(json_file, result)
Example #30
    def load_cache_context_topics(self, cycle_index, fold_index):

        print('load cache context topics: %s' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        topics_file_path = Constants.generate_file_name(
            'context_topics', 'json', Constants.CACHE_FOLDER, cycle_index,
            fold_index, True)

        self.context_rich_topics = sorted(
            ETLUtils.load_json_file(topics_file_path)[0].items(),
            key=operator.itemgetter(1),
            reverse=True)

        self.context_topics_map = {}
        for record in self.important_records:
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
                record[Constants.CONTEXT_TOPICS_FIELD]
Example #31
def save_results(results):

    """
    Takes the results given by the run_carskit function, extends them with
    the Constants.get_properties() dictionary and saves them to a JSON file.

    :type results: list[dict]
    :param results: the list of result dictionaries to extend and save
    """
    properties = Constants.get_properties_copy()

    json_file = Constants.generate_file_name(
        'carskit_results', 'json', OUTPUT_FOLDER, None, None, False)

    for result in results:
        result.update(properties)
        write_results_to_json(json_file, result)
Example #32
def create_topic_model(records, cycle_index, fold_index, check_exists=True):

    print('%s: Create topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    topic_model_file_path = \
        Constants.generate_file_name(
            'topic_model', 'pkl', Constants.CACHE_FOLDER,
            cycle_index, fold_index, True)

    print(topic_model_file_path)

    if check_exists and os.path.exists(topic_model_file_path):
        print('WARNING: Topic model already exists')
        return load_topic_model(cycle_index, fold_index)

    topic_model = train_context_extractor(records)

    with open(topic_model_file_path, 'wb') as write_file:
        pickle.dump(topic_model, write_file, pickle.HIGHEST_PROTOCOL)

    return topic_model
Example #33
    def export_records(self):
        print('%s: exporting transformed records' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        records_to_export = []
        desired_fields = [
            Constants.USER_INTEGER_ID_FIELD,
            Constants.ITEM_INTEGER_ID_FIELD,
            Constants.RATING_FIELD,
            Constants.CONTEXT_FIELD,
        ]

        for record in self.records:
            new_record = {field: record[field] for field in desired_fields}
            records_to_export.append(new_record)

        file_name = Constants.generate_file_name(
            'recsys_formatted_context_records', 'json', Constants.CACHE_FOLDER,
            None, None, True, True, uses_carskit=False, normalize_topics=True,
            format_context=True)
        ETLUtils.save_json_file(file_name, records_to_export)
Example #34
def train_topic_model(records):
    print('%s: train topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if Constants.TOPIC_MODEL_TYPE == 'lda':

        topic_model_file_path = \
            Constants.generate_file_name(
                'topic_model', 'pkl', Constants.CACHE_FOLDER, None, None, True)
        if os.path.exists(topic_model_file_path):
            print('WARNING: Topic model already exists')
            return

        corpus = \
            [record[Constants.CORPUS_FIELD] for record in records]
        dictionary = corpora.Dictionary.load(Constants.DICTIONARY_FILE)
        topic_model = ldamodel.LdaModel(
            corpus, id2word=dictionary,
            num_topics=Constants.TOPIC_MODEL_NUM_TOPICS,
            passes=Constants.TOPIC_MODEL_PASSES,
            iterations=Constants.TOPIC_MODEL_ITERATIONS)

        with open(topic_model_file_path, 'wb') as write_file:
            pickle.dump(topic_model, write_file, pickle.HIGHEST_PROTOCOL)

    elif Constants.TOPIC_MODEL_TYPE == 'ensemble':
        file_path = Constants.ENSEMBLED_RESULTS_FOLDER + \
                    "factors_final_k%02d.pkl" % Constants.TOPIC_MODEL_NUM_TOPICS

        if os.path.exists(file_path):
            print('Ensemble topic model already exists')
            return

        export_to_text(records)
        topic_ensemble_caller.run_local_parse_directory()
        topic_ensemble_caller.run_generate_kfold()
        topic_ensemble_caller.run_combine_nmf()

    else:
        raise ValueError('Unrecognized topic modeling algorithm: \'%s\'' %
                         Constants.TOPIC_MODEL_TYPE)
Example #35
    def load_trained_data(self):

        file_path = Constants.ENSEMBLED_RESULTS_FOLDER + \
            "factors_final_k%02d.pkl" % self.num_topics
        W, H, doc_ids, terms = load_nmf_factors(file_path)
        self.topic_term_matrix = H
        self.document_topic_matrix = W
        self.terms = terms

        topic_model_corpus_folder = \
            Constants.CACHE_FOLDER + 'topic_models/corpus/'
        tfidf_file_path = Constants.generate_file_name(
            'topic_ensemble_corpus', '', topic_model_corpus_folder, None, None,
            False)[:-1] + '_tfidf.pkl'

        self.tfidf_vectorizer = load_tfidf(tfidf_file_path)

        # print('tfidf vectorizer', self.tfidf_vectorizer)

        print "Loaded factor W of size %s and factor H of size %s" % (str(
            self.document_topic_matrix.shape), str(
                self.topic_term_matrix.shape))
Example #36
    def load_trained_data(self):

        file_path = Constants.ENSEMBLED_RESULTS_FOLDER + \
            "factors_final_k%02d.pkl" % self.num_topics
        W, H, doc_ids, terms = load_nmf_factors(file_path)
        self.topic_term_matrix = H
        self.document_topic_matrix = W
        self.terms = terms

        topic_model_corpus_folder = \
            Constants.CACHE_FOLDER + 'topic_models/corpus/'
        tfidf_file_path = Constants.generate_file_name(
            'topic_ensemble_corpus', '', topic_model_corpus_folder,
            None, None, False)[:-1] + '_tfidf.pkl'

        self.tfidf_vectorizer = load_tfidf(tfidf_file_path)

        # print('tfidf vectorizer', self.tfidf_vectorizer)

        print "Loaded factor W of size %s and factor H of size %s" % (
            str(self.document_topic_matrix.shape),
            str(self.topic_term_matrix.shape)
        )
Example #37
    def find_reviews_topics(self, context_extractor, cycle_index, fold_index):
        print('find topics: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        train_records_file_path = Constants.generate_file_name(
            'context_train_records', 'json', Constants.CACHE_FOLDER,
            cycle_index, fold_index, Constants.USE_CONTEXT)

        if os.path.exists(train_records_file_path):
            self.train_records = \
                ETLUtils.load_json_file(train_records_file_path)
        else:
            context_extractor.find_contextual_topics(self.train_records)
            ETLUtils.save_json_file(train_records_file_path,
                                    self.train_records)
        context_extractor.find_contextual_topics(
            self.important_records, Constants.TEXT_SAMPLING_PROPORTION)

        self.context_topics_map = {}
        for record in self.important_records:
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
                record[Constants.CONTEXT_TOPICS_FIELD]

        self.important_records = None
        gc.collect()
Example #38
def generate_excel_file(records, file_name=None):
    my_context_words = []
    if 'hotel' in Constants.ITEM_TYPE:
        for values in grouped_hotel_context_words.values():
            my_context_words.extend(values)
    elif 'restaurant' in Constants.ITEM_TYPE:
        for values in grouped_restaurant_context_words.values():
            my_context_words.extend(values)

    if file_name is None:
        file_name = Constants.generate_file_name(
            'topic_model', 'xlsx', Constants.RESULTS_FOLDER, None, None, True)
    workbook = xlsxwriter.Workbook(file_name)
    worksheet7 = workbook.add_worksheet()

    yellow_format = workbook.add_format()
    yellow_format.set_pattern(1)  # This is optional when using a solid fill.
    yellow_format.set_bg_color('yellow')

    cyan_format = workbook.add_format()
    cyan_format.set_pattern(1)  # This is optional when using a solid fill.
    cyan_format.set_bg_color('cyan')

    green_format = workbook.add_format()
    green_format.set_pattern(1)  # This is optional when using a solid fill.
    green_format.set_bg_color('green')

    headers = [
        'topic_id',
        'ratio',
        'probability_score',
        'weighted_frequency'
    ]
    num_headers = len(headers)
    for i in range(Constants.TOPIC_MODEL_STABILITY_NUM_TERMS):
        headers.append('word' + str(i))

    data = [[record[column] for column in headers] for record in records]
    headers = [{'header': header} for header in headers]
    num_topics = Constants.TOPIC_MODEL_NUM_TOPICS

    for row_index, row_data in enumerate(data):
        for column_index, cell_value in enumerate(row_data[:num_headers]):
            worksheet7.write(row_index + 2, column_index + 1, cell_value)

    # Add words
    for row_index, row_data in enumerate(data):
        for column_index, cell_value in enumerate(row_data[num_headers:]):
            word = cell_value.split('*')[1]
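            # cell_value is a Python 2 byte string such as '0.015*word';
            # decode('utf-8') below converts it to unicode for xlsxwriter.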
            if word in my_context_words:
                worksheet7.write(
                    row_index + 2, column_index + num_headers + 1,
                    cell_value.decode('utf-8'), cyan_format
                )
            else:
                worksheet7.write(
                    row_index + 2, column_index + num_headers + 1,
                    cell_value.decode('utf-8'))

    worksheet7.conditional_format(2, 3, num_topics + 1, 3, {
        'type': 'cell',
        'criteria': '>=',
        'value': 0.1,
        'format': yellow_format})

    worksheet7.add_table(
        1, 1, num_topics + 1,
        num_headers + Constants.TOPIC_MODEL_STABILITY_NUM_TERMS,
        {'columns': headers})

    # Set widths
    worksheet7.set_column(1, 1, 7)
    worksheet7.set_column(3, 3, 7)
    worksheet7.set_column(4, 4, 8)
    worksheet7.set_column(5, 15, 14)
    workbook.close()
Example #39
def get_dataset_file_name():
    return Constants.generate_file_name('topic_ensemble_corpus', '',
                                        CORPUS_FOLDER, None, None, False)[:-1]
Example #40
def main():

    csv_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'csv', Constants.RESULTS_FOLDER, None, None,
        False)
    json_file_name = Constants.generate_file_name(
        'topic_model_analysis', 'json', Constants.RESULTS_FOLDER, None, None,
        False)
    print(csv_file_name)

    # export_lda_topics(0, 0)
    # epsilon_list = [0.001, 0.005, 0.01, 0.03, 0.05, 0.07, 0.1, 0.35, 0.5]
    epsilon_list = [0.05]
    alpha_list = [0.0]
    # num_topics_list =\
    #     [5, 10, 35, 50, 75, 100, 150, 200, 300, 400, 500, 600, 700, 800]
    # num_topics_list = [10, 20, 30, 50, 75, 100, 150, 300]
    # num_topics_list = [150, 300]
    num_topics_list = range(1, 51)
    bow_type_list = ['NN']
    # document_level_list = ['review', 'sentence', 1]
    document_level_list = [1]
    # topic_weighting_methods = ['binary', 'probability']
    topic_weighting_methods = ['probability']
    # review_type_list = ['specific', 'generic', 'all_reviews']
    review_type_list = ['specific']
    # lda_passes_list = [1, 10, 20, 50, 75, 100, 200, 500]
    # lda_passes_list = [1, 10]
    lda_passes_list = [100]
    # lda_iterations_list = [50, 100, 200, 400, 800, 2000]
    # lda_iterations_list = [50, 100, 200, 500]
    lda_iterations_list = [200]
    # topic_model_type_list = ['lda', 'nmf']
    topic_model_type_list = ['nmf']
    num_cycles = len(epsilon_list) * len(alpha_list) * len(num_topics_list) *\
        len(document_level_list) * len(topic_weighting_methods) *\
        len(review_type_list) * len(lda_passes_list) *\
        len(lda_iterations_list) * len(topic_model_type_list) *\
        len(bow_type_list)
    cycle_index = 1
    for epsilon, alpha, num_topics, document_level, topic_weighting_method,\
        review_type, lda_passes, lda_iterations, topic_model_type,\
        bow_type in itertools.product(
            epsilon_list, alpha_list, num_topics_list, document_level_list,
            topic_weighting_methods, review_type_list, lda_passes_list,
            lda_iterations_list, topic_model_type_list, bow_type_list):
        print('\ncycle_index: %d/%d' % (cycle_index, num_cycles))
        new_dict = {
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
            Constants.DOCUMENT_LEVEL_FIELD: document_level,
            Constants.TOPIC_WEIGHTING_METHOD_FIELD: topic_weighting_method,
            Constants.CONTEXT_EXTRACTOR_ALPHA_FIELD: alpha,
            Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD: epsilon,
            Constants.TOPIC_MODEL_REVIEW_TYPE_FIELD: review_type,
            Constants.TOPIC_MODEL_PASSES_FIELD: lda_passes,
            Constants.TOPIC_MODEL_ITERATIONS_FIELD: lda_iterations,
            Constants.TOPIC_MODEL_TYPE_FIELD: topic_model_type,
            Constants.BOW_TYPE_FIELD: bow_type
        }

        print(new_dict)

        Constants.update_properties(new_dict)
        results = Constants.get_properties_copy()
        results.update(analyze_topics(include_stability=True))

        write_results_to_csv(csv_file_name, results)
        write_results_to_json(json_file_name, results)

        cycle_index += 1
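
Note: the cycle count and nested product above generalize to any number of
parameter lists. The same sweep pattern can be written compactly with a
single dict of lists; the field names below are illustrative only:

import itertools

param_lists = {
    'topic_model_num_topics': range(1, 51),
    'topic_model_passes': [100],
    'topic_model_type': ['nmf'],
}

num_cycles = 1
for values in param_lists.values():
    num_cycles *= len(values)

for cycle_index, combination in enumerate(
        itertools.product(*param_lists.values()), start=1):
    new_dict = dict(zip(param_lists.keys(), combination))
    print('cycle_index: %d/%d %s' % (cycle_index, num_cycles, new_dict))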
Example #41
def main():

    csv_file_name = Constants.generate_file_name('topic_model_analysis', 'csv',
                                                 Constants.RESULTS_FOLDER,
                                                 None, None, False)
    json_file_name = Constants.generate_file_name('topic_model_analysis',
                                                  'json',
                                                  Constants.RESULTS_FOLDER,
                                                  None, None, False)
    print(csv_file_name)

    # export_lda_topics(0, 0)
    # epsilon_list = [0.001, 0.005, 0.01, 0.03, 0.05, 0.07, 0.1, 0.35, 0.5]
    epsilon_list = [0.05]
    alpha_list = [0.0]
    # num_topics_list =\
    #     [5, 10, 35, 50, 75, 100, 150, 200, 300, 400, 500, 600, 700, 800]
    # num_topics_list = [10, 20, 30, 50, 75, 100, 150, 300]
    # num_topics_list = [150, 300]
    num_topics_list = range(1, 51)
    bow_type_list = ['NN']
    # document_level_list = ['review', 'sentence', 1]
    document_level_list = [1]
    # topic_weighting_methods = ['binary', 'probability']
    topic_weighting_methods = ['probability']
    # review_type_list = ['specific', 'generic', 'all_reviews']
    review_type_list = ['specific']
    # lda_passes_list = [1, 10, 20, 50, 75, 100, 200, 500]
    # lda_passes_list = [1, 10]
    lda_passes_list = [100]
    # lda_iterations_list = [50, 100, 200, 400, 800, 2000]
    # lda_iterations_list = [50, 100, 200, 500]
    lda_iterations_list = [200]
    # topic_model_type_list = ['lda', 'nmf']
    topic_model_type_list = ['nmf']
    num_cycles = len(epsilon_list) * len(alpha_list) * len(num_topics_list) *\
        len(document_level_list) * len(topic_weighting_methods) *\
        len(review_type_list) * len(lda_passes_list) *\
        len(lda_iterations_list) * len(topic_model_type_list) *\
        len(bow_type_list)
    cycle_index = 1
    for epsilon, alpha, num_topics, document_level, topic_weighting_method,\
        review_type, lda_passes, lda_iterations, topic_model_type,\
        bow_type in itertools.product(
            epsilon_list, alpha_list, num_topics_list, document_level_list,
            topic_weighting_methods, review_type_list, lda_passes_list,
            lda_iterations_list, topic_model_type_list, bow_type_list):
        print('\ncycle_index: %d/%d' % (cycle_index, num_cycles))
        new_dict = {
            Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics,
            Constants.DOCUMENT_LEVEL_FIELD: document_level,
            Constants.TOPIC_WEIGHTING_METHOD_FIELD: topic_weighting_method,
            Constants.CONTEXT_EXTRACTOR_ALPHA_FIELD: alpha,
            Constants.CONTEXT_EXTRACTOR_EPSILON_FIELD: epsilon,
            Constants.TOPIC_MODEL_REVIEW_TYPE_FIELD: review_type,
            Constants.TOPIC_MODEL_PASSES_FIELD: lda_passes,
            Constants.TOPIC_MODEL_ITERATIONS_FIELD: lda_iterations,
            Constants.TOPIC_MODEL_TYPE_FIELD: topic_model_type,
            Constants.BOW_TYPE_FIELD: bow_type
        }

        print(new_dict)

        Constants.update_properties(new_dict)
        results = Constants.get_properties_copy()
        results.update(analyze_topics(include_stability=True))

        write_results_to_csv(csv_file_name, results)
        write_results_to_json(json_file_name, results)

        cycle_index += 1
Example #42
import jprops
import pandas

from etl import ETLUtils
from tripadvisor.fourcity import extractor
from utils.constants import Constants

JAVA_COMMAND = 'java'
CARSKIT_JAR = 'CARSKit-v0.3.0.jar'
CARSKIT_ORIGINAL_CONF_FILE = Constants.CARSKIT_FOLDER + 'setting.conf'
# CARSKIT_RATINGS_FOLD_FOLDER = Constants.generate_file_name(
#         'recsys_contextual_records', '', Constants.CACHE_FOLDER + 'rival/',
#         None, None, True, True, normalize_topics=True)[:-1] + '/fold_%d/'
CARSKIT_RATINGS_FOLD_FOLDER = Constants.generate_file_name(
        'recsys_formatted_context_records', '', Constants.CACHE_FOLDER + 'rival/',
        None, None, True, True, uses_carskit=False, normalize_topics=True,
        format_context=True)[:-1] + '/fold_%d/'
CARSKIT_MODIFIED_CONF_FILE = CARSKIT_RATINGS_FOLD_FOLDER + '%s.conf'
OUTPUT_FOLDER = Constants.DATASET_FOLDER + 'carskit_results/'


def run_carskit(fold):

    jar_file = Constants.CARSKIT_FOLDER + 'jar/' + CARSKIT_JAR

    command = [
        JAVA_COMMAND,
        '-jar',
        jar_file,
        '-c',
        CARSKIT_MODIFIED_CONF_FILE % (fold, Constants.CARSKIT_RECOMMENDERS),
Example #43
def generate_excel_file(records, file_name=None):
    my_context_words = []
    if 'hotel' in Constants.ITEM_TYPE:
        for values in grouped_hotel_context_words.values():
            my_context_words.extend(values)
    elif 'restaurant' in Constants.ITEM_TYPE:
        for values in grouped_restaurant_context_words.values():
            my_context_words.extend(values)

    if file_name is None:
        file_name = Constants.generate_file_name('topic_model', 'xlsx',
                                                 Constants.RESULTS_FOLDER,
                                                 None, None, True)
    workbook = xlsxwriter.Workbook(file_name)
    worksheet7 = workbook.add_worksheet()

    yellow_format = workbook.add_format()
    yellow_format.set_pattern(1)  # This is optional when using a solid fill.
    yellow_format.set_bg_color('yellow')

    cyan_format = workbook.add_format()
    cyan_format.set_pattern(1)  # This is optional when using a solid fill.
    cyan_format.set_bg_color('cyan')

    green_format = workbook.add_format()
    green_format.set_pattern(1)  # This is optional when using a solid fill.
    green_format.set_bg_color('green')

    headers = ['topic_id', 'ratio', 'probability_score', 'weighted_frequency']
    num_headers = len(headers)
    for i in range(Constants.TOPIC_MODEL_STABILITY_NUM_TERMS):
        headers.append('word' + str(i))

    data = [[record[column] for column in headers] for record in records]
    headers = [{'header': header} for header in headers]
    num_topics = Constants.TOPIC_MODEL_NUM_TOPICS

    for row_index, row_data in enumerate(data):
        for column_index, cell_value in enumerate(row_data[:num_headers]):
            worksheet7.write(row_index + 2, column_index + 1, cell_value)

    # Add words
    for row_index, row_data in enumerate(data):
        for column_index, cell_value in enumerate(row_data[num_headers:]):
            word = cell_value.split('*')[1]
            if word in my_context_words:
                worksheet7.write(row_index + 2, column_index + num_headers + 1,
                                 cell_value.decode('utf-8'), cyan_format)
            else:
                worksheet7.write(row_index + 2, column_index + num_headers + 1,
                                 cell_value.decode('utf-8'))

    worksheet7.conditional_format(2, 3, num_topics + 1, 3, {
        'type': 'cell',
        'criteria': '>=',
        'value': 0.1,
        'format': yellow_format
    })

    worksheet7.add_table(
        1, 1, num_topics + 1,
        num_headers + Constants.TOPIC_MODEL_STABILITY_NUM_TERMS,
        {'columns': headers})

    # Set widths
    worksheet7.set_column(1, 1, 7)
    worksheet7.set_column(3, 3, 7)
    worksheet7.set_column(4, 4, 8)
    worksheet7.set_column(5, 15, 14)
    workbook.close()
Example #44
def get_dataset_file_name():
    return Constants.generate_file_name(
        'topic_ensemble_corpus', '', CORPUS_FOLDER, None, None, False)[:-1]
Example #45
def full_cycle():

    plant_random_seeds()
    my_records = load_records()
    preprocess_records(my_records)
    x_matrix, y_vector = transform(my_records)
    count_specific_generic(my_records)

    # Error estimation
    error_estimation_results = []
    best_classifier = None
    best_score = 0.0
    for classifier, params in PARAM_GRID_MAP.items():
        # print('Classifier: %s' % classifier)
        cv = StratifiedKFold(Constants.CROSS_VALIDATION_NUM_FOLDS)
        score = error_estimation(
            x_matrix, y_vector, params, cv, SCORE_METRIC).mean()
        error_estimation_results.append(
            {
                'classifier': classifier,
                'accuracy': score,
                Constants.BUSINESS_TYPE_FIELD: Constants.ITEM_TYPE
            }
        )
        print('%s score: %f' % (classifier, score))

        if score > best_score:
            best_score = score
            best_classifier = classifier

    # Model selection
    cv = StratifiedKFold(Constants.CROSS_VALIDATION_NUM_FOLDS)
    grid_search_cv = model_selection(
        x_matrix, y_vector, PARAM_GRID_MAP[best_classifier], cv, SCORE_METRIC)
    # best_model = grid_search_cv.best_estimator_.get_params()['classifier']
    # features_importance = best_model.coef_
    print('%s: %f' % (SCORE_METRIC, grid_search_cv.best_score_))
    print('best params', grid_search_cv.best_params_)

    # for key, value in grid_search_cv.best_params_.items():
    #     print(key, value)

    # print('best estimator', grid_search_cv.best_estimator_)
    # print('features importance', features_importance)

    # csv_file_name = Constants.generate_file_name(
    #     'classifier_results', 'csv', Constants.RESULTS_FOLDER, None,
    #     None, False)
    # json_file_name = Constants.generate_file_name(
    #     'classifier_results', 'json', Constants.RESULTS_FOLDER, None,
    #     None, False)
    csv_file_name2 = Constants.RESULTS_FOLDER + 'classifier_results.csv'
    json_file_name2 = Constants.RESULTS_FOLDER + 'classifier_results.json'

    # results = get_scores(final_grid_search_cv.cv_results_)
    # csv_file = '/Users/fpena/tmp/' + Constants.ITEM_TYPE + '_new_reviews_classifier_results.csv'
    # ETLUtils.save_csv_file(
    #     csv_file_name, error_estimation_results,
    #     error_estimation_results[0].keys())
    # ETLUtils.save_json_file(json_file_name, error_estimation_results)

    for result in error_estimation_results:
        ETLUtils.write_row_to_csv(csv_file_name2, result)
        ETLUtils.write_row_to_json(json_file_name2, result)
    #
    # print(csv_file)

    best_hyperparams_file_name = Constants.generate_file_name(
        'best_hyperparameters', 'json', Constants.CACHE_FOLDER, None,
        None, False)
    save_parameters(best_hyperparams_file_name, grid_search_cv.best_params_)