Beispiel #1
0
    def prepare(self):
        """Export the train/test records to CSV and convert them to LibFM.

        The records in ``self.train_records`` and ``self.records_to_predict``
        are projected onto ``self.headers`` and saved to
        ``self.csv_train_file`` and ``self.csv_test_file``. Both CSV files
        are then passed to ``libfm_converter.csv_to_libfm`` twice, producing
        one set of files per suffix.
        """
        # Year/month/day order. The previous "%Y/%d/%m" swapped day and
        # month, which made the log timestamps misleading.
        timestamp_format = "%Y/%m/%d-%H:%M:%S"
        print('prepare: %s' % time.strftime(timestamp_format))

        contextual_train_set =\
            ETLUtils.select_fields(self.headers, self.train_records)
        contextual_test_set =\
            ETLUtils.select_fields(self.headers, self.records_to_predict)

        ETLUtils.save_csv_file(
            self.csv_train_file, contextual_train_set, self.headers)
        ETLUtils.save_csv_file(
            self.csv_test_file, contextual_test_set, self.headers)

        # NOTE(review): only CSV files are written here; the message text is
        # kept unchanged in case log consumers match on it.
        print('Exported CSV and JSON files: %s'
              % time.strftime(timestamp_format))

        csv_files = [
            self.csv_train_file,
            self.csv_test_file
        ]

        num_cols = len(self.headers)
        print('num_cols', num_cols)

        # The fourth positional argument is presumably the list of columns
        # the converter drops (TODO confirm against libfm_converter): the
        # first call passes every column from index 3 onwards, the second
        # passes none.
        libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], range(3, num_cols), ',', has_header=True,
            suffix='.no_context.libfm')
        libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], [], ',', has_header=True,
            suffix='.context.libfm')

        print('Exported LibFM files: %s' % time.strftime(timestamp_format))
Beispiel #2
0
    def run(self, dataset, output_folder, train_records, test_records, train_reviews=None, test_reviews=None):
        """Prepare the records, export them to CSV and convert them to LibFM.

        Args:
            dataset: name fragment inserted into the generated file names.
            output_folder: prefix for the generated files. It is joined by
                plain string concatenation, so it must already end with a
                path separator.
            train_records: forwarded to ``self.full_cycle``.
            test_records: forwarded to ``self.full_cycle``.
            train_reviews: forwarded to ``self.full_cycle`` (default None).
            test_reviews: forwarded to ``self.full_cycle`` (default None).
        """
        # Year/month/day order. The previous "%Y/%d/%m" swapped day and
        # month, which made the log timestamps misleading.
        timestamp_format = "%Y/%m/%d-%H:%M:%S"

        contextual_train_set, contextual_test_set = self.full_cycle(
            train_records, test_records, train_reviews, test_reviews
        )

        print("Prepared data: %s" % time.strftime(timestamp_format))

        file_prefix = output_folder + "yelp_" + dataset + "_context_shuffled_"
        csv_train_file = file_prefix + "train5.csv"
        csv_test_file = file_prefix + "test5.csv"

        ETLUtils.save_csv_file(csv_train_file, contextual_train_set, self.headers)
        ETLUtils.save_csv_file(csv_test_file, contextual_test_set, self.headers)

        # NOTE(review): only CSV files are written here; the message text is
        # kept unchanged in case log consumers match on it.
        print("Exported CSV and JSON files: %s" % time.strftime(timestamp_format))

        csv_files = [csv_train_file, csv_test_file]

        num_cols = len(self.headers)
        print("num_cols", num_cols)

        # The fourth positional argument is presumably the list of columns
        # the converter drops (TODO confirm against libfm_converter): the
        # first call passes every column from index 3 onwards, the second
        # passes none.
        libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], range(3, num_cols), ",", has_header=True, suffix=".no_context.libfm"
        )
        libfm_converter.csv_to_libfm(csv_files, 0, [1, 2], [], ",", has_header=True, suffix=".context.libfm")

        print("Exported LibFM files: %s" % time.strftime(timestamp_format))
    def test_csv_to_libfm(self):
        """Run ``csv_to_libfm`` on the sample CSV and compare with the fixture.

        The generated file is removed before the conversion (in case an
        earlier run left it behind) and again after the comparison.
        """
        source_path = folder + 'yelp.csv_train_0'
        reference_path = source_path + ".libfm"
        generated_path = reference_path + "_test"

        def discard_generated():
            # Delete the generated file only if it actually exists.
            if os.path.isfile(generated_path):
                os.remove(generated_path)

        discard_generated()

        csv_to_libfm(
            source_path, generated_path, 2, delete_columns=[],
            delimiter=',', has_header=True)

        files_match = filecmp.cmp(generated_path, reference_path)
        self.assertTrue(files_match)

        discard_generated()
    def prepare(self):
        """Export the train/test records to CSV and convert them to LibFM.

        Rebuilds ``self.headers`` from the number of sense groups. When
        ``Constants.USE_CONTEXT`` is ``True``, each record is updated with
        the entries of its context-words field, and the training records
        are filtered by predicted class if ``Constants.FM_REVIEW_TYPE`` is
        set. Both record sets are then reduced to the header fields, saved
        as CSV and passed to the LibFM converter.
        """
        stamp_format = "%Y/%m/%d-%H:%M:%S"

        def merge_context_words(records):
            # Copy the entries of the context-words mapping onto the record
            # itself so they can be selected as ordinary fields.
            for entry in records:
                entry.update(entry[Constants.CONTEXT_WORDS_FIELD])

        print('prepare: %s' % time.strftime(stamp_format))

        self.headers = build_headers(len(self.sense_groups))

        if Constants.USE_CONTEXT is True:
            merge_context_words(self.train_records)
            merge_context_words(self.records_to_predict)

            # Only the training records are restricted to the review type.
            if Constants.FM_REVIEW_TYPE:
                accepted_classes = [Constants.FM_REVIEW_TYPE]
                self.train_records = ETLUtils.filter_records(
                    self.train_records,
                    Constants.PREDICTED_CLASS_FIELD,
                    accepted_classes)

        ETLUtils.keep_fields(self.headers, self.train_records)
        ETLUtils.keep_fields(self.headers, self.records_to_predict)

        train_csv = self.csv_train_file
        ETLUtils.save_csv_file(train_csv, self.train_records, self.headers)
        test_csv = self.csv_test_file
        ETLUtils.save_csv_file(test_csv, self.records_to_predict, self.headers)

        print('Exported CSV and JSON files: %s'
              % time.strftime(stamp_format))

        exported_files = [train_csv, test_csv]

        print('num_cols', len(self.headers))

        libfm_converter.csv_to_libfm(
            exported_files, 0, [1, 2], [], ',',
            has_header=True, suffix='.libfm')

        print('Exported LibFM files: %s' % time.strftime(stamp_format))
Beispiel #5
0
    def test_csv_to_libfm(self):
        """Check that ``csv_to_libfm`` reproduces the stored ``.libfm`` file."""
        csv_path = folder + 'yelp.csv_train_0'
        expected_path = csv_path + ".libfm"
        actual_path = expected_path + "_test"

        # Start from a clean slate in case an earlier run left output behind.
        if os.path.isfile(actual_path):
            os.remove(actual_path)

        conversion_options = {
            'delete_columns': [],
            'delimiter': ',',
            'has_header': True,
        }
        csv_to_libfm(csv_path, actual_path, 2, **conversion_options)

        identical = filecmp.cmp(actual_path, expected_path)
        self.assertTrue(identical)

        # Remove the generated file again once the comparison has passed.
        if os.path.isfile(actual_path):
            os.remove(actual_path)
Beispiel #6
0
def main_converter():
    """Export the rating fields of the JSON record files as CSV and LibFM.

    Loads the records from ``TRAIN_RECORDS_FILE`` and
    ``RECORDS_TO_PREDICT_FILE``, keeps only the ``stars``, ``user_id`` and
    ``business_id`` fields, writes them to CSV files under
    ``GENERATED_FOLDER`` and passes both CSV files to ``csv_to_libfm``.
    """
    train_csv = (
        GENERATED_FOLDER + 'yelp_training_set_review_' + DATASET
        + 's_shuffled_train.csv')
    predict_csv = GENERATED_FOLDER + 'records_to_predict_' + DATASET + '.csv'

    columns = ['stars', 'user_id', 'business_id']

    # Load both record sets before projecting either of them.
    train_rows = ETLUtils.load_json_file(TRAIN_RECORDS_FILE)
    predict_rows = ETLUtils.load_json_file(RECORDS_TO_PREDICT_FILE)
    train_rows = ETLUtils.select_fields(columns, train_rows)
    predict_rows = ETLUtils.select_fields(columns, predict_rows)

    ETLUtils.save_csv_file(train_csv, train_rows, columns)
    ETLUtils.save_csv_file(predict_csv, predict_rows, columns)

    csv_to_libfm([train_csv, predict_csv], 0, [1, 2], [], ',', has_header=True)
Beispiel #7
0
    def prepare_records_for_libfm(self):
        """Write the train/test records to CSV and convert both to LibFM.

        Rebuilds ``self.headers`` from ``self.context_rich_topics``. When
        ``Constants.USE_CONTEXT`` is ``True`` and the review type is
        ``SPECIFIC`` or ``GENERIC``, the training records are filtered by
        predicted class. Each CSV row holds the ``basic_headers`` fields
        followed, when context is enabled, by one topic value per entry of
        ``self.context_rich_topics``. For training records the topic values
        come from the record's own context-topics field; for the records to
        predict they come from ``self.context_topics_map``, keyed by review
        id.

        Side effects: ``self.train_records``, ``self.context_topics_map``
        and ``self.context_rich_topics`` are set to ``None`` once they are
        no longer needed, and the garbage collector is run.
        """
        timestamp_format = "%Y/%m/%d-%H:%M:%S"
        print('prepare_records_for_libfm: %s' %
              time.strftime(timestamp_format))

        self.headers = build_headers(self.context_rich_topics)

        use_context = Constants.USE_CONTEXT is True

        if use_context:

            if Constants.REVIEW_TYPE == Constants.SPECIFIC or \
                    Constants.REVIEW_TYPE == Constants.GENERIC:
                self.train_records = ETLUtils.filter_records(
                    self.train_records, Constants.PREDICTED_CLASS_FIELD,
                    [Constants.REVIEW_TYPE])

        # The topic keys are identical for every record, so build them once
        # instead of once per topic of every record.
        topic_keys = []
        if use_context:
            topic_keys = [
                'topic' + str(topic[0]) for topic in self.context_rich_topics]

        # NOTE(review): on Python 3 the csv module documentation recommends
        # opening the file with newline=''; the open() calls are left
        # unchanged because the target Python version is not visible here.
        with open(self.csv_train_file, 'w') as out_file:
            writer = csv.writer(out_file)

            # Write header
            writer.writerow(self.headers)

            for record in self.train_records:
                row = [record[header] for header in basic_headers]

                # Look the topics up once per record, and only when there
                # is at least one topic column to fill.
                if topic_keys:
                    context_topics = record[Constants.CONTEXT_TOPICS_FIELD]
                    row.extend(context_topics[key] for key in topic_keys)

                writer.writerow(row)

        # Release the training records before the test set is processed.
        self.train_records = None
        gc.collect()

        with open(self.csv_test_file, 'w') as out_file:
            writer = csv.writer(out_file)

            # Write header
            writer.writerow(self.headers)

            for record in self.records_to_predict:
                row = [record[header] for header in basic_headers]

                if topic_keys:
                    important_record = record[Constants.REVIEW_ID_FIELD]
                    context_topics = self.context_topics_map[important_record]
                    row.extend(context_topics[key] for key in topic_keys)

                writer.writerow(row)

        # self.records_to_predict is deliberately kept; only the topic
        # structures are released here.
        self.context_topics_map = None
        self.context_rich_topics = None
        gc.collect()

        print('Exported CSV and JSON files: %s' %
              time.strftime(timestamp_format))

        csv_files = [self.csv_train_file, self.csv_test_file]

        print('num_cols', len(self.headers))

        libfm_converter.csv_to_libfm(csv_files,
                                     0, [1, 2], [],
                                     ',',
                                     has_header=True,
                                     suffix='.libfm')

        print('Exported LibFM files: %s' % time.strftime(timestamp_format))
    def prepare_records_for_libfm(self):
        """Write the train/test records to CSV and convert both to LibFM.

        Rebuilds ``self.headers`` from ``self.context_rich_topics``. When
        the review type is ``SPECIFIC`` or ``GENERIC``, the training records
        are filtered by predicted class (regardless of
        ``Constants.USE_CONTEXT``). Each CSV row holds the ``basic_headers``
        fields followed, when ``Constants.USE_CONTEXT`` is ``True``, by one
        topic value per entry of ``self.context_rich_topics``. For training
        records the topic values come from the record's own context-topics
        field; for the records to predict they come from
        ``self.context_topics_map``, keyed by review id.

        Side effects: ``self.train_records``, ``self.context_topics_map``
        and ``self.context_rich_topics`` are set to ``None`` once they are
        no longer needed, and the garbage collector is run.
        """
        timestamp_format = "%Y/%m/%d-%H:%M:%S"
        print('prepare_records_for_libfm: %s' %
              time.strftime(timestamp_format))

        self.headers = build_headers(self.context_rich_topics)

        if Constants.REVIEW_TYPE == Constants.SPECIFIC or \
                Constants.REVIEW_TYPE == Constants.GENERIC:
            self.train_records = ETLUtils.filter_records(
                self.train_records, Constants.PREDICTED_CLASS_FIELD,
                [Constants.REVIEW_TYPE])

        # The topic keys are identical for every record, so build them once
        # instead of once per topic of every record.
        topic_keys = []
        if Constants.USE_CONTEXT is True:
            topic_keys = [
                'topic' + str(topic[0]) for topic in self.context_rich_topics]

        # NOTE(review): on Python 3 the csv module documentation recommends
        # opening the file with newline=''; the open() calls are left
        # unchanged because the target Python version is not visible here.
        with open(self.csv_train_file, 'w') as out_file:
            writer = csv.writer(out_file)

            # Write header
            writer.writerow(self.headers)

            for record in self.train_records:
                row = [record[header] for header in basic_headers]

                # Look the topics up once per record, and only when there
                # is at least one topic column to fill.
                if topic_keys:
                    context_topics = record[Constants.CONTEXT_TOPICS_FIELD]
                    row.extend(context_topics[key] for key in topic_keys)

                writer.writerow(row)

        # Release the training records before the test set is processed.
        self.train_records = None
        gc.collect()

        with open(self.csv_test_file, 'w') as out_file:
            writer = csv.writer(out_file)

            # Write header
            writer.writerow(self.headers)

            for record in self.records_to_predict:
                row = [record[header] for header in basic_headers]

                if topic_keys:
                    important_record = record[Constants.REVIEW_ID_FIELD]
                    context_topics = self.context_topics_map[important_record]
                    row.extend(context_topics[key] for key in topic_keys)

                writer.writerow(row)

        # self.records_to_predict is deliberately kept; only the topic
        # structures are released here.
        self.context_topics_map = None
        self.context_rich_topics = None
        gc.collect()

        print('Exported CSV and JSON files: %s'
              % time.strftime(timestamp_format))

        csv_files = [
            self.csv_train_file,
            self.csv_test_file
        ]

        print('num_cols', len(self.headers))

        libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], [], ',', has_header=True,
            suffix='.libfm')

        print('Exported LibFM files: %s' % time.strftime(timestamp_format))