def setUp(self):
    """Connect to the test MongoDB instance before each test.

    :raises ConnectionFailure: if the MongoDB client cannot be reached
        (e.g., no SSH tunnel to the database host is active)
    """
    try:
        self.db = connect_to_db('localhost', 37017)
    except AutoReconnect as e:
        # Chain the original AutoReconnect explicitly so the underlying
        # network error stays visible in the traceback.
        raise ConnectionFailure('Could not connect to MongoDB client. Make'
                                ' sure a tunnel is set up (or some other '
                                'method is used) before running the '
                                'tests.') from e
def setUp(self):
    """Connect to the test database and create a fresh output directory.

    Removes any output left over from a previous run so every test
    starts from a clean slate.

    :raises ConnectionFailure: if the MongoDB client cannot be reached
    """
    try:
        self.db = connect_to_db('localhost', 37017)
    except AutoReconnect as e:
        # Bug fix: the original adjacent string literals ('... Make'
        # 'sure ...') rendered as "Makesure"; a space was missing.
        # Also chain the original exception explicitly.
        raise ConnectionFailure('Could not connect to MongoDB client. Make '
                                'sure a tunnel is set up (or some other '
                                'method is used) before running the '
                                'tests.') from e
    self.prediction_label = 'total_game_hours'
    # Recreate the test output directory from scratch for each test.
    self.output_path = join(this_dir, 'test_output')
    if exists(self.output_path):
        rmtree(self.output_path)
    makedirs(self.output_path)
def main(argv=None):
    """Create an ascending index on 'steam_id_number' in the reviews
    collection so cursors can be sorted on that key.

    :param argv: unused; kept for compatibility with script runners
    :raises ConnectionFailure: if the MongoDB server cannot be reached
    """
    parser = ArgumentParser(description='Run incremental learning '
                                        'experiments.',
                            formatter_class=ArgumentDefaultsHelpFormatter,
                            conflict_handler='resolve')
    _add_arg = parser.add_argument
    _add_arg('-dbhost', '--mongodb_host',
             help='Host that the MongoDB server is running on.',
             type=str,
             default='localhost')
    _add_arg('--mongodb_port', '-dbport',
             help='Port that the MongoDB server is running on.',
             type=int,
             default=37017)
    args = parser.parse_args()

    # Imports
    import sys
    from pymongo import ASCENDING
    from pymongo.errors import ConnectionFailure
    from src.mongodb import connect_to_db

    # Connect to MongoDB database
    logger.info('Connecting to MongoDB database at {0}:{1}...'
                .format(args.mongodb_host, args.mongodb_port))
    try:
        db = connect_to_db(args.mongodb_host, args.mongodb_port)
    except ConnectionFailure:
        logger.error('Failed to connect to the MongoDB database collection.')
        # Bare `raise` preserves the original traceback.
        raise

    # Create index on 'steam_id_number' so that cursors can be sorted
    # on that particular key. Bug fix: `create_index` takes a list of
    # (key, direction) pairs; the original passed ASCENDING as a second
    # positional argument, which pymongo does not treat as a direction.
    logger.info('Creating index on the "steam_id_number" key.')
    db.create_index([('steam_id_number', ASCENDING)])
    logger.info('Created new index named "steam_id_number_1" in the "reviews" '
                'collection.')
def main():
    """Extract NLP features for each game's reviews and add/replace them
    in the MongoDB reviews collection in batches.

    :raises ValueError: if --update_batch_size is less than 1, or if no
        database updates were made for a game
    :raises BulkWriteError: if a bulk database update fails
    :raises ConnectionFailure: if the MongoDB server cannot be reached
    """
    parser = ArgumentParser(
        usage='python extract_features.py --game_files '
              'GAME_FILE1,GAME_FILE2,...[ OPTIONS]',
        description='Extract features and add them to the Mongo database.',
        formatter_class=ArgumentDefaultsHelpFormatter)
    _add_arg = parser.add_argument
    _add_arg('--game_files',
             help='Comma-separated list of file-names or "all" for all of the'
                  ' files (the game files should reside in the "data" '
                  'directory; the .jsonlines suffix is not necessary, but the'
                  ' file-names should be exact matches otherwise).',
             type=str,
             required=True)
    _add_arg('--do_not_binarize_features',
             help='Do not make all non-zero feature frequencies equal to 1.',
             action='store_true',
             default=False)
    _add_arg('--do_not_lowercase_text',
             help='Do not make lower-casing part of the review text '
                  'normalization step, which affects word n-gram-related '
                  'features.',
             action='store_true',
             default=False)
    _add_arg('--lowercase_cngrams',
             help='Lower-case the review text before extracting character '
                  'n-gram features.',
             action='store_true',
             default=False)
    _add_arg('--partition',
             help='Data partition, i.e., "training", "test", etc. Value must '
                  'be a valid partition set name in the Mongo database. '
                  'Alternatively, the value "all" can be used to include all '
                  'partitions.',
             type=str,
             default='all')
    _add_arg('--do_not_reuse_extracted_features',
             help="Don't make use of previously-extracted features present in"
                  " the Mongo database and instead replace them if they are.",
             action='store_true',
             default=False)
    _add_arg('-dbhost', '--mongodb_host',
             help='Host that the MongoDB server is running on.',
             type=str,
             default='localhost')
    _add_arg('-dbport', '--mongodb_port',
             help='Port that the MongoDB server is running on.',
             type=int,
             default=27017)
    _add_arg('--update_batch_size', '-batch_size',
             help='Size of each batch for the bulk updates.',
             type=int,
             default=100)
    _add_arg('-log', '--log_file_path',
             help='Path to feature extraction log file.',
             type=str,
             default=join(log_dir, 'replog_extract_features.txt'))
    args = parser.parse_args()

    # Imports
    from pymongo.errors import (BulkWriteError, ConnectionFailure)
    from src import (get_game_files, log_format_string)
    from src.mongodb import (connect_to_db,
                             bulk_extract_features_and_update_db)

    # Make local copies of arguments
    game_files = args.game_files
    binarize = not args.do_not_binarize_features
    reuse_features = not args.do_not_reuse_extracted_features
    lowercase_text = not args.do_not_lowercase_text
    lowercase_cngrams = args.lowercase_cngrams
    partition = args.partition
    mongodb_host = args.mongodb_host
    mongodb_port = args.mongodb_port
    update_batch_size = args.update_batch_size
    if update_batch_size < 1:
        raise ValueError('--update_batch_size/-batch_size should be greater '
                         'than 0.')

    # Make sure log file directory exists
    log_file_path = realpath(args.log_file_path)
    log_file_dir = dirname(log_file_path)
    if not exists(log_file_dir):
        makedirs(log_file_dir, exist_ok=True)

    # Setup file handler
    fh = logging.FileHandler(log_file_path)
    fh.setLevel(logging_debug)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # Print out some logging information about the upcoming tasks
    logdebug('Project directory: {0}'.format(project_dir))
    logdebug('Binarize features? {0}'.format(binarize))
    logdebug('Try to reuse previously-extracted features in the database? {0}'
             .format(reuse_features))
    logdebug('Lower-case text as part of the normalization step? {0}'
             .format(lowercase_text))
    logdebug('Lower-case character n-grams during feature extraction? {0}'
             .format(lowercase_cngrams))
    logdebug('Batch size for database updates: {0}'.format(update_batch_size))

    # Establish connection to MongoDB database collection
    loginfo('Connecting to MongoDB database on mongodb://{0}:{1}...'
            .format(mongodb_host, mongodb_port))
    try:
        db = connect_to_db(host=mongodb_host, port=mongodb_port)
    except ConnectionFailure as e:
        logerr('Unable to connect to MongoDB reviews collection.')
        logerr(e)
        # Bare `raise` keeps the original traceback intact (the original
        # code re-raised by name, which resets the raise location).
        raise
    # Fire-and-forget writes: no write acknowledgement requested.
    db.write_concern['w'] = 0

    # Get list of games
    game_files = get_game_files(game_files)

    # Iterate over the game files, extracting and adding/replacing
    # features to the database
    for game_file in game_files:
        game = splitext(game_file)[0]
        if partition == 'all':
            partition_string = (' from the "training" and "test" data '
                                'partitions')
        else:
            partition_string = ' from the "{0}" data partition'.format(
                partition)
        loginfo('Extracting features{0} for {1}...'
                .format(partition_string, game))
        try:
            updates = bulk_extract_features_and_update_db(
                db,
                game,
                partition,
                reuse_nlp_feats=reuse_features,
                use_binarized_nlp_feats=binarize,
                lowercase_text=lowercase_text,
                lowercase_cngrams=lowercase_cngrams,
                update_batch_size=update_batch_size)
        except BulkWriteError:
            logerr('Encountered a BulkWriteError while executing the call to '
                   '`bulk_extract_features_and_update_db`.')
            raise
        # `updates` is the per-game update count; zero updates for a
        # game is treated as a hard failure.
        if updates:
            loginfo('{0} updates were made to the reviews collection.'
                    .format(updates))
        else:
            raise ValueError('No updates were made.')
def main():
    """Build .arff files for one game file, all game files combined, or
    each game file separately, reading reviews either from data files or
    from the MongoDB collection.

    :raises ValueError: if the argument combination is inconsistent or
        the output directory is missing/not a directory
    """
    parser = ArgumentParser(usage='python make_arff_files.py --game_files '
                                  'GAME_FILE1,GAME_FILE2[ OPTIONS]',
                            description='Build .arff files for a specific '
                                        'game file, all game files combined, '
                                        'or for each game file separately.',
                            formatter_class=ArgumentDefaultsHelpFormatter)
    _add_arg = parser.add_argument
    _add_arg('--game_files',
             help='Comma-separated list of file-names or "all" for all of the'
                  ' files (the game files should reside in the "data" '
                  'directory; the .jsonlines suffix is not necessary, but the'
                  ' file-names should be exact matches otherwise).',
             type=str,
             required=True)
    _add_arg('--output_dir', '-o',
             help='Destination directory for ARFF files.',
             type=str,
             required=True)
    _add_arg('--mode',
             help='Make .arff file for each game file separately ("separate")'
                  ' or for all game files combined ("combined").',
             choices=["separate", "combined"],
             default="combined")
    _add_arg('--combined_file_prefix',
             help='If the "combined" value was passed in via the --mode flag '
                  '(which happens by default unless specified otherwise), an '
                  'output file prefix must be passed in via this option '
                  'flag.',
             type=str,
             required=False)
    _add_arg('--use_original_hours_values',
             help='Use the unmodified hours played values; otherwise, use the'
                  ' collapsed values.',
             action='store_true',
             default=False)
    _add_arg('--use_mongodb',
             help='Search the MongoDB collection for training/test set '
                  'reviews and make ARFF files using them only (the file '
                  'suffix ".train"/".test" will be appended onto the end of '
                  'the output file name to distinguish the different files); '
                  'note that, by default, collapsed hours played values will '
                  'be used (if this is not desired, use the '
                  '--use_original_hours_values flag).',
             action='store_true',
             default=False)
    _add_arg('--nbins',
             help='Specify the number of bins in which to collapse hours '
                  'played values; to be used if the --make_train_test_sets '
                  'flag is not being used, in which case pre-computed hours '
                  'played values will not be read in from the database, but '
                  'you still want the values to be in the form of bins (i.e.,'
                  ' 1 for 0-100, 2 for 101-200, etc., depending on the '
                  'minimum and maximum values and the number of bins '
                  'specified).',
             type=int,
             required=False)
    _add_arg('--bin_factor',
             help='Factor by which to multiply the sizes of the bins, such '
                  'that the bins with lots of values will be smaller and the '
                  'more sparsely-populated bins will be smaller in terms of '
                  'range.',
             type=float,
             default=1.0)
    _add_arg('-dbhost', '--mongodb_host',
             help='Host that the MongoDB server is running on.',
             type=str,
             default='localhost')
    _add_arg('--mongodb_port', '-dbport',
             help='Port that the MongoDB server is running on.',
             type=int,
             default=27017)
    _add_arg('--log_file_path', '-log',
             help='Path for log file.',
             type=str,
             default=join(log_dir, 'replog_make_arff.txt'))
    args = parser.parse_args()

    # Imports
    import os
    from re import sub
    import numpy as np
    from src.mongodb import (connect_to_db, get_game_files)
    from src.datasets import (get_bin_ranges, write_arff_file,
                              get_and_describe_dataset)

    # Make local copies of arguments
    game_files = args.game_files
    output_dir = args.output_dir
    mode = args.mode
    combined_file_prefix = args.combined_file_prefix
    use_mongodb = args.use_mongodb
    nbins = args.nbins
    bins = not args.use_original_hours_values
    bin_factor = args.bin_factor
    mongodb_host = args.mongodb_host
    mongodb_port = args.mongodb_port

    # Make sure log file directory exists
    log_file_path = realpath(args.log_file_path)
    log_file_dir = dirname(log_file_path)
    if not exists(log_file_dir):
        makedirs(log_file_dir, exist_ok=True)

    # Make file handler
    fh = logging.FileHandler(log_file_path)
    fh.setLevel(logging_info)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # Check if the output directory exists. Bug fix: the original
    # condition (`not exists(output_dir) and isdir(output_dir)`) could
    # never be true when the path did not exist; per the error message,
    # the intent is to reject paths that are missing OR not directories.
    output_dir = realpath(output_dir)
    if not (exists(output_dir) and isdir(output_dir)):
        msg = ('The given output directory, {0}, for ARFF files does not '
               'exist or is not a directory.'.format(output_dir))
        logerr(msg)
        raise ValueError(msg)

    # Make sure --bins option flag makes sense
    if nbins:
        if use_mongodb:
            msg = ('If the --use_mongodb flag is used, a number of bins in '
                   'which to collapse the hours played values cannot be '
                   'specified (since the values in the database were '
                   'pre-computed).')
            logerr(msg)
            raise ValueError(msg)
        elif not bins:
            msg = ('Conflict between the --use_original_hours_values and '
                   '--nbins flags. Both cannot be used at the same time.')
            logerr(msg)
            raise ValueError(msg)
    elif bins and not use_mongodb:
        msg = ('If both the --use_original_hours_values and --use_mongodb '
               'flags are not used, then the number of bins in which to '
               'collapse the hours played values must be specified via the '
               '--nbins option argument.')
        # Log as an error (the original used loginfo here) since a
        # ValueError is raised immediately afterwards.
        logerr(msg)
        raise ValueError(msg)

    # Exit if the --bin_factor argument was used despite the fact that
    # the original hours values are not being binned. Bug fix: the
    # original message said "are being binned", contradicting the
    # triggering condition (`not bins`).
    if not bins and bin_factor > 1.0:
        msg = ('The --bin_factor argument was specified despite the fact '
               'that the original hours values are not being binned.')
        logerr(msg)
        raise ValueError(msg)

    # Get path to the ARFF files sub-directory (depends on whether the
    # hours values are collapsed into bins or kept as-is)
    if bins:
        arff_files_dir = join(output_dir, 'arff_files_collapsed_values')
    else:
        arff_files_dir = join(output_dir, 'arff_files_original_values')
    # Make sure the sub-directory exists before files are written to it
    # (the original never created it).
    makedirs(arff_files_dir, exist_ok=True)
    loginfo('data directory: {0}'.format(data_dir))
    # Bug fix: the original logged `output_dir` under this label.
    loginfo('arff files directory: {0}'.format(arff_files_dir))

    # Make sure there is a combined output file prefix if "combine" is
    # the value passed in via --mode
    if mode == 'combined' and not combined_file_prefix:
        msg = ('A combined output file prefix must be specified in cases '
               'where the "combined" value was passed in via the --mode '
               'option flag (or --mode was not specified, in which case '
               '"combined" is the default value).')
        logerr(msg)
        raise ValueError(msg)

    # See if the --use_mongodb flag was used, in which case we have to
    # make a connection to the MongoDB collection. And, if it wasn't
    # used, then print out warning if the --mongodb_port flag was used
    # (since it will be ignored) unless the value is equal to the
    # default value (since it probably wasn't specified in that case).
    if use_mongodb:
        loginfo('Connecting to MongoDB database on mongodb://{0}:{1}...'
                .format(mongodb_host, mongodb_port))
        reviewdb = connect_to_db(host=mongodb_host, port=mongodb_port)
    elif mongodb_port and mongodb_port != 27017:
        logwarn('Ignoring argument passed in via the --mongodb_port/-dbport '
                'option flag since the --use_mongodb flag was not also used, '
                'which means that the MongoDB database is not going to be '
                'used.')

    game_files = get_game_files(game_files)
    if len(game_files) == 1:
        # Print out warning message if --mode was set to "combined" and
        # there was only one file in the list of game files since only a
        # single ARFF file will be created
        if mode == 'combined':
            logwarn('The --mode flag was used with the value "combined" (or '
                    'was unspecified) even though only one game file was '
                    'passed in via the --game_files flag. Only one file will '
                    'be written and it will be named after the game.')
            mode = "separate"

    # Make a list of dicts corresponding to each review and write .arff
    # files
    loginfo('Reading in data from reviews files...')
    if mode == "combined":
        review_dicts_list = []
        if not use_mongodb:
            for game_file in game_files:
                loginfo('Getting review data from {0}...'.format(game_file))
                review_dicts_list.extend(
                    get_and_describe_dataset(join(data_dir, game_file),
                                             report=False))
            # If the hours played values are to be divided into bins,
            # get the range that each bin maps to
            if bins:
                hours = [r['total_game_hours'] for r in review_dicts_list]
                bin_ranges = get_bin_ranges(min(hours), max(hours), nbins,
                                            bin_factor)
            else:
                bin_ranges = False
        file_names = [splitext(game)[0] for game in game_files]
        arff_file = join(arff_files_dir,
                         '{0}.arff'.format(combined_file_prefix))
        if use_mongodb:
            loginfo('Generating ARFF files for the combined training sets and'
                    ' the combined test sets, respectively, of the following '
                    'games:\n\n{0}'
                    .format(', '.join([sub(r'_', r' ', fname)
                                       for fname in file_names])))
            # NOTE(review): `bins=True` is hard-coded here, whereas the
            # per-game branch below passes the `bins` flag through --
            # confirm this asymmetry is intentional.
            write_arff_file(arff_file, file_names, reviewdb=reviewdb,
                            make_train_test=True, bins=True)
        else:
            loginfo('Generating {0}...'.format(arff_file))
            write_arff_file(arff_file, file_names,
                            reviews=review_dicts_list, bins=bin_ranges)
    else:
        for game_file in game_files:
            loginfo('Getting review data from {0}...'.format(game_file))
            if not use_mongodb:
                review_dicts_list = get_and_describe_dataset(
                    join(data_dir, game_file), report=False)
                if bins:
                    hours = [r['total_game_hours']
                             for r in review_dicts_list]
                    bin_ranges = get_bin_ranges(min(hours), max(hours),
                                                nbins, bin_factor)
                else:
                    bin_ranges = False
            game = splitext(game_file)[0]
            arff_file = join(arff_files_dir, '{0}.arff'.format(game))
            if use_mongodb:
                loginfo('Generating ARFF file for the training and test sets '
                        'for {0}...'.format(game))
                write_arff_file(arff_file, [game], reviewdb=reviewdb,
                                make_train_test=True, bins=bins)
            else:
                loginfo('Generating {0}...'.format(arff_file))
                write_arff_file(arff_file, [game],
                                reviews=review_dicts_list, bins=bin_ranges)
    loginfo('Complete.')
def main():
    """Build training/test partitions for each game file and insert the
    selected reviews into the MongoDB "reviews" collection.

    :raises ValueError: if an argument combination is inconsistent
    :raises ConnectionFailure: if the MongoDB server cannot be reached
    """
    parser = \
        ArgumentParser(usage='python make_train_test_sets.py --game_files '
                             'GAME_FILE1,GAME_FILE2,...[ OPTIONS]',
                       description='Build train/test sets for each game. Take'
                                   ' up to 21k reviews and split it 80/20 '
                                   'training/test, respectively, by default. '
                                   'Both the maximum size and the percentage '
                                   'split can be altered via command-line '
                                   'flags. All selected reviews will be put '
                                   'into the "reviews_project" database\'s '
                                   '"reviews" collection (which is being '
                                   'hosted on lemur.montclair.edu on port '
                                   '27017).',
                       formatter_class=ArgumentDefaultsHelpFormatter)
    _add_arg = parser.add_argument
    _add_arg('--game_files',
             help='Comma-separated list of file-names or "all" for all of the'
                  ' files (the game files should reside in the "data" '
                  'directory).',
             type=str,
             required=True)
    _add_arg('--max_size', '-m',
             help='Maximum number of reviews to get for training/testing (if '
                  'possible).',
             type=int,
             default=4000)
    _add_arg('--percent_train', '-%',
             help='Percent of selected reviews for which to use for the '
                  'training set, the rest going to the test set.',
             type=float,
             default=80.0)
    _add_arg('--convert_to_bins', '-bins',
             help='Number of sub-divisions of the hours-played values, e.g. '
                  'if 10 and the hours values range from 0 up to 1000, then '
                  'hours values 0-99 will become 1, 100-199 will become 2, '
                  'etc. (will probably be necessary to train a model that '
                  'actually is predictive to an acceptable degree); note that'
                  ' both hours values will be retained, the original under '
                  'the name "hours" and the converted value under the name '
                  '"hours_bin".',
             type=int,
             required=False)
    _add_arg('--bin_factor',
             help='If the --convert_to_bins/-bins argument is specified, '
                  'increase the sizes of the bins by the given factor so that'
                  ' bins in which there will be lots of instances will be '
                  'smaller in terms of range than bins that are more '
                  'sparsely-populated.',
             type=float,
             required=False)
    _add_arg('--make_reports', '-describe',
             help='Generate reports and histograms describing the data '
                  'filtering procedure.',
             action='store_true',
             default=False)
    _add_arg('--just_describe',
             help='Generate reports and histograms describing the data '
                  'filtering procedure, but then do NOT insert the reviews '
                  'into the MongoDB database.',
             action='store_true',
             default=False)
    _add_arg('--reports_dir',
             help='If -describe/--make_reports is used, put generated reports'
                  ' in the given directory.',
             type=str,
             required=False)
    _add_arg('-dbhost', '--mongodb_host',
             help='Host that the MongoDB server is running on.',
             type=str,
             default='localhost')
    _add_arg('--mongodb_port', '-dbport',
             help='Port that the MongoDB server is running on.',
             type=int,
             default=27017)
    _add_arg('--log_file_path', '-log',
             help='Path for log file.',
             type=str,
             default=join(log_dir, 'replog_make_train_test_sets.txt'))
    args = parser.parse_args()

    # Imports
    from os import listdir
    from pymongo import MongoClient
    from pymongo.errors import ConnectionFailure
    from src import get_game_files
    from src.mongodb import (connect_to_db, insert_train_test_reviews)

    # Make local copies of arguments
    game_files = args.game_files
    max_size = args.max_size
    percent_train = args.percent_train
    convert_to_bins = args.convert_to_bins
    bin_factor = args.bin_factor
    make_reports = args.make_reports
    just_describe = args.just_describe
    reports_dir = args.reports_dir
    mongodb_host = args.mongodb_host
    mongodb_port = args.mongodb_port

    # Make sure log file directory exists
    log_file_path = realpath(args.log_file_path)
    log_file_dir = dirname(log_file_path)
    if not exists(log_file_dir):
        makedirs(log_file_dir, exist_ok=True)

    # Make file handler
    fh = logging.FileHandler(log_file_path)
    fh.setLevel(logging_info)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # Make sure value passed in via the --convert_to_bins/-bins option
    # flag makes sense and, if so, assign value to variable bins (if
    # not, set bins equal to 0)
    if convert_to_bins and convert_to_bins < 2:
        msg = ('The value passed in via --convert_to_bins/-bins must be '
               'greater than one since there must be multiple bins to '
               'divide the hours played values.')
        logerr(msg)
        raise ValueError(msg)
    elif convert_to_bins:
        bins = convert_to_bins
    else:
        bins = 0

    # Make sure that, if the --bin_factor argument is specified, the
    # --convert_to_bins/-bins argument was also specified
    if bin_factor and not convert_to_bins:
        msg = ('The --bin_factor argument was specified despite the fact '
               'that the --convert_to_bins/-bins argument was not used.')
        logerr(msg)
        raise ValueError(msg)

    # Establish connection to MongoDB database
    loginfo('Connecting to MongoDB database on mongodb://{0}:{1}...'
            .format(mongodb_host, mongodb_port))
    try:
        reviewdb = connect_to_db(host=mongodb_host, port=mongodb_port)
    except ConnectionFailure as e:
        logerr('Unable to connect to MongoDB reviews collection.')
        logerr(e)
        # Bare `raise` preserves the original traceback (the original
        # re-raised by name).
        raise
    # Fire-and-forget writes: no write acknowledgement requested.
    reviewdb.write_concern['w'] = 0

    # Get path to the reports directory (creating it if necessary)
    if reports_dir:
        reports_dir = realpath(reports_dir)
        if isfile(reports_dir):
            msg = ('The file path passed in via the --reports_dir leads to a '
                   'file, not a directory.')
            logerr(msg)
            raise ValueError(msg)
        if not exists(reports_dir):
            makedirs(reports_dir, exist_ok=True)

    # Make sure args make sense. Bug fix: the original messages
    # hard-coded "a value of 50" and "1.0%" regardless of what the user
    # actually passed in; report the real values instead.
    if max_size < 50:
        msg = ('You can\'t be serious, right? You passed in a value of {0} '
               'for the MAXIMUM size of the combination of training/test '
               'sets?'.format(max_size))
        logerr(msg)
        raise ValueError(msg)
    if percent_train < 1.0:
        msg = ('You can\'t be serious, right? You passed in a value of {0}%'
               ' for the percentage of the selected reviews that will be '
               'devoted to the training set? That is not going to be enough'
               ' training samples.'.format(percent_train))
        logerr(msg)
        raise ValueError(msg)

    # Make sense of arguments
    if make_reports and just_describe:
        logwarn('If the --just_describe and -describe/--make_reports option '
                'flags are used, --just_describe wins out, i.e., reports will'
                ' be generated, but no reviews will be inserted into the '
                'database.')
    elif reports_dir and (make_reports or just_describe):
        # NOTE(review): this branch appears unreachable -- reports_dir
        # is created above if missing, so `not exists(reports_dir)`
        # should always be false here. Confirm before removing.
        if not exists(reports_dir):
            logerr('The given --reports_dir path was invalid. Exiting.')
            exit(1)

    # Get list of games
    game_files = get_game_files(game_files,
                                join(dirname(dirname(__file__)), 'data'))
    loginfo('Adding training/test partitions to Mongo DB for the following '
            'games: {0}'
            .format(', '.join([splitext(game)[0] for game in game_files])))
    loginfo('Maximum size for the combined training/test sets: {0}'
            .format(max_size))
    loginfo('Percentage split between training and test sets: {0:.2f}/{1:.2f}'
            .format(percent_train, 100.0 - percent_train))
    if make_reports:
        loginfo('Generating reports in {0}.'
                .format(reports_dir if reports_dir else default_reports_dir))
    if just_describe:
        loginfo('Exiting after generating reports.')
    if bins:
        loginfo('Converting hours played values to {0} bins with a bin factor'
                ' of {1}.'.format(bins, bin_factor))

    # For each game in our list of games, we will read in the reviews
    # from the data file and then put entries in our MongoDB collection
    # with a key that identifies each review as either training or test
    for game_file in game_files:
        loginfo('Getting/inserting reviews for {}...'
                .format(splitext(basename(game_file))[0]))
        insert_train_test_reviews(
            reviewdb,
            realpath(join(data_dir, game_file)),
            max_size,
            percent_train,
            bins=bins,
            bin_factor=bin_factor,
            describe=make_reports,
            just_describe=just_describe,
            reports_dir=reports_dir if reports_dir else default_reports_dir)

    loginfo('Complete.')