def main():
    parser = ArgumentParser(usage='python extract_features.py --game_files '
                                  'GAME_FILE1,GAME_FILE2,... [OPTIONS]',
        description='Extract features and add them to the Mongo database.',
        formatter_class=ArgumentDefaultsHelpFormatter)
    parser_add_argument = parser.add_argument
    parser_add_argument('--game_files',
        help='Comma-separated list of file-names or "all" for all of the '
             'files (the game files should reside in the "data" directory; '
             'the .jsonlines suffix is not necessary, but the file-names '
             'should be exact matches otherwise).',
        type=str,
        required=True)
    parser_add_argument('--do_not_binarize_features',
        help='Do not make all non-zero feature frequencies equal to 1.',
        action='store_true',
        default=False)
    parser_add_argument('--do_not_lowercase_text',
        help='Do not make lower-casing part of the review text '
             'normalization step, which affects word n-gram-related '
             'features.',
        action='store_true',
        default=False)
    parser_add_argument('--lowercase_cngrams',
        help='Lower-case the review text before extracting character n-gram '
             'features.',
        action='store_true',
        default=False)
    parser_add_argument('--partition',
        help='Data partition, e.g. "training" or "test". Value must be a '
             'valid partition set name in the Mongo database. Alternatively, '
             'the value "all" can be used to include all partitions.',
        type=str,
        default='all')
    parser_add_argument('--do_not_reuse_extracted_features',
        help="Don't make use of previously-extracted features present in the"
             " Mongo database and instead replace them if they are.",
        action='store_true',
        default=False)
    parser_add_argument('-dbhost', '--mongodb_host',
        help='Host that the MongoDB server is running on.',
        type=str,
        default='localhost')
    parser_add_argument('-dbport', '--mongodb_port',
        help='Port that the MongoDB server is running on.',
        type=int,
        default=27017)
    parser_add_argument('-log', '--log_file_path',
        help='Path to feature extraction log file.',
        type=str,
        default=join(project_dir,
                     'logs',
                     'replog_extract_features.txt'))
    args = parser.parse_args()

    # Imports
    import logging
    from util.mongodb import connect_to_db
    from util.datasets import get_game_files
    from src.features import extract_nlp_features_into_db

    # Make local copies of arguments
    game_files = args.game_files
    binarize = not args.do_not_binarize_features
    reuse_features = not args.do_not_reuse_extracted_features
    lowercase_text = not args.do_not_lowercase_text
    lowercase_cngrams = args.lowercase_cngrams
    partition = args.partition
    mongodb_host = args.mongodb_host
    mongodb_port = args.mongodb_port

    # Setup logger and create logging handlers
    logger = logging.getLogger('extract_features')
    logging_debug = logging.DEBUG
    logger.setLevel(logging_debug)
    loginfo = logger.info
    logdebug = logger.debug
    logerr = logger.error
    logwarn = logger.warning
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s -'
                                  ' %(message)s')
    sh = logging.StreamHandler()
    sh.setLevel(logging_debug)
    fh = logging.FileHandler(realpath(args.log_file_path))
    fh.setLevel(logging_debug)
    sh.setFormatter(formatter)
    fh.setFormatter(formatter)
    logger.addHandler(sh)
    logger.addHandler(fh)

    # Print out some logging information about the upcoming tasks
    logdebug('Project directory: {}'.format(project_dir))
    logdebug('Binarize features? {}'.format(binarize))
    logdebug('Try to reuse previously-extracted features in the database? {}'
             .format(reuse_features))
    logdebug('Lower-case text as part of the normalization step? {}'
             .format(lowercase_text))
    logdebug('Lower-case character n-grams during feature extraction? {}'
             .format(lowercase_cngrams))

    # Establish connection to MongoDB database collection
    loginfo('Connecting to MongoDB database on mongodb://{}:{}...'
            .format(mongodb_host,
                    mongodb_port))
    reviewdb = connect_to_db(host=mongodb_host,
                             port=mongodb_port)
    reviewdb.write_concern['w'] = 0

    # Get list of games
    game_files = get_game_files(game_files,
                                join(dirname(dirname(__file__)),
                                     'data'))

    # Iterate over the game files, extracting and adding/replacing
    # features to the database
    for game_file in game_files:
        game = splitext(game_file)[0]
        if partition == 'all':
            partition_string = (' from the "training" and "test" data '
                                'partitions')
        else:
            partition_string = (' from the "{}" data partition'
                                .format(partition))
        loginfo('Extracting features{} for {}...'
                .format(partition_string,
                        game))
        extract_nlp_features_into_db(reviewdb,
                                     partition,
                                     game,
                                     reuse_nlp_feats=reuse_features,
                                     use_binarized_nlp_feats=binarize,
                                     lowercase_text=lowercase_text,
                                     lowercase_cngrams=lowercase_cngrams)
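
# connect_to_db is imported from util.mongodb in each of these examples but
# not shown. A minimal sketch of what such a helper might look like, assuming
# pymongo 2.x (the examples mutate reviewdb.write_concern as a dict, which
# the immutable WriteConcern objects of pymongo 3.x+ would not allow) and the
# "reviews_project" database / "reviews" collection named in Example #2:
from pymongo import MongoClient

def connect_to_db(host='localhost', port=27017):
    """Return the "reviews" collection of the "reviews_project" database."""
    client = MongoClient(host, port)
    return client['reviews_project']['reviews']
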
Example #2
def main():
    parser = \
        ArgumentParser(usage='python make_train_test_sets.py --game_files '
                             'GAME_FILE1,GAME_FILE2,... [OPTIONS]',
                       description='Build train/test sets for each game. '
                                   'Take up to --max_size reviews (4,000 by '
                                   'default) and split them 80/20 '
                                   'training/test, respectively, by default. '
                                   'Both the maximum size and the percentage '
                                   'split can be altered via command-line '
                                   'flags. All selected reviews will be put '
                                   'into the "reviews_project" database\'s '
                                   '"reviews" collection on the configured '
                                   'MongoDB server.',
                       formatter_class=ArgumentDefaultsHelpFormatter)
    parser_add_argument = parser.add_argument
    parser_add_argument('--game_files',
        help='Comma-separated list of file-names or "all" for all of the '
             'files (the game files should reside in the "data" directory).',
        type=str,
        required=True)
    parser_add_argument('--max_size', '-m',
        help='Maximum number of reviews to get for training/testing (if '
             'possible).',
        type=int,
        default=4000)
    parser_add_argument('--percent_train', '-%',
        help='Percent of the selected reviews to use for the training '
             'set, the rest going to the test set.',
        type=float,
        default=80.0)
    parser_add_argument('--convert_to_bins', '-bins',
        help='Number of sub-divisions of the hours-played values, e.g. if 10 '
             'and the hours values range from 0 up to 1000, then hours values'
             ' 0-99 will become 1, 100-199 will become 2, etc. (will '
             'probably be necessary to train a model that actually is '
             'predictive to an acceptable degree); note that both hours '
             'values will be retained, the original under the name "hours" '
             'and the converted value under the name "hours_bin".',
        type=int,
        required=False)
    parser_add_argument('--bin_factor',
        help='If the --convert_to_bins/-bins argument is specified, increase '
             'the sizes of the bins by the given factor so that bins in which'
             ' there will be lots of instances will be smaller in terms of '
             'range than bins that are more sparsely-populated.',
        type=float,
        required=False)
    parser_add_argument('--make_reports', '-describe',
        help='Generate reports and histograms describing the data filtering '
             'procedure.',
        action='store_true',
        default=False)
    parser_add_argument('--just_describe',
        help='Generate reports and histograms describing the data filtering '
             'procedure, but then do NOT insert the reviews into the DB.',
        action='store_true',
        default=False)
    parser_add_argument('--reports_dir',
        help='If -describe/--make_reports is used, put generated reports in '
             'the given directory.',
        type=str,
        required=False)
    parser_add_argument('-dbhost', '--mongodb_host',
        help='Host that the MongoDB server is running on.',
        type=str,
        default='localhost')
    parser_add_argument('--mongodb_port', '-dbport',
        help='Port that the MongoDB server is running on.',
        type=int,
        default=27017)
    parser_add_argument('--log_file_path', '-log',
        help='Path for log file.',
        type=str,
        default=join(project_dir,
                     'logs',
                     'replog_make_train_test_sets.txt'))
    args = parser.parse_args()

    # Imports
    import logging
    from sys import exit
    from util.datasets import get_game_files
    from util.mongodb import (connect_to_db,
                              insert_train_test_reviews)

    # Make local copies of arguments
    game_files = args.game_files
    max_size = args.max_size
    percent_train = args.percent_train
    convert_to_bins = args.convert_to_bins
    bin_factor = args.bin_factor
    make_reports = args.make_reports
    just_describe = args.just_describe
    reports_dir = args.reports_dir
    mongodb_host = args.mongodb_host
    mongodb_port = args.mongodb_port

    # Initialize logging system
    logging_info = logging.INFO
    logger = logging.getLogger('make_train_test_sets')
    logger.setLevel(logging_info)

    # Create file handler
    fh = logging.FileHandler(abspath(args.log_file_path))
    fh.setLevel(logging_info)

    # Create console handler
    sh = logging.StreamHandler()
    sh.setLevel(logging_info)

    # Add nicer formatting
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s -'
                                  ' %(message)s')
    fh.setFormatter(formatter)
    sh.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(sh)

    loginfo = logger.info
    logerror = logger.error
    logwarn = logger.warning

    # Make sure value passed in via the --convert_to_bins/-bins option
    # flag makes sense and, if so, assign value to variable bins (if
    # not, set bins equal to 0)
    if (convert_to_bins
        and convert_to_bins < 2):
        logerror('The value passed in via --convert_to_bins/-bins must be '
                 'greater than one since there must be multiple bins into '
                 'which to divide the hours played values. Exiting.')
        exit(1)
    elif convert_to_bins:
        bins = convert_to_bins
    else:
        bins = 0

    # Make sure that, if the --bin_factor argument is specified, the
    # --convert_to_bins/-bins argument was also specified
    if (bin_factor
        and not convert_to_bins):
        logerror('The --bin_factor argument was specified despite the fact '
                 'that the --convert_to_bins/-bins argument was not used. '
                 'Exiting.')
        exit(1)

    # Establish connection to MongoDB database
    loginfo('Connecting to MongoDB database on mongodb://{}:{}...'
            .format(mongodb_host,
                    mongodb_port))
    reviewdb = connect_to_db(host=mongodb_host,
                             port=mongodb_port)
    reviewdb.write_concern['w'] = 0

    # Get paths to the data and reports directories
    data_dir = join(project_dir,
                    'data')
    if reports_dir:
        reports_dir = realpath(reports_dir)

    # Make sure args make sense
    if max_size < 50:
        logerror('You can\'t be serious, right? You passed in a value of {} '
                 'for the MAXIMUM size of the combination of training/test '
                 'sets? Exiting.'.format(max_size))
        exit(1)
    if percent_train < 1.0:
        logerror('You can\'t be serious, right? You passed in a value of '
                 '{0:.1f}% for the percentage of the selected reviews that '
                 'will be devoted to the training set? That is not going to '
                 'be enough training samples. Exiting.'.format(percent_train))
        exit(1)

    # Make sense of arguments
    if (make_reports
        and just_describe):
        logwarn('If the --just_describe and -describe/--make_reports option '
                'flags are used, --just_describe wins out, i.e., reports will'
                ' be generated, but no reviews will be inserted into the '
                'database.')
    elif (reports_dir
          and (make_reports
               or just_describe)):
        if not exists(reports_dir):
            logerror('The given --reports_dir path was invalid. Exiting.')
            exit(1)

    # Get list of games
    game_files = get_game_files(game_files,
                                join(dirname(dirname(__file__)),
                                     'data'))

    loginfo('Adding training/test partitions to Mongo DB for the following '
            'games: {}'.format(', '.join([splitext(game)[0]
                                          for game in game_files])))
    loginfo('Maximum size for the combined training/test sets: {}'
            .format(max_size))
    loginfo('Percentage split between training and test sets: {0:.2f}/{1:.2f}'
            .format(percent_train,
                    100.0 - percent_train))
    if make_reports:
        loginfo('Generating reports in {}.'
            .format(reports_dir if reports_dir
                                else join(data_dir,
                                          'reports')))
    if just_describe:
        loginfo('Exiting after generating reports.')
    if bins:
        loginfo('Converting hours played values to {} bins with a bin factor '
                'of {}.'.format(bins,
                                bin_factor))

    # For each game in our list of games, we will read in the reviews
    # from the data file and then put entries in our MongoDB collection
    # with a key that identifies each review as either training or test
    for game_file in game_files:
        loginfo('Getting/inserting reviews for {}...'
                .format(splitext(basename(game_file))[0]))
        insert_train_test_reviews(reviewdb,
                                  abspath(join(data_dir,
                                               game_file)),
                                  max_size,
                                  percent_train,
                                  bins=bins,
                                  bin_factor=bin_factor,
                                  describe=make_reports,
                                  just_describe=just_describe,
                                  reports_dir=reports_dir
                                                  if reports_dir
                                                  else join(data_dir,
                                                            'reports'))

    loginfo('Complete.')
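
# get_game_files is imported from util.datasets in each of these examples but
# not shown. A minimal sketch under the assumptions in the --game_files help
# text ("all" or a comma-separated list of file names residing in the "data"
# directory, with the .jsonlines suffix optional):
from os import listdir

def get_game_files(games_string, data_dir):
    """Return a list of .jsonlines game file names found in data_dir."""
    if games_string == 'all':
        return sorted(file_name for file_name in listdir(data_dir)
                      if file_name.endswith('.jsonlines'))
    game_files = []
    for file_name in games_string.split(','):
        if not file_name.endswith('.jsonlines'):
            file_name = '{}.jsonlines'.format(file_name)
        game_files.append(file_name)
    return game_files
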
Example #4
def main():
    parser = ArgumentParser(
        usage='python make_arff_files.py --game_files GAME_FILE1,GAME_FILE2 '
              '[OPTIONS]',
        description='Build .arff files for a specific game file, all game '
                    'files combined, or for each game file separately.',
        formatter_class=ArgumentDefaultsHelpFormatter)
    parser_add_argument = parser.add_argument
    parser_add_argument('--game_files',
        help='Comma-separated list of file-names or "all" for all of the '
             'files (the game files should reside in the "data" directory; '
             'the .jsonlines suffix is not necessary, but the file-names '
             'should be exact matches otherwise).',
        type=str,
        required=True)
    parser_add_argument('--output_dir', '-o',
        help='Destination directory for ARFF files.',
        type=str,
        required=True)
    parser_add_argument('--mode',
        help='Make .arff file for each game file separately ("separate") or '
             'for all game files combined ("combined").',
        choices=["separate", "combined"],
        default="combined")
    parser_add_argument('--combined_file_prefix',
        help='If the "combined" value was passed in via the --mode flag '
             '(which happens by default unless specified otherwise), an '
             'output file prefix must be passed in via this option flag.',
        type=str,
        required=False)
    parser_add_argument('--use_original_hours_values',
        help='Use the unmodified hours played values; otherwise, use the '
             'collapsed values.',
        action='store_true',
        default=False)
    parser_add_argument('--use_mongodb',
        help='Search the MongoDB collection for training/test set reviews and'
             ' make ARFF files using them only (the file suffix ".train"/'
             '".test" will be appended onto the end of the output file name '
             'to distinguish the different files); note that, by default, '
             'collapsed hours played values will be used (if this is not '
             'desired, use the --use_original_hours_values flag).',
        action='store_true',
        default=False)
    parser_add_argument('--nbins',
        help='Specify the number of bins in which to collapse hours played '
             'values; to be used if the --use_mongodb flag is not '
             'being used, in which case pre-computed hours played values will'
             ' not be read in from the database, but you still want the '
             'values to be in the form of bins (i.e., 1 for 0-100, 2 for '
             '101-200, etc., depending on the minimum and maximum values and '
             'the number of bins specified).',
        type=int,
        required=False)
    parser_add_argument('--bin_factor',
        help='Factor by which to multiply the sizes of the bins, such that '
             'the bins with lots of values will be smaller in terms of '
             'range than the more sparsely-populated bins.',
        type=float,
        default=1.0)
    parser_add_argument('-dbhost', '--mongodb_host',
        help='Host that the MongoDB server is running on.',
        type=str,
        default='localhost')
    parser_add_argument('--mongodb_port', '-dbport',
        help='Port that the MongoDB server is running on.',
        type=int,
        default=27017)
    parser_add_argument('--log_file_path', '-log',
        help='Path for log file.',
        type=str,
        default=join(project_dir,
                     'logs',
                     'replog_make_arff.txt'))
    args = parser.parse_args()

    # Imports
    import logging
    from re import sub
    from sys import exit
    from util.mongodb import connect_to_db
    from util.datasets import (get_game_files,
                               get_bin_ranges,
                               write_arff_file,
                               get_and_describe_dataset)

    # Make local copies of arguments
    game_files = args.game_files
    output_dir = args.output_dir
    mode = args.mode
    combined_file_prefix = args.combined_file_prefix
    use_mongodb = args.use_mongodb
    nbins = args.nbins
    bins = not args.use_original_hours_values
    bin_factor = args.bin_factor
    mongodb_host = args.mongodb_host
    mongodb_port = args.mongodb_port

    # Initialize logging system
    logging_info = logging.INFO
    logger = logging.getLogger('make_arff_files')
    logger.setLevel(logging_info)
    # Create file handler
    fh = logging.FileHandler(abspath(args.log_file_path))
    fh.setLevel(logging_info)
    # Create console handler
    sh = logging.StreamHandler()
    sh.setLevel(logging_info)
    # Add nicer formatting
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s -'
                                  ' %(message)s')
    fh.setFormatter(formatter)
    sh.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(sh)
    loginfo = logger.info
    logerror = logger.error
    logwarn = logger.warning

    # Check if the output directory exists
    output_dir = abspath(output_dir)
    if not (exists(output_dir)
            and isdir(output_dir)):
        logerror('The given output directory, {}, for ARFF files does not '
                 'exist or is not a directory. Exiting.'.format(output_dir))
        exit(1)

    # Make sure the --nbins option flag makes sense
    if nbins:
        if use_mongodb:
            logerror('If the --use_mongodb flag is used, a number of bins in '
                     'which to collapse the hours played values cannot be '
                     'specified (since the values in the database were '
                     'pre-computed). Exiting.')
            exit(1)
        elif not bins:
            logerror('Conflict between the --use_original_hours_values and '
                     '--nbins flags. Both cannot be used at the same time.')
            exit(1)
    elif (bins
          and not use_mongodb):
        logerror('If neither the --use_original_hours_values nor the '
                 '--use_mongodb flag is used, then the number of bins in '
                 'which to collapse the hours played values must be '
                 'specified via the --nbins option argument. Exiting.')
        exit(1)

    # Exit if the --bin_factor argument was used despite the fact that
    # the original hours values are not being binned
    if (not bins
        and bin_factor > 1.0):
        logerror('The --bin_factor argument was specified despite the fact '
                 'that the original hours values are not being binned. '
                 'Exiting.')
        exit(1)

    # Get path to the data directory
    data_dir = join(project_dir,
                    'data')
    if bins:
        arff_files_dir = join(output_dir,
                              'arff_files_collapsed_values')
    else:
        arff_files_dir = join(output_dir,
                              'arff_files_original_values')
    loginfo('data directory: {}'.format(data_dir))
    loginfo('arff files directory: {}'.format(arff_files_dir))

    # Make sure there is a combined output file prefix if "combined" is
    # the value passed in via --mode
    if (mode == 'combined'
        and not combined_file_prefix):
        logerror('A combined output file prefix must be specified in cases '
                 'where the "combined" value was passed in via the --mode '
                 'option flag (or --mode was not specified, in which case '
                 '"combined" is the default value). Exiting.')
        exit(1)

    # See if the --use_mongodb flag was used, in which case we have to
    # make a connection to the MongoDB collection. And, if it wasn't
    # used, then print out a warning if the --mongodb_port flag was used
    # (since it will be ignored) unless the value is equal to the
    # default value (since it probably wasn't specified in that case).
    if use_mongodb:
        loginfo('Connecting to MongoDB database on mongodb://{}:{}...'
                .format(mongodb_host,
                        mongodb_port))
        reviewdb = connect_to_db(host=mongodb_host,
                                 port=mongodb_port)
    elif (mongodb_port
          and not mongodb_port == 27017):
        logwarn('Ignoring argument passed in via the --mongodb_port/-dbport '
                'option flag since the --use_mongodb flag was not also used, '
                'which means that the MongoDB database is not going to be '
                'used.')

    game_files = get_game_files(game_files,
                                join(dirname(dirname(__file__)),
                                     'data'))
    if len(game_files) == 1:
        # Print out warning message if --mode was set to "combined" and
        # there was only one file in the list of game files since only a
        # single ARFF file will be created
        if mode == 'combined':
            logwarn('The --mode flag was used with the value "combined" (or '
                    'was unspecified) even though only one game file was '
                    'passed in via the --game_files flag. Only one file will '
                    'be written and it will be named after the game.')
        mode = "separate"

    # Make a list of dicts corresponding to each review and write .arff
    # files
    loginfo('Reading in data from reviews files...')
    if mode == "combined":
        review_dicts_list = []
        if not use_mongodb:
            # Min/max values of hours played (i.e., game experience),
            # initialized so that the first dataset always replaces them
            if bins:
                minh = float('inf')
                maxh = float('-inf')
            for game_file in game_files:
                loginfo('Getting review data from {}...'.format(game_file))
                dataset = get_and_describe_dataset(join(data_dir,
                                                        game_file),
                                                   report=False)
                review_dicts_list.extend(dataset['reviews'])
                # If the hours played values are to be divided into
                # bins, update the min/max values
                if bins:
                    if dataset['minh'] < minh:
                        minh = dataset['minh']
                    if dataset['maxh'] > maxh:
                        maxh = dataset['maxh']
            # If the hours played values are to be divided into bins,
            # get the range that each bin maps to
            if bins:
                bin_ranges = get_bin_ranges(minh,
                                            maxh,
                                            nbins,
                                            bin_factor)
            else:
                bin_ranges = False
        file_names = [splitext(game)[0] for game in game_files]
        arff_file = join(arff_files_dir,
                         '{}.arff'.format(combined_file_prefix))
        if use_mongodb:
            loginfo('Generating ARFF files for the combined training sets and'
                    ' the combined test sets, respectively, of the following '
                    'games:\n\n{}'.format(', '.join([sub(r'_',
                                                         r' ',
                                                         fname) for fname in
                                                     file_names])))
            write_arff_file(arff_file,
                            file_names,
                            reviewdb=reviewdb,
                            make_train_test=True,
                            bins=True)
        else:
            loginfo('Generating {}...'.format(arff_file))
            write_arff_file(arff_file,
                            file_names,
                            reviews=review_dicts_list,
                            bins=bin_ranges)
    else:
        for game_file in game_files:
            loginfo('Getting review data from {}...'.format(game_file))
            if not use_mongodb:
                review_dicts_list = []
                dataset = get_and_describe_dataset(join(data_dir,
                                                        game_file),
                                                   report=False)
                review_dicts_list.extend(dataset['reviews'])
                if bins:
                    # Get min/max hours played values from results of
                    # get_and_describe_dataset() call
                    minh = dataset['minh']
                    maxh = dataset['maxh']
                    # Get the range that each bin maps to
                    bin_ranges = get_bin_ranges(minh,
                                                maxh,
                                                nbins,
                                                bin_factor)
                else:
                    bin_ranges = False
            game = splitext(game_file)[0]
            arff_file = join(arff_files_dir,
                             '{}.arff'.format(game))
            if use_mongodb:
                loginfo('Generating ARFF file for the training and test sets '
                        'for {}...'.format(game))
                write_arff_file(arff_file,
                                [game],
                                reviewdb=reviewdb,
                                make_train_test=True,
                                bins=bins)
            else:
                loginfo('Generating {}...'.format(arff_file))
                write_arff_file(arff_file,
                                [game],
                                reviews=review_dicts_list,
                                bins=bin_ranges)
    loginfo('Complete.')
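
# get_bin_ranges and get_and_describe_dataset are imported from util.datasets
# above but not shown. A hypothetical sketch of get_bin_ranges, consistent
# with the --convert_to_bins/--bin_factor help texts: the hours-played range
# is cut into nbins bins, and with bin_factor > 1.0 each successive bin
# covers bin_factor times as much range as the one before it, so the crowded
# low end of the scale gets the narrower bins:
def get_bin_ranges(minh, maxh, nbins, bin_factor=1.0):
    """Return a list of (start, end) ranges, one per bin."""
    # Weight the bin sizes as a geometric progression: 1, f, f**2, ...
    weights = [bin_factor ** i for i in range(nbins)]
    unit = (maxh - minh) / sum(weights)
    bin_ranges = []
    start = minh
    for weight in weights:
        end = start + unit * weight
        bin_ranges.append((start, end))
        start = end
    return bin_ranges

# For example, get_bin_ranges(0.0, 1000.0, 10) yields (0.0, 100.0),
# (100.0, 200.0), ..., matching the "0-99 becomes 1, 100-199 becomes 2"
# scheme described in the --convert_to_bins help text.

# Similarly, a hypothetical sketch of get_and_describe_dataset (ignoring the
# report machinery), assuming each .jsonlines file holds one JSON object per
# line with the hours-played value stored under "hours", as described in
# Example #2:
import json

def get_and_describe_dataset(file_path, report=True):
    """Read in reviews and return them with the min/max hours values."""
    with open(file_path) as review_file:
        reviews = [json.loads(line) for line in review_file]
    hours = [review['hours'] for review in reviews]
    return dict(reviews=reviews, minh=min(hours), maxh=max(hours))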