import argparse
import logging

# Assumed imports: load_examples and __version__ come from skll.data and
# skll.version, respectively; Predictor is defined elsewhere in this module.
from skll.data import load_examples
from skll.version import __version__


def main(argv=None):
    '''
    Parse the command-line arguments, load the trained model, and print a
    prediction for every example in each input feature file.

    :param argv: List of arguments, as if specified on the command line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    '''
    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Loads a trained model and outputs predictions based \
                     on input feature files.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        conflict_handler='resolve')
    parser.add_argument('model_file',
                        help='Model file to load and use for generating \
                              predictions.')
    parser.add_argument('input_file',
                        help='A CSV, JSON, or MegaM file (with or without \
                              the label column), with the appropriate \
                              suffix.',
                        nargs='+')
    parser.add_argument('-l', '--label_col',
                        help='Name of the column which contains the class \
                              labels in ARFF, CSV, or TSV files. For ARFF \
                              files, this must be the final column to count as\
                              the label.',
                        default='y')
    parser.add_argument('-p', '--positive_class',
                        help="If the model is only being used to predict the \
                              probability of a particular class, this \
                              specifies the index of the class we're \
                              predicting. 1 = second class, the default \
                              for binary classification. Keep in mind that \
                              classes are sorted lexicographically.",
                        default=1, type=int)
    parser.add_argument('-q', '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('-t', '--threshold',
                        help="If the model outputs probabilities of the \
                              positive class, return 1 if the probability \
                              meets/exceeds the given threshold and 0 \
                              otherwise.",
                        type=float)
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))

    # Create the classifier and load the model
    predictor = Predictor(args.model_file,
                          positive_class=args.positive_class,
                          threshold=args.threshold)

    for input_file in args.input_file:
        data = load_examples(input_file, quiet=args.quiet,
                             label_col=args.label_col)
        for pred in predictor.predict(data):
            print(pred)
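
if __name__ == '__main__':
    # Entry-point guard (assumed to match the original script): running this
    # module directly behaves like SKLL's generate_predictions console
    # script, e.g. ``generate_predictions model.model examples.csv``
    # (hypothetical file names), printing one prediction per line.
    main()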


import logging
import os
import sys

# Assumed imports: load_examples and write_feature_file come from skll.data,
# as in SKLL's Titanic example script.
from skll.data import load_examples, write_feature_file


def main():
    '''
    Create the titanic directory tree and split the Kaggle Titanic CSV files
    into training, dev, train+dev, and test feature files.
    '''
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'), level=logging.INFO)
    logger = logging.getLogger(__name__)
    if not (os.path.exists('train.csv') and os.path.exists('test.csv')):
        logger.error('This script requires the train.csv and test.csv files ' +
                     'from http://www.kaggle.com/c/titanic-gettingStarted/' +
                     'data to be in the current directory in order to work. ' +
                     'Please download them and try again.')
        sys.exit(1)

    # Create dictionary of subsets to use for creating split feature files
    subset_dict = {'vitals': ['Sex', 'Age'],
                   'socioeconomic': ['Pclass', 'Fare'],
                   'family': ['SibSp', 'Parch'],
                   'misc': ['Embarked']}
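
    # write_feature_file() (below) should emit one file per subset containing
    # only the listed columns; a path like 'titanic/train/.csv' therefore
    # serves as a directory-plus-extension template, yielding files such as
    # titanic/train/vitals.csv.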

    # Create directories to store files
    for sub_dir in ('train', 'dev', 'train+dev', 'test'):
        dir_path = os.path.join('titanic', sub_dir)
        if not os.path.exists(dir_path):
            logger.info('Creating {0} directory'.format(dir_path))
            os.makedirs(dir_path)

    # Read and write training examples
    train_examples = load_examples('train.csv', label_col='Survived',
                                   quiet=False, sparse=False)
    num_train_dev = len(train_examples.classes)
    num_train = int((num_train_dev / 5) * 4)
    train_ids = list(range(1, num_train_dev + 1))
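
    # The first 80% of train.csv becomes the training split and the remaining
    # 20% the dev split; the 1-based IDs line up with Kaggle's PassengerId
    # numbering.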
    write_feature_file('titanic/train/.csv',
                       train_ids[:num_train],
                       train_examples.classes[:num_train],
                       train_examples.features[:num_train, :],
                       feat_vectorizer=train_examples.feat_vectorizer,
                       subsets=subset_dict, label_col='Survived',
                       id_prefix='train_example')

    # Write the train+dev set used to train the model that generates
    # predictions on the test set
    write_feature_file('titanic/train+dev/.csv',
                       train_ids,
                       train_examples.classes,
                       train_examples.features,
                       feat_vectorizer=train_examples.feat_vectorizer,
                       subsets=subset_dict, label_col='Survived',
                       id_prefix='train_example')

    # Write dev examples
    write_feature_file('titanic/dev/.csv',
                       train_ids[num_train:],
                       train_examples.classes[num_train:],
                       train_examples.features[num_train:, :],
                       feat_vectorizer=train_examples.feat_vectorizer,
                       subsets=subset_dict, label_col='Survived',
                       id_prefix='dev_example')

    # Read and write test examples
    test_examples = load_examples('test.csv', label_col='Survived',
                                  quiet=False, sparse=False)
    num_test = len(test_examples.classes)
    test_ids = list(range(num_train_dev + 1, num_test + num_train_dev + 1))
    write_feature_file('titanic/test/.csv', test_ids,
                       test_examples.classes, test_examples.features,
                       feat_vectorizer=test_examples.feat_vectorizer,
                       subsets=subset_dict, label_col='Survived',
                       id_prefix='test_example')
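

# Assumed entry point: perform the split when this file is run as a script.
if __name__ == '__main__':
    main()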