def _load_ivector_matrix(partition_name):
    """Read a partition's i-vector JSON file and convert it to a matrix.

    The file is read from
    ``../data/features/ivectors/<partition_name>/ivectors.json`` and is
    closed as soon as it has been parsed. The parsed dictionary is handed
    to ``ivectors_dict_to_features`` with ``mat_format=csr_matrix``.
    """
    ivectors_path = ('../data/features/ivectors/{partition}/'
                     'ivectors.json').format(partition=partition_name)
    # Use a context manager so the file handle is released deterministically
    # instead of being left to the garbage collector.
    with open(ivectors_path) as ivectors_file:
        ivectors_dict = json.load(ivectors_file)
    return ivectors_dict_to_features(
        ivectors_dict, partition_name, mat_format=csr_matrix)


def main():
    """Train and evaluate a LinearSVC on transcription + i-vector features.

    Command-line options select the train/test partitions, the preprocessor
    directory, optional precomputed feature files and output file names.

    If both combined feature files are supplied they are loaded directly.
    Otherwise the transcription features are obtained through
    ``get_features_and_labels``, combined with the i-vectors of each
    partition, and written out with ``write_feature_files`` (the
    transcription features themselves are only written when they were not
    supplied as precomputed files).

    Finally a ``LinearSVC`` is fitted on the combined training features,
    the predictions for the test partition are written with
    ``write_predictions_file`` and the results are displayed.
    """
    p = argparse.ArgumentParser()

    p.add_argument('--train',
                   help='Name of training partition. "train" by default. This '
                   'should be the name of a directory in "../data/essays/"'
                   ' as well as "../data/features/"',
                   default='train')

    p.add_argument(
        '--test',
        help='Name of the testing partition. "dev" by default. This '
        'should be the name of a directory in "../data/essays/"'
        ' as well as "../data/features/"',
        default='dev')

    p.add_argument('--preprocessor',
                   help='Name of directory with processed essay files. '
                   '"tokenized" by default.',
                   default='tokenized')

    p.add_argument('--combined_training_features',
                   help='Path to file containing combined transcription and '
                   'ivector train features')

    p.add_argument('--combined_test_features',
                   help='Path to file containing combined transcription and '
                   'ivector test features')

    p.add_argument(
        '--transcription_training_features',
        help='Path to file containing precomputed training features.'
        ' None by default. Should be located in '
        '../data/features/<train_partition_name>/')

    p.add_argument('--transcription_test_features',
                   help='Path to file containing precomputed test features. '
                   'None by default. Should be located in '
                   '../data/features/<test_partition_name>/')

    p.add_argument(
        '--feature_outfile_name',
        help='Custom name, if desired, for output feature files '
        'to be written to '
        '../data/features/speech_with_ivectors/<train_partition_name>/ and '
        '../data/features/speech_with_ivectors/<test_partition_name>. '
        'If none provided, feature files will be named using '
        'the date and time. If precomputed feature files are '
        'provided, this argument will be ignored.')

    p.add_argument('--predictions_outfile_name',
                   help='Custom name, if desired, for predictions file to be '
                   'written to ../predictions/essays/. If none provided, '
                   'predictions file will be named using the date and '
                   'time.')

    args = p.parse_args()
    train_partition_name = args.train
    test_partition_name = args.test
    preprocessor = args.preprocessor
    combined_feature_file_train = args.combined_training_features
    combined_feature_file_test = args.combined_test_features
    transcription_feature_file_train = args.transcription_training_features
    transcription_feature_file_test = args.transcription_test_features
    feature_outfile_name = args.feature_outfile_name
    predictions_outfile_name = args.predictions_outfile_name

    #
    # Define Vectorizer and Transformer
    #
    vectorizer = CountVectorizer(input="filename")
    transformer = Normalizer()  # Normalize frequencies to unit length

    #
    # Load the training and test features and labels
    #
    if not (combined_feature_file_train and combined_feature_file_test):
        transcription_data = get_features_and_labels(
            train_partition_name,
            test_partition_name,
            transcription_feature_file_train,
            transcription_feature_file_test,
            baseline='speech_transcriptions',
            preprocessor=preprocessor,
            vectorizer=vectorizer,
            transformer=transformer)

        # Each entry is (feature matrix, encoded labels, original labels);
        # the original labels are not needed here.
        transcription_train_matrix, encoded_train_labels, _ = \
            transcription_data[0]
        transcription_test_matrix, encoded_test_labels, _ = \
            transcription_data[1]
        print("Loaded transcription features.")

        ivectors_train_matrix = _load_ivector_matrix(train_partition_name)
        combined_train_features = combine_feature_matrices(
            transcription_train_matrix, ivectors_train_matrix, sparse=True)
        print("Combined transcription features with ivectors for {}".format(
            train_partition_name))

        ivectors_test_matrix = _load_ivector_matrix(test_partition_name)
        combined_test_features = combine_feature_matrices(
            transcription_test_matrix, ivectors_test_matrix, sparse=True)
        print("Combined transcription features with ivectors for {}".format(
            test_partition_name))

        #
        # Write speech transcription features to files if they do not yet exist
        #
        if not (transcription_feature_file_train
                and transcription_feature_file_test):
            write_feature_files(train_partition_name, feature_outfile_name,
                                'speech_transcriptions',
                                transcription_train_matrix,
                                encoded_train_labels)

            write_feature_files(test_partition_name, feature_outfile_name,
                                'speech_transcriptions',
                                transcription_test_matrix, encoded_test_labels)

        #
        # Write combined transcription + ivector features to file
        #
        write_feature_files(train_partition_name, feature_outfile_name,
                            BASELINE, combined_train_features,
                            encoded_train_labels)
        write_feature_files(test_partition_name, feature_outfile_name,
                            BASELINE, combined_test_features,
                            encoded_test_labels)

    else:
        # Both combined feature files were given: load them as-is.
        combined_train_test_data = get_features_and_labels(
            train_partition_name,
            test_partition_name,
            combined_feature_file_train,
            combined_feature_file_test,
            baseline=BASELINE)
        combined_train_features, encoded_train_labels, _ = \
            combined_train_test_data[0]
        combined_test_features, encoded_test_labels, _ = \
            combined_train_test_data[1]
    #
    # Train classifier and predict
    #
    clf = LinearSVC()
    print("Training the classifier...")
    clf.fit(combined_train_features, encoded_train_labels)  # Linear kernel SVM
    predicted = clf.predict(combined_test_features)

    #
    # Write predictions and display report
    #
    write_predictions_file(predicted, test_partition_name,
                           predictions_outfile_name, BASELINE)
    display_classification_results(encoded_test_labels, predicted)
Esempio n. 2
0
def main():
    """Train and evaluate a LinearSVC on the features of one baseline.

    Command-line options select the train/test partitions, the preprocessor
    directory, optional precomputed feature files and output file names.

    Features and labels are obtained through ``get_features_and_labels``.
    Unless both precomputed feature files were supplied, the feature
    matrices are written out with ``write_feature_files``. A ``LinearSVC``
    is then fitted on the training matrix, the predictions for the test
    partition are written with ``write_predictions_file`` and the results
    are displayed.
    """
    p = argparse.ArgumentParser()

    # Adjacent string literals are concatenated verbatim, so every fragment
    # that continues a sentence must end with (or start with) a space.
    p.add_argument(
        '--train',
        help='Name of training partition. "train" by default. This should be '
        'the name of a directory '
        'in "../data/essays/" as well as "../data/features/"',
        default='train')
    p.add_argument(
        '--test',
        help='Name of the testing partition. "dev" by default. This should be '
        'the name of a directory '
        'in "../data/essays/" as well as "../data/features/"',
        default='dev')
    p.add_argument(
        '--preprocessor',
        help='Name of directory with processed essay files. '
        '"tokenized" by default.',
        default='tokenized')
    p.add_argument(
        '--training_features',
        help='Path to file containing precomputed training features. '
        'None by default. '
        'Should be located in ../data/features/<train_partition_name>/')
    p.add_argument(
        '--test_features',
        help='Path to file containing precomputed test features. '
        'None by default. '
        'Should be located in ../data/features/<test_partition_name>/')
    p.add_argument(
        '--feature_outfile_name',
        help='Custom name, if desired, for output feature files to be '
        'written to '
        '../data/features/essays/<train_partition_name>/ and '
        '../data/features/essays/<test_partition_name>. '
        'If none provided, feature files will be named using the date and '
        'time. '
        'If precomputed feature files are provided, this argument will be '
        'ignored.')
    p.add_argument(
        '--predictions_outfile_name',
        help='Custom name, if desired, for predictions file to be written to '
        '../predictions/essays/. '
        'If none provided, predictions file will be named using the date and '
        'time.')
    args = p.parse_args()
    train_partition_name = args.train
    test_partition_name = args.test
    preprocessor = args.preprocessor
    feature_file_train = args.training_features
    feature_file_test = args.test_features
    feature_outfile_name = args.feature_outfile_name
    predictions_outfile_name = args.predictions_outfile_name

    #
    # Define Vectorizer and Transformer
    #
    vectorizer = CountVectorizer(input="filename")
    transformer = Normalizer()  # Normalize frequencies to unit length

    #
    # Load the training and test features and labels
    #
    training_and_test_data = get_features_and_labels(
        train_partition_name,
        test_partition_name,
        feature_file_train,
        feature_file_test,
        baseline=BASELINE,
        preprocessor=preprocessor,
        vectorizer=vectorizer,
        transformer=transformer)

    # Each entry is (feature matrix, encoded labels, original labels); the
    # original labels are not needed here.
    train_matrix, encoded_train_labels, _ = training_and_test_data[0]
    test_matrix, encoded_test_labels, _ = training_and_test_data[1]

    #
    # Write features to feature files if they are new
    #
    if not (feature_file_train and feature_file_test):
        write_feature_files(train_partition_name, feature_outfile_name,
                            BASELINE, train_matrix, encoded_train_labels)
        write_feature_files(test_partition_name, feature_outfile_name,
                            BASELINE, test_matrix, encoded_test_labels)

    #
    # Run the classifier
    #
    clf = LinearSVC()
    print("Training the classifier...")
    clf.fit(train_matrix, encoded_train_labels)  # Linear kernel SVM
    predicted = clf.predict(test_matrix)

    #
    # Write predictions and display report
    #
    write_predictions_file(predicted, test_partition_name,
                           predictions_outfile_name, BASELINE)
    display_classification_results(encoded_test_labels, predicted)
Esempio n. 3
0
def main():
    """Train and evaluate a LinearSVC on fused essay + speech features.

    Command-line options select the train/test partitions, the preprocessor
    directory, optional precomputed feature files (essay, transcription or
    already-combined) and output file names.

    If both combined feature files are supplied they are loaded directly.
    Otherwise essay and speech-transcription features are obtained
    separately through ``get_features_and_labels``, checked to carry the
    same original labels, combined with ``combine_feature_matrices`` and
    written out with ``write_feature_files`` (the per-modality features are
    only written when they were not supplied as precomputed files).

    Finally a ``LinearSVC`` is fitted on the combined training matrix, the
    predictions for the test partition are written with
    ``write_predictions_file`` and the results are displayed.
    """
    p = argparse.ArgumentParser()

    p.add_argument('--train',
                   help='Name of training partition. "train" by default. This '
                   'should be the name of a directory in "../data/essays/"'
                   ' as well as "../data/features/"',
                   default='train')

    p.add_argument('--test',
                   help='Name of the testing partition. "dev" by default. This'
                   ' should be the name of a directory in '
                   '"../data/essays/" as well as "../data/features/"',
                   default='dev')

    p.add_argument('--preprocessor',
                   help='Name of directory with processed essay files. '
                   '"tokenized" by default.',
                   default='tokenized')

    p.add_argument(
        '--essay_training_features',
        help='Path to file containing precomputed training features.'
        ' None by default. Should be located in '
        '../data/features/essays/<train_partition_name>/')

    p.add_argument('--essay_test_features',
                   help='Path to file containing precomputed test features. '
                   'None by default. Should be located in '
                   '../data/features/essays/<test_partition_name>/')

    p.add_argument(
        '--transcription_training_features',
        help='Path to file containing precomputed training features.'
        ' None by default. Should be located in '
        '../data/features/speech_transcriptions'
        '/<train_partition_name>/')

    p.add_argument('--transcription_test_features',
                   help='Path to file containing precomputed test features. '
                   'None by default. Should be located in '
                   '../data/features/speech_transcriptions'
                   '/<test_partition_name>/')

    p.add_argument('--combined_training_features',
                   help='Path to file containing precomputed combined training'
                   ' features. Should be located in ../data/features'
                   '/fusion/<train_partition_name>')

    p.add_argument('--combined_test_features',
                   help='Path to file containing precomputed combined test '
                   'features. Should be located in '
                   '../data/features/fusion/<test_partition_name>')

    p.add_argument('--feature_outfile_name',
                   help='Custom name, if desired, for output feature files to '
                   'be written to '
                   '../data/features/essays/<train_partition_name>/ and '
                   '../data/features/essays/<test_partition_name>. '
                   'If none provided, feature files will be named using '
                   'the date and time. If precomputed feature files are '
                   'provided, this argument will be ignored.')

    p.add_argument('--predictions_outfile_name',
                   help='Custom name, if desired, for predictions file to be '
                   'written to ../predictions/essays/. If none provided, '
                   'predictions file will be named using the date and '
                   'time.')

    args = p.parse_args()
    training_partition_name = args.train
    test_partition_name = args.test
    preprocessor = args.preprocessor
    essay_train_feature_file = args.essay_training_features
    essay_test_feature_file = args.essay_test_features
    transcription_train_feature_file = args.transcription_training_features
    transcription_test_feature_file = args.transcription_test_features
    combined_train_feature_file = args.combined_training_features
    combined_test_feature_file = args.combined_test_features
    feature_outfile_name = args.feature_outfile_name
    predictions_outfile_name = args.predictions_outfile_name

    #
    # Define Vectorizers and Transformers for both essay data and
    # speech_transcriptions. These will be ignored if you provide paths to
    # pre-computed feature files.
    #
    essay_vectorizer = CountVectorizer(input="filename")
    essay_transformer = Normalizer()
    speech_vectorizer = CountVectorizer(input="filename")
    speech_transformer = Normalizer()

    if not (combined_train_feature_file and combined_test_feature_file):
        #
        # Get essay features.
        #
        essay_train_and_test_data = get_features_and_labels(
            training_partition_name,
            test_partition_name,
            essay_train_feature_file,
            essay_test_feature_file,
            baseline='essays',
            preprocessor=preprocessor,
            vectorizer=essay_vectorizer,
            transformer=essay_transformer)

        (essay_train_matrix,
         essay_encoded_train_labels,
         essay_original_train_labels) = essay_train_and_test_data[0]

        (essay_test_matrix,
         essay_encoded_test_labels,
         essay_original_test_labels) = essay_train_and_test_data[1]

        print("Retrieved essay features.")

        #
        # Get speech features.
        #
        speech_train_and_test_data = get_features_and_labels(
            training_partition_name,
            test_partition_name,
            transcription_train_feature_file,
            transcription_test_feature_file,
            baseline='speech_transcriptions',
            preprocessor=preprocessor,
            vectorizer=speech_vectorizer,
            transformer=speech_transformer)

        (speech_train_matrix,
         speech_encoded_train_labels,
         speech_original_train_labels) = speech_train_and_test_data[0]

        (speech_test_matrix,
         speech_encoded_test_labels,
         speech_original_test_labels) = speech_train_and_test_data[1]

        print("Retrieved speech transcription features.")

        # Both modalities must describe the same samples in the same order,
        # otherwise stacking their matrices row-wise would be meaningless.
        # The test check previously compared the speech labels with
        # themselves and therefore could never fail.
        assert (speech_original_train_labels == essay_original_train_labels)
        assert (speech_original_test_labels == essay_original_test_labels)

        #
        # Concatenate (horizontally stack) essay and speech feature matrices.
        #
        combined_train_matrix = combine_feature_matrices(essay_train_matrix,
                                                         speech_train_matrix,
                                                         sparse=True)
        print("Finished combining essay and speech transcription "
              "train matrices.")
        combined_test_matrix = combine_feature_matrices(essay_test_matrix,
                                                        speech_test_matrix,
                                                        sparse=True)
        print("Finished combining essay and speech transcription "
              "test matrices.")
        # Train and test must share one feature space for the classifier.
        assert (
            combined_train_matrix.shape[1] == combined_test_matrix.shape[1])

        combined_encoded_train_labels = essay_encoded_train_labels
        combined_encoded_test_labels = essay_encoded_test_labels

        #
        # Write feature files if not provided.
        #
        if not (essay_train_feature_file and essay_test_feature_file):
            write_feature_files(training_partition_name, feature_outfile_name,
                                'essays', essay_train_matrix,
                                essay_encoded_train_labels)

            write_feature_files(test_partition_name, feature_outfile_name,
                                'essays', essay_test_matrix,
                                essay_encoded_test_labels)

        if not (transcription_train_feature_file
                and transcription_test_feature_file):
            write_feature_files(training_partition_name, feature_outfile_name,
                                'speech_transcriptions', speech_train_matrix,
                                speech_encoded_train_labels)

            write_feature_files(test_partition_name, feature_outfile_name,
                                'speech_transcriptions', speech_test_matrix,
                                speech_encoded_test_labels)

        write_feature_files(training_partition_name, feature_outfile_name,
                            BASELINE, combined_train_matrix,
                            speech_encoded_train_labels)

        write_feature_files(test_partition_name, feature_outfile_name,
                            BASELINE, combined_test_matrix,
                            speech_encoded_test_labels)

    else:
        # Both combined feature files were given: load them as-is.
        # NOTE(review): no ``baseline`` is passed here, so the callee's
        # default applies -- confirm that it matches BASELINE.
        combined_train_and_test_data = get_features_and_labels(
            training_partition_name, test_partition_name,
            combined_train_feature_file, combined_test_feature_file)

        # Original (unencoded) labels are not needed in this branch.
        (combined_train_matrix,
         combined_encoded_train_labels,
         _) = combined_train_and_test_data[0]

        (combined_test_matrix,
         combined_encoded_test_labels,
         _) = combined_train_and_test_data[1]

    #
    # Train classifier, make predictions, and display results.
    #
    clf = LinearSVC()
    clf.fit(combined_train_matrix, combined_encoded_train_labels)
    predictions = clf.predict(combined_test_matrix)

    write_predictions_file(predictions, test_partition_name,
                           predictions_outfile_name, BASELINE)
    display_classification_results(combined_encoded_test_labels, predictions)