def main():
    """Train and evaluate a LinearSVC on transcription + ivector features.

    Command-line arguments select the train/test partitions, the
    preprocessor directory, optional precomputed feature files, and optional
    output file names.

    When both ``--combined_training_features`` and
    ``--combined_test_features`` are given, the combined features are loaded
    through ``get_features_and_labels``. Otherwise the transcription features
    are loaded (or computed) first, the ivectors are read from
    ``../data/features/ivectors/<partition>/ivectors.json``, the two are
    merged with ``combine_feature_matrices``, and the resulting feature files
    are written out.

    Finally a ``LinearSVC`` is fit on the training features, predictions for
    the test partition are written with ``write_predictions_file``, and the
    results are shown with ``display_classification_results``.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--train',
                   help='Name of training partition. "train" by default. This '
                        'should be the name of a directory in "../data/essays/"'
                        ' as well as "../data/features/"',
                   default='train')
    p.add_argument('--test',
                   help='Name of the testing partition. "dev" by default. This '
                        'should be the name of a directory in "../data/essays/"'
                        ' as well as "../data/features/"',
                   default='dev')
    p.add_argument('--preprocessor',
                   help='Name of directory with processed essay files. '
                        '"tokenized" by default.',
                   default='tokenized')
    p.add_argument('--combined_training_features',
                   help='Path to file containing combined transcription and '
                        'ivector train features')
    p.add_argument('--combined_test_features',
                   help='Path to file containing combined transcription and '
                        'ivector test features')
    p.add_argument('--transcription_training_features',
                   help='Path to file containing precomputed training features.'
                        ' None by default. Should be located in '
                        '../data/features/<train_partition_name>/')
    p.add_argument('--transcription_test_features',
                   help='Path to file containing precomputed test features. '
                        'None by default. Should be located in '
                        '../data/features/<test_partition_name>/')
    p.add_argument('--feature_outfile_name',
                   help='Custom name, if desired, for output feature files '
                        'to be written to '
                        '../data/features/speech_with_ivectors/'
                        '<train_partition_name>/ and '
                        '../data/features/speech_with_ivectors/'
                        '<test_partition_name>. '
                        'If none provided, feature files will be named using '
                        'the date and time. If precomputed feature files are '
                        'provided, this argument will be ignored.')
    p.add_argument('--predictions_outfile_name',
                   help='Custom name, if desired, for predictions file to be '
                        'written to ../predictions/essays/. If none provided, '
                        'predictions file will be named using the date and '
                        'time.')
    args = p.parse_args()

    train_partition_name = args.train
    test_partition_name = args.test
    preprocessor = args.preprocessor
    combined_feature_file_train = args.combined_training_features
    combined_feature_file_test = args.combined_test_features
    transcription_feature_file_train = args.transcription_training_features
    transcription_feature_file_test = args.transcription_test_features
    feature_outfile_name = args.feature_outfile_name
    predictions_outfile_name = args.predictions_outfile_name

    #
    # Define Vectorizer and Transformer
    #
    vectorizer = CountVectorizer(input="filename")
    transformer = Normalizer()  # Normalize frequencies to unit length

    #
    # Load the training and test features and labels
    #
    if not (combined_feature_file_train and combined_feature_file_test):
        transcription_data = get_features_and_labels(
            train_partition_name,
            test_partition_name,
            transcription_feature_file_train,
            transcription_feature_file_test,
            baseline='speech_transcriptions',
            preprocessor=preprocessor,
            vectorizer=vectorizer,
            transformer=transformer)

        transcription_train_matrix, encoded_train_labels, \
            original_training_labels = transcription_data[0]
        transcription_test_matrix, encoded_test_labels, \
            original_test_labels = transcription_data[1]
        print("Loaded transcription features.")

        ivectors_path = ('../data/features/ivectors/{partition}/'
                         'ivectors.json')

        train_path = ivectors_path.format(partition=train_partition_name)
        # Use a context manager so the JSON file handle is closed promptly
        # instead of being left for the garbage collector.
        with open(train_path) as train_file:
            train_ivectors_dict = json.load(train_file)
        ivectors_train_matrix = ivectors_dict_to_features(
            train_ivectors_dict, train_partition_name, mat_format=csr_matrix)
        combined_train_features = combine_feature_matrices(
            transcription_train_matrix, ivectors_train_matrix, sparse=True)
        print("Combined transcription features with ivectors for {}".format(
            train_partition_name))

        test_path = ivectors_path.format(partition=test_partition_name)
        with open(test_path) as test_file:
            test_ivectors_dict = json.load(test_file)
        ivectors_test_matrix = ivectors_dict_to_features(
            test_ivectors_dict, test_partition_name, mat_format=csr_matrix)
        combined_test_features = combine_feature_matrices(
            transcription_test_matrix, ivectors_test_matrix, sparse=True)
        print("Combined transcription features with ivectors for {}".format(
            test_partition_name))

        #
        # Write speech transcription features to files if they do not yet
        # exist
        #
        if not (transcription_feature_file_train
                and transcription_feature_file_test):
            write_feature_files(train_partition_name, feature_outfile_name,
                                'speech_transcriptions',
                                transcription_train_matrix,
                                encoded_train_labels)
            write_feature_files(test_partition_name, feature_outfile_name,
                                'speech_transcriptions',
                                transcription_test_matrix,
                                encoded_test_labels)

        #
        # Write combined transcription + ivector features to file
        #
        write_feature_files(train_partition_name, feature_outfile_name,
                            BASELINE, combined_train_features,
                            encoded_train_labels)
        write_feature_files(test_partition_name, feature_outfile_name,
                            BASELINE, combined_test_features,
                            encoded_test_labels)
    else:
        combined_train_test_data = get_features_and_labels(
            train_partition_name,
            test_partition_name,
            combined_feature_file_train,
            combined_feature_file_test,
            baseline=BASELINE)

        combined_train_features, encoded_train_labels, \
            original_training_labels = combined_train_test_data[0]
        combined_test_features, encoded_test_labels, \
            original_test_labels = combined_train_test_data[1]

    #
    # Train classifier and predict
    #
    clf = LinearSVC()
    print("Training the classifier...")
    clf.fit(combined_train_features, encoded_train_labels)  # Linear kernel SVM
    predicted = clf.predict(combined_test_features)

    #
    # Write predictions and display report
    #
    write_predictions_file(predicted, test_partition_name,
                           predictions_outfile_name, BASELINE)
    display_classification_results(encoded_test_labels, predicted)
def main():
    """Train and evaluate a LinearSVC on a single baseline's features.

    Command-line arguments select the train/test partitions, the
    preprocessor directory, optional precomputed feature files, and optional
    output file names.

    Features and labels for both partitions are obtained from
    ``get_features_and_labels`` using the module-level ``BASELINE``. Unless
    both precomputed feature files were supplied, the feature matrices are
    written out with ``write_feature_files``. A ``LinearSVC`` is then fit on
    the training matrix, predictions for the test partition are written with
    ``write_predictions_file``, and the results are shown with
    ``display_classification_results``.
    """
    p = argparse.ArgumentParser()
    p.add_argument(
        '--train',
        help='Name of training partition. "train" by default. This should be '
             'the name of a directory '
             'in "../data/essays/" as well as "../data/features/"',
        default='train')
    p.add_argument(
        '--test',
        help='Name of the testing partition. "dev" by default. This should be '
             'the name of a directory '
             'in "../data/essays/" as well as "../data/features/"',
        default='dev')
    p.add_argument(
        '--preprocessor',
        help='Name of directory with processed essay files. '
             '"tokenized" by default.',
        default='tokenized')
    p.add_argument(
        '--training_features',
        help='Path to file containing precomputed training features. '
             'None by default. '
             'Should be located in ../data/features/<train_partition_name>/')
    # The help fragments below each end with a trailing space so that
    # adjacent-literal concatenation does not glue sentences together.
    p.add_argument(
        '--test_features',
        help='Path to file containing precomputed test features. '
             'None by default. '
             'Should be located in ../data/features/<test_partition_name>/')
    p.add_argument(
        '--feature_outfile_name',
        help='Custom name, if desired, for output feature files to be '
             'written to '
             '../data/features/essays/<train_partition_name>/ and '
             '../data/features/essays/<test_partition_name>. '
             'If none provided, feature files will be named using the date '
             'and time. '
             'If precomputed feature files are provided, this argument will '
             'be ignored.')
    p.add_argument(
        '--predictions_outfile_name',
        help='Custom name, if desired, for predictions file to be written to '
             '../predictions/essays/. '
             'If none provided, predictions file will be named using the '
             'date and time.')
    args = p.parse_args()

    train_partition_name = args.train
    test_partition_name = args.test
    preprocessor = args.preprocessor
    feature_file_train = args.training_features
    feature_file_test = args.test_features
    feature_outfile_name = args.feature_outfile_name
    predictions_outfile_name = args.predictions_outfile_name

    #
    # Define Vectorizer and Transformer
    #
    vectorizer = CountVectorizer(input="filename")
    transformer = Normalizer()  # Normalize frequencies to unit length

    #
    # Load the training and test features and labels
    #
    training_and_test_data = get_features_and_labels(
        train_partition_name,
        test_partition_name,
        feature_file_train,
        feature_file_test,
        baseline=BASELINE,
        preprocessor=preprocessor,
        vectorizer=vectorizer,
        transformer=transformer)

    train_matrix, encoded_train_labels, original_training_labels = \
        training_and_test_data[0]
    test_matrix, encoded_test_labels, original_test_labels = \
        training_and_test_data[1]

    #
    # Write features to feature files if they are new
    #
    if not (feature_file_train and feature_file_test):
        write_feature_files(train_partition_name, feature_outfile_name,
                            BASELINE, train_matrix, encoded_train_labels)
        write_feature_files(test_partition_name, feature_outfile_name,
                            BASELINE, test_matrix, encoded_test_labels)

    #
    # Run the classifier
    #
    clf = LinearSVC()
    print("Training the classifier...")
    clf.fit(train_matrix, encoded_train_labels)  # Linear kernel SVM
    predicted = clf.predict(test_matrix)

    #
    # Write predictions and display report
    #
    write_predictions_file(predicted, test_partition_name,
                           predictions_outfile_name, BASELINE)
    display_classification_results(encoded_test_labels, predicted)
def main():
    """Train and evaluate a LinearSVC on fused essay + speech features.

    Command-line arguments select the train/test partitions, the
    preprocessor directory, optional precomputed essay / transcription /
    combined feature files, and optional output file names.

    When both combined feature files are given they are loaded through
    ``get_features_and_labels``. Otherwise essay and speech-transcription
    features are obtained separately, their original labels are asserted to
    match, the matrices are merged with ``combine_feature_matrices``, and any
    feature files that were not supplied are written with
    ``write_feature_files``.

    Finally a ``LinearSVC`` is fit on the combined training matrix,
    predictions for the test partition are written with
    ``write_predictions_file``, and the results are shown with
    ``display_classification_results``.

    Raises:
        AssertionError: if the essay and speech label sequences differ for
            either partition, or if the combined train and test matrices
            have a different number of columns.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--train',
                   help='Name of training partition. "train" by default. This '
                        'should be the name of a directory in "../data/essays/"'
                        ' as well as "../data/features/"',
                   default='train')
    p.add_argument('--test',
                   help='Name of the testing partition. "dev" by default. This'
                        ' should be the name of a directory in '
                        '"../data/essays/" as well as "../data/features/"',
                   default='dev')
    p.add_argument('--preprocessor',
                   help='Name of directory with processed essay files. '
                        '"tokenized" by default.',
                   default='tokenized')
    p.add_argument('--essay_training_features',
                   help='Path to file containing precomputed training features.'
                        ' None by default. Should be located in '
                        '../data/features/essays/<train_partition_name>/')
    p.add_argument('--essay_test_features',
                   help='Path to file containing precomputed test features. '
                        'None by default. Should be located in '
                        '../data/features/essays/<test_partition_name>/')
    p.add_argument('--transcription_training_features',
                   help='Path to file containing precomputed training features.'
                        ' None by default. Should be located in '
                        '../data/features/speech_transcriptions'
                        '/<train_partition_name>/')
    p.add_argument('--transcription_test_features',
                   help='Path to file containing precomputed test features. '
                        'None by default. Should be located in '
                        '../data/features/speech_transcriptions'
                        '/<test_partition_name>/')
    p.add_argument('--combined_training_features',
                   help='Path to file containing precomputed combined training'
                        ' features. Should be located in ../data/features'
                        '/fusion/<train_partition_name>')
    p.add_argument('--combined_test_features',
                   help='Path to file containing precomputed combined test '
                        'features. Should be located in '
                        '../data/features/fusion/<test_partition_name>')
    p.add_argument('--feature_outfile_name',
                   help='Custom name, if desired, for output feature files to '
                        'be written to '
                        '../data/features/essays/<train_partition_name>/ and '
                        '../data/features/essays/<test_partition_name>. '
                        'If none provided, feature files will be named using '
                        'the date and time. If precomputed feature files are '
                        'provided, this argument will be ignored.')
    p.add_argument('--predictions_outfile_name',
                   help='Custom name, if desired, for predictions file to be '
                        'written to ../predictions/essays/. If none provided, '
                        'predictions file will be named using the date and '
                        'time.')
    args = p.parse_args()

    training_partition_name = args.train
    test_partition_name = args.test
    preprocessor = args.preprocessor
    essay_train_feature_file = args.essay_training_features
    essay_test_feature_file = args.essay_test_features
    transcription_train_feature_file = args.transcription_training_features
    transcription_test_feature_file = args.transcription_test_features
    combined_train_feature_file = args.combined_training_features
    combined_test_feature_file = args.combined_test_features
    feature_outfile_name = args.feature_outfile_name
    predictions_outfile_name = args.predictions_outfile_name

    #
    # Define Vectorizers and Transformers for both essay data and
    # speech_transcriptions. These will be ignored if you provide paths to
    # pre-computed feature files.
    #
    essay_vectorizer, essay_transformer = \
        CountVectorizer(input="filename"), Normalizer()
    speech_vectorizer, speech_transformer = \
        CountVectorizer(input="filename"), Normalizer()

    if not (combined_train_feature_file and combined_test_feature_file):
        #
        # Get essay features.
        #
        essay_train_and_test_data = get_features_and_labels(
            training_partition_name,
            test_partition_name,
            essay_train_feature_file,
            essay_test_feature_file,
            baseline='essays',
            preprocessor=preprocessor,
            vectorizer=essay_vectorizer,
            transformer=essay_transformer)

        essay_train_matrix, \
            essay_encoded_train_labels, \
            essay_original_train_labels = essay_train_and_test_data[0]
        essay_test_matrix, \
            essay_encoded_test_labels, \
            essay_original_test_labels = essay_train_and_test_data[1]
        print("Retrieved essay features.")

        #
        # Get speech features.
        #
        speech_train_and_test_data = get_features_and_labels(
            training_partition_name,
            test_partition_name,
            transcription_train_feature_file,
            transcription_test_feature_file,
            baseline='speech_transcriptions',
            preprocessor=preprocessor,
            vectorizer=speech_vectorizer,
            transformer=speech_transformer)

        speech_train_matrix, \
            speech_encoded_train_labels, \
            speech_original_train_labels = speech_train_and_test_data[0]
        speech_test_matrix, \
            speech_encoded_test_labels, \
            speech_original_test_labels = speech_train_and_test_data[1]
        print("Retrieved speech transcription features.")

        # Both modalities must be row-aligned before they can be stacked.
        # The test check previously compared the speech labels with
        # themselves and therefore could never fail.
        assert (speech_original_train_labels == essay_original_train_labels)
        assert (speech_original_test_labels == essay_original_test_labels)

        #
        # Concatenate (horizontally stack) essay and speech feature matrices.
        #
        combined_train_matrix = combine_feature_matrices(essay_train_matrix,
                                                         speech_train_matrix,
                                                         sparse=True)
        print("Finished combining essay and speech transcription "
              "train matrices.")
        combined_test_matrix = combine_feature_matrices(essay_test_matrix,
                                                        speech_test_matrix,
                                                        sparse=True)
        print("Finished combining essay and speech transcription "
              "test matrices.")
        assert (
            combined_train_matrix.shape[1] == combined_test_matrix.shape[1])

        combined_encoded_train_labels = essay_encoded_train_labels
        combined_encoded_test_labels = essay_encoded_test_labels

        #
        # Write feature files if not provided.
        #
        if not (essay_train_feature_file and essay_test_feature_file):
            write_feature_files(training_partition_name, feature_outfile_name,
                                'essays', essay_train_matrix,
                                essay_encoded_train_labels)
            write_feature_files(test_partition_name, feature_outfile_name,
                                'essays', essay_test_matrix,
                                essay_encoded_test_labels)

        if not (transcription_train_feature_file
                and transcription_test_feature_file):
            write_feature_files(training_partition_name, feature_outfile_name,
                                'speech_transcriptions', speech_train_matrix,
                                speech_encoded_train_labels)
            write_feature_files(test_partition_name, feature_outfile_name,
                                'speech_transcriptions', speech_test_matrix,
                                speech_encoded_test_labels)

        write_feature_files(training_partition_name, feature_outfile_name,
                            BASELINE, combined_train_matrix,
                            speech_encoded_train_labels)
        write_feature_files(test_partition_name, feature_outfile_name,
                            BASELINE, combined_test_matrix,
                            speech_encoded_test_labels)
    else:
        # NOTE(review): no ``baseline`` is passed here, so the helper's
        # default applies - confirm that it resolves the combined feature
        # files from the intended directory.
        combined_train_and_test_data = get_features_and_labels(
            training_partition_name,
            test_partition_name,
            combined_train_feature_file,
            combined_test_feature_file)

        combined_train_matrix, \
            combined_encoded_train_labels, \
            combined_original_train_labels = combined_train_and_test_data[0]
        combined_test_matrix, \
            combined_encoded_test_labels, \
            combined_original_test_labels = combined_train_and_test_data[1]

    #
    # Train classifier, make predictions, and display results.
    #
    clf = LinearSVC()
    clf.fit(combined_train_matrix, combined_encoded_train_labels)
    predictions = clf.predict(combined_test_matrix)

    write_predictions_file(predictions, test_partition_name,
                           predictions_outfile_name, BASELINE)
    display_classification_results(combined_encoded_test_labels, predictions)