# Build the dataset of (docname, category, wordcounts) tuples, one
# sub-directory of `path` per category.
for category_dir in listdir(path):
    distinct_labels[category_dir] = curr_cat
    curr_cat += 1
    next_docs = sc.wholeTextFiles('/'.join([path, category_dir]))
    # NOTE: bind the current category as a default argument.  PySpark
    # serializes this closure lazily (when the job runs, after the loop
    # has finished), so capturing the loop variable directly would tag
    # every document with the *last* category seen.
    docs = docs.union(next_docs.map(
        lambda doc_lines, cat=category_dir:
            (doc_lines[0], cat, wordcounts(doc_lines[1], True, text_filter))))

# Generate the list of all unique words, in sorted order.
all_words = sorted(
    docs.flatMap(lambda rec: rec[2])     # rec = (doc, category, counts)
        .map(lambda wc: wc[0])           # wc  = (word, count)
        .distinct()
        .collect())

# Build a 1-based word -> index map for use in making the sparse matrix.
word_index_map = {word: i for i, word in enumerate(all_words, start=1)}

# Generate the document-term matrix (dense format).
if do_dense:
    # join() on the collected rows is linear, unlike repeated
    # string concatenation inside reduce(), and tolerates an empty RDD.
    dense_matrix = '\n'.join(
        docs.map(lambda rec: build_dense_row_string(
            distinct_labels[rec[1]], all_words, rec[2])).collect())
    write_file(dense_matrix, output_dir + dense_matrix_filename)

# Generate the document-term matrix (sparse format).
if do_sparse:
    sparse_matrix = '\n'.join(
        docs.map(lambda rec: build_sparse_row_string(
            distinct_labels[rec[1]], word_index_map, rec[2])).collect())
    write_file(sparse_matrix, output_dir + sparse_matrix_filename)

# Write the vocabulary and the label-name -> label-code mapping.
write_file('\n'.join(all_words), output_dir + words_filename)
write_file(
    '\n'.join(str(label) + ',' + str(code)
              for label, code in distinct_labels.items()),
    output_dir + labels_filename)

sc.stop()
# Generate the list of all unique words, in sorted order.
# Each docs record is (doc, category, counts); each counts entry is
# (word, count) -- TODO confirm against the wordcounts() helper.
all_words = sorted(
    docs.flatMap(lambda rec: rec[2])
        .map(lambda wc: wc[0])
        .distinct()
        .collect())

# Build a 1-based word -> index map for use in making the sparse matrix.
word_index_map = {word: i for i, word in enumerate(all_words, start=1)}

# Generate the document-term matrix (dense format).
if do_dense:
    # join() over collected rows is linear-time, unlike the quadratic
    # repeated concatenation of reduce(lambda x, y: x + '\n' + y), and
    # it does not raise on an empty RDD.
    dense_matrix = '\n'.join(
        docs.map(lambda rec: build_dense_row_string(
            distinct_labels[rec[1]], all_words, rec[2])).collect())
    write_file(dense_matrix, output_dir + dense_matrix_filename)

# Generate the document-term matrix (sparse format).
if do_sparse:
    sparse_matrix = '\n'.join(
        docs.map(lambda rec: build_sparse_row_string(
            distinct_labels[rec[1]], word_index_map, rec[2])).collect())
    write_file(sparse_matrix, output_dir + sparse_matrix_filename)

# Write the vocabulary and the label-name -> label-code mapping.
write_file('\n'.join(all_words), output_dir + words_filename)
write_file(
    '\n'.join(str(label) + ',' + str(code)
              for label, code in distinct_labels.items()),
    output_dir + labels_filename)
# Assemble the learning pipeline: tokenize -> term frequencies -> IDF
# re-weighting -> logistic regression.
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, lr])

# Fit the pipeline to training documents.
model = pipeline.fit(train)

# Make predictions on test documents and report a confusion matrix over
# the human-readable label names.
prediction = model.transform(test)
selected = prediction.select("label", "prediction").collect()
actual = [distinct_labels[row.label] for row in selected]
predicted = [distinct_labels[row.prediction] for row in selected]
print('---------- CONFUSION MATRIX ------------------------\n\n')
confusion_matrix(predicted, actual)
print('\n\n-----------------------------------------------------\n\n')

# TODO: Incorporate confusion matrix and diagnostics

# Optionally classify unlabeled documents found under predict_dir and
# write one "(doc_id, label)" line per document.
if predict_dir is not None:
    # Avoid shadowing the builtin `id` in the lambda parameter.
    pred_docs = sc.wholeTextFiles(predict_dir).map(
        lambda rec: (rec[0], format_text(rec[1])))
    UnlabeledDocument = Row("id", "text")
    for_pred = pred_docs.map(lambda x: UnlabeledDocument(*x)).toDF()
    predictions = model.transform(for_pred).select("id", "prediction").collect()
    # ''.join() is linear, unlike reduce()-based concatenation, and the
    # name `report` no longer shadows the builtin `str`.
    report = ''.join(
        '(' + p.id + ', ' + distinct_labels[p.prediction] + ')\n'
        for p in predictions)
    write_file(report, output_file)

sc.stop()
# Fit the pipeline to training documents.
model = pipeline.fit(train)

# Make predictions on test documents and report a confusion matrix over
# the human-readable label names.
prediction = model.transform(test)
selected = prediction.select("label", "prediction").collect()
actual = [distinct_labels[row.label] for row in selected]
predicted = [distinct_labels[row.prediction] for row in selected]
print('---------- CONFUSION MATRIX ------------------------\n\n')
confusion_matrix(predicted, actual)
print('\n\n-----------------------------------------------------\n\n')

# TODO: Incorporate confusion matrix and diagnostics

# Optionally classify unlabeled documents found under predict_dir and
# write one "(doc_id, label)" line per document.
if predict_dir is not None:
    # Avoid shadowing the builtin `id` in the lambda parameter.
    pred_docs = sc.wholeTextFiles(predict_dir).map(
        lambda rec: (rec[0], format_text(rec[1])))
    UnlabeledDocument = Row("id", "text")
    for_pred = pred_docs.map(lambda x: UnlabeledDocument(*x)).toDF()
    predictions = model.transform(for_pred).select("id", "prediction").collect()
    # ''.join() is linear, unlike reduce()-based concatenation, and the
    # name `report` no longer shadows the builtin `str`.
    report = ''.join(
        '(' + p.id + ', ' + distinct_labels[p.prediction] + ')\n'
        for p in predictions)
    write_file(report, output_file)

sc.stop()