# Build the dataset of (docname, category, wordcounts) tuples, one
# subdirectory of `path` per category (assumes `docs`, `distinct_labels`,
# and `curr_cat` were initialized earlier, e.g. as an empty RDD, {}, and 0).
for category_dir in listdir(path):
	distinct_labels[category_dir] = curr_cat
	curr_cat += 1
	next_docs = sc.wholeTextFiles('/'.join([path, category_dir]))
	# Bind category_dir as a default argument: Spark evaluates the lambda
	# lazily, so a plain closure would capture only the *last* loop value.
	docs = docs.union(next_docs.map(
		lambda kv, cat=category_dir: (kv[0], cat, wordcounts(kv[1], True, text_filter))))
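
# `wordcounts` and `text_filter` are defined elsewhere in this script. A
# minimal sketch of what `wordcounts(lines, lowercase, text_filter)` might
# do, assuming it returns (word, count) pairs for the filtered words of one
# document (signature inferred from the call above; the author's actual
# helper may differ):
def wordcounts(lines, lowercase, text_filter):
	from collections import Counter
	text = lines.lower() if lowercase else lines
	return list(Counter(w for w in text.split() if text_filter(w)).items())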

# Generate the list of all unique words in sorted order:
all_words = sorted(docs.flatMap(lambda rec: rec[2]).map(lambda wc: wc[0]).distinct().collect())

# Build a word-index map for use in making the sparse matrix:
word_index_map = {word: idx for idx, word in enumerate(all_words, start=1)}

# Generate the document-term matrix (dense format)
if do_dense:
	dense_row_gen = lambda rec: build_dense_row_string(distinct_labels[rec[1]], all_words, rec[2])
	dense_matrix = docs.map(dense_row_gen).reduce(lambda x, y: x + '\n' + y)
	write_file(dense_matrix, output_dir + dense_matrix_filename)
	
# Generate the document-term matrix (sparse format)
if do_sparse:
	sparse_row_gen = lambda rec: build_sparse_row_string(distinct_labels[rec[1]], word_index_map, rec[2])
	sparse_matrix = docs.map(sparse_row_gen).reduce(lambda x, y: x + '\n' + y)
	write_file(sparse_matrix, output_dir + sparse_matrix_filename)
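
# `build_dense_row_string` and `build_sparse_row_string` are defined
# elsewhere. Hedged sketches of plausible implementations, assuming `counts`
# is a list of (word, count) pairs; the actual output format is not shown in
# this snippet:
def build_dense_row_string(label, all_words, counts):
	# One column per vocabulary word, zero-filled; label first.
	lookup = dict(counts)
	return ','.join([str(label)] + [str(lookup.get(w, 0)) for w in all_words])

def build_sparse_row_string(label, word_index_map, counts):
	# "label index:count index:count ..." form with 1-based word indices.
	pairs = sorted((word_index_map[w], c) for w, c in counts)
	return ' '.join([str(label)] + ['%d:%d' % (i, c) for i, c in pairs])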
	
# Write output files
write_file('\n'.join(all_words), output_dir + words_filename)
write_file('\n'.join('%s,%s' % (cat, idx) for cat, idx in distinct_labels.items()), output_dir + labels_filename)
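
# `write_file` is assumed to be a small local helper; a minimal sketch
# (name and argument order taken from the call sites above):
def write_file(contents, filename):
	with open(filename, 'w') as f:
		f.write(contents)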
	
sc.stop()

	# Build the classification pipeline: tokenize, hash term frequencies,
	# weight by IDF, then train a logistic regression on the result.
	# (See the hedged setup sketch after this snippet for one way the
	# individual stages might have been constructed.)
	pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, lr])
	
	# Fit the pipeline to training documents.
	model = pipeline.fit(train)

	# Make predictions on test documents and collect the columns of interest.
	prediction = model.transform(test)
	selected = prediction.select("label", "prediction").collect()
	actual = [distinct_labels[row.label] for row in selected]
	predicted = [distinct_labels[row.prediction] for row in selected]
	print('---------- CONFUSION MATRIX ------------------------\n\n')
	confusion_matrix(predicted, actual)
	print('\n\n-----------------------------------------------------\n\n')
	# TODO: Incorporate confusion matrix and diagnostics
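
	# `confusion_matrix` is a helper defined elsewhere in this script; a
	# minimal sketch of one plausible implementation, printing counts of
	# (actual, predicted) label pairs (the real formatting may differ):
	def confusion_matrix(predicted, actual):
		from collections import Counter
		cells = Counter(zip(actual, predicted))
		labels = sorted(set(actual) | set(predicted))
		print('\t' + '\t'.join(labels))
		for a in labels:
			print(a + '\t' + '\t'.join(str(cells[(a, p)]) for p in labels))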
	
	if predict_dir is not None:
		# Classify unlabeled documents and report "(id, category)" pairs.
		pred_docs = sc.wholeTextFiles(predict_dir).map(
			lambda kv: (kv[0], format_text(kv[1])))
		UnlabeledDocument = Row("id", "text")
		for_pred = pred_docs.map(lambda x: UnlabeledDocument(*x)).toDF()
		predictions = model.transform(for_pred).select("id", "prediction").collect()
		report = ''.join('(%s, %s)\n' % (x.id, distinct_labels[x.prediction]) for x in predictions)
		write_file(report, output_file)
	sc.stop()
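
# For reference: this snippet assumes `tokenizer`, `hashingTF`, `idf`, `lr`,
# `train`, and `test` were built earlier in the script. A hedged sketch of
# one plausible setup using the standard pyspark.ml APIs; the column names,
# hyperparameters, and 80/20 split below are illustrative guesses, not the
# author's actual values:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression

tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)

# Labeled training rows would carry a numeric label column, e.g.:
# LabeledDocument = Row("id", "text", "label")
# train, test = labeled_df.randomSplit([0.8, 0.2])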