def clusterDocument(title, abstract, tokenizer, token2indexMap, tfidf_model, lsi_model, gmm_model):
    """Assign a GMM cluster id to a single document.

    The title and abstract are concatenated, tokenized, and turned into
    token counts; the counts are vectorized against token2indexMap into a
    one-row CSR matrix, reweighted by tfidf_model, projected by lsi_model,
    and finally classified by gmm_model.

    Returns the predicted cluster label (first element of the one-row
    prediction array).
    """
    full_text = title + " " + abstract
    token_counts = groupAndCount(tokenize(full_text, tokenizer))
    doc_matrix = build_csr_matrix(listOfMaps=[token_counts], token2IndexMap=token2indexMap)
    weighted = tfidf_model.transform(doc_matrix)
    reduced = lsi_model.transform(weighted)
    return gmm_model.predict(reduced)[0]
# Train/evaluate a linear SVM on an LSA-reduced tf-idf term-document matrix,
# then print counts of confusion-matrix cells (tp/fp/fn/tn) on the test split.
mat = load_csr_matrix("derived_data/tfidf_theorem_tdm_grouped_by_docs.npz")

# Slice out train/test rows and their matching document labels.
train_mat = mat[train_doc_ind, :]
train_labels = itemgetter(*train_doc_ind)(ordered_document_labels)
test_mat = mat[test_doc_ind, :]
test_labels = itemgetter(*test_doc_ind)(ordered_document_labels)

# Reduce dimensionality with truncated SVD (LSA); fit on training rows only
# so no information from the test split leaks into the projection.
svd = TruncatedSVD(n_components=1000)
svd.fit(train_mat)

clf = svm.LinearSVC()
clf.fit(svd.transform(train_mat), train_labels)

# eval results
predictions = clf.predict(svd.transform(test_mat)).tolist()

def confusion_category(predicted, actual):
    """Map a (predicted, actual) binary label pair to a confusion cell name."""
    if predicted == 1:
        return "tp" if actual == 1 else "fp"
    return "fn" if actual == 1 else "tn"

# List comprehension instead of map(lambda x: f(x), ...): drops the redundant
# lambda wrapper and yields a concrete list under both Python 2 and 3.
cats = [confusion_category(p, a) for p, a in zip(predictions, test_labels)]
# print(x) with a single argument behaves identically to the old
# Python-2-only "print x" statement, and also works under Python 3.
print(groupAndCount(cats))