def get_normed_author_theorem_matrix(setting):
    at_path = "derived_data/" + setting_string(**setting) + "__normed_author_theorem_matrix"
    ta_path = "derived_data/" + setting_string(**setting) + "__normed_theorem_author_matrix"

    if (not force_gen
            and os.path.isfile(at_path + ".npz")
            and os.path.isfile(ta_path + ".npz")):
        normed_author_theorem_mat = load_csr_matrix(at_path + ".npz")
        normed_theorem_author_mat = load_csr_matrix(ta_path + ".npz")
        return normed_author_theorem_mat, normed_theorem_author_mat
    else:
        mat, r, c = get_author_theorem_matrix(setting)
        normed_author_theorem_mat = normalize(mat)
        normed_theorem_author_mat = normalize(mat.transpose())

        save_csr_matrix(normed_author_theorem_mat, at_path)
        save_csr_matrix(normed_theorem_author_mat, ta_path)
        return normed_author_theorem_mat, normed_theorem_author_mat
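# Hedged usage sketch (an illustration, not part of the original pipeline): because
# get_normed_author_theorem_matrix returns L2-normalized rows, an author-by-author
# cosine-similarity matrix is a single sparse product. The setting dict is whatever
# setting_string() expects elsewhere in this module.
def example_author_similarity(setting):
    normed_author_theorem_mat, _ = get_normed_author_theorem_matrix(setting)
    # entry (i, j) is the cosine similarity between the theorem profiles of authors i and j
    return normed_author_theorem_mat.dot(normed_author_theorem_mat.transpose())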
def get_raw_tdm(setting):
    if not force_gen and all(os.path.isfile(filename) for filename in [
            "derived_data/" + setting_string(**setting) + "__raw_tdm.npz",
            "derived_data/" + setting_string(**setting) + "__ids"]):
        mat = load_csr_matrix("derived_data/" + setting_string(**setting) + "__raw_tdm.npz")

        ids = []
        with open("derived_data/" + setting_string(**setting) + "__ids") as f:
            count = 0
            for line in f:
                x = line.split(";")
                if setting['granularity'] == 'paragraphs':
                    ids.append((count, (x[0], x[1].strip())))
                elif setting['granularity'] == 'documents':
                    ids.append((count, x[0].strip()))
                else:
                    raise ValueError("granularity must be either 'documents' or 'paragraphs'")
                count += 1
        row2id_map = dict(ids)

        token2index_map = get_token2index_map(setting)
        column2token_map = {index: token for token, index in token2index_map.items()}
        return mat, row2id_map, column2token_map
    else:
        token2index_map = get_token2index_map(setting)
        column2token_map = {index: token for token, index in token2index_map.items()}

        if setting['granularity'] == "paragraphs":
            paragraph_generator = get_all_docs_paragrahps_as_token_list(setting['token_method'], setting['data_basis'])
            mat, id_log = build_raw_csr_matrix(paragraph_generator, token2index_map)
        elif setting['granularity'] == "documents":
            document_generator = get_all_documents_as_feature_map(setting['token_method'], setting['data_basis'])
            mat, id_log = build_raw_csr_matrix(document_generator, token2index_map)
        else:
            raise ValueError("granularity must be either paragraphs or documents")

        save_csr_matrix(mat, "derived_data/" + setting_string(**setting) + "__raw_tdm")

        with open("derived_data/" + setting_string(**setting) + "__ids", "w") as f:
            if setting['granularity'] == "paragraphs":
                for id in id_log:
                    f.write(id[0] + ";" + id[1] + "\n")
            elif setting['granularity'] == "documents":
                for id in id_log:
                    f.write(id + "\n")

        row2id_map = dict(zip(range(len(id_log)), id_log))
        return mat, row2id_map, column2token_map
def get_author_msc_matrix():
    if not force_gen and all(os.path.isfile(filename) for filename in [
            "derived_data/author_msc_map.npz",
            "derived_data/author_msc_map__row2author_name.json",
            "derived_data/author_msc_map__col2msc_code.json"]):
        mat = load_csr_matrix("derived_data/author_msc_map.npz")
        with open("derived_data/author_msc_map__row2author_name.json") as f:
            row2author_map = json.load(f)
        with open("derived_data/author_msc_map__col2msc_code.json") as f:
            col2msc_map = json.load(f)
        return mat, row2author_map, col2msc_map
    else:
        author2msc_map = defaultdict(lambda: defaultdict(int))
        cursor().execute("""SELECT display_name, msc, COUNT(*)
                            FROM authorship
                            JOIN msc_assignment ON authorship.document = msc_assignment.document
                            WHERE authorship.rank <= 2 AND msc_assignment.pos <= 3
                            GROUP BY display_name, msc
                            ORDER BY display_name""")
        for row in cursor():
            author2msc_map[row[0]][row[1][:2]] += row[2]

        author_names, msc_counts = zip(*author2msc_map.items())
        msc_code2index_map = dict(zip(msc_classes, range(len(msc_classes))))
        col2msc_map = {index: msc for msc, index in msc_code2index_map.items()}

        mat = build_csr_matrix(msc_counts, token2index_map=msc_code2index_map)
        save_csr_matrix(mat, "derived_data/author_msc_map")

        row2author_map = dict(zip(range(len(author_names)), author_names))
        with open("derived_data/author_msc_map__row2author_name.json", "w") as f:
            json.dump(row2author_map, f)
        with open("derived_data/author_msc_map__col2msc_code.json", "w") as f:
            json.dump(col2msc_map, f)

        return mat, row2author_map, col2msc_map
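# build_csr_matrix is imported from a utility module and not defined here. A minimal
# sketch of the call variant used above (a sequence of {token: count} dicts plus a
# token2index_map), assuming plain scipy; the real helper may differ in detail.
def build_csr_matrix_sketch(counts_per_row, token2index_map):
    from scipy.sparse import csr_matrix
    data, row_indexes, col_indexes = [], [], []
    for row, counts in enumerate(counts_per_row):
        for token, count in counts.items():
            data.append(float(count))
            row_indexes.append(row)
            col_indexes.append(token2index_map[token])
    shape = (len(counts_per_row), len(token2index_map))
    return csr_matrix((data, (row_indexes, col_indexes)), shape=shape)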
def get_author_theorem_matrix(setting):
    if not force_gen and all(os.path.isfile(filename) for filename in [
            "derived_data/" + setting_string(**setting) + "__raw_author_matrix.npz",
            "derived_data/" + setting_string(**setting) + "__raw_author_matrix_row2author.json",
            "derived_data/" + setting_string(**setting) + "__raw_author_matrix_col2item.json"]):
        mat = load_csr_matrix("derived_data/" + setting_string(**setting) + "__raw_author_matrix.npz")
        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_row2author.json") as f:
            row2author_map = json.load(f)
        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_col2item.json") as f:
            col2item_map = json.load(f)
        return mat, row2author_map, col2item_map
    else:
        # collect the set of all authors and item ids
        author_set = set()
        item_id_set = set()
        with open("derived_data/" + setting_string(**setting) + "__ids_with_authors") as f:
            for line in f:
                x = line.split(";")
                if setting['granularity'] == 'documents':
                    item_id_set.add(x[0])
                    offset = 1
                elif setting['granularity'] == 'paragraphs':
                    item_id_set.add((x[0], x[1]))
                    offset = 2
                else:
                    raise ValueError("granularity must be either 'documents' or 'paragraphs'")
                for i in range(offset, len(x)):
                    author_set.add(x[i].strip())

        # map each item to its row index in the processed tdm
        count = 0
        item2index_map = {}
        with open("derived_data/" + setting_string(**setting) + "__processed_ids") as f:
            for line in f:
                item2index_map[line.strip()] = count
                count += 1
        # item2index_map = dict(zip(sorted(list(item_id_set)), range(len(item_id_set))))
        author2index_map = dict(zip(sorted(list(author_set)), range(len(author_set))))

        # one dict per author, mapping item index -> 1.0
        author_item_indexes = [{} for _ in range(len(author2index_map))]
        with open("derived_data/" + setting_string(**setting) + "__ids_with_authors") as f:
            for line in f:
                x = line.split(";")
                if setting['granularity'] == 'documents':
                    item_index = item2index_map.get(x[0])
                    offset = 1
                elif setting['granularity'] == 'paragraphs':
                    item_index = item2index_map.get(x[0] + ";" + x[1])
                    offset = 2
                else:
                    raise ValueError("granularity must be either 'documents' or 'paragraphs'")
                if item_index is not None:
                    for i in range(offset, len(x)):
                        author_index = author2index_map[x[i].strip()]
                        author_item_indexes[author_index][item_index] = 1.0

        mat = build_csr_matrix(list_of_dicts=author_item_indexes, num_attributes=len(item2index_map))
        save_csr_matrix(mat, "derived_data/" + setting_string(**setting) + "__raw_author_matrix")

        row2author_map = {index: author for author, index in author2index_map.items()}
        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_row2author.json", "w") as f:
            json.dump(row2author_map, f)

        col2item_map = {index: item for item, index in item2index_map.items()}
        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_col2item.json", "w") as f:
            json.dump(col2item_map, f)

        return mat, row2author_map, col2item_map
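# save_csr_matrix / load_csr_matrix also come from the utility module. A plausible
# sketch of their behaviour (an assumption): the .npz file stores the three CSR
# component arrays plus the shape; the actual helpers may use a different layout.
def save_csr_matrix_sketch(mat, path):
    import numpy as np
    np.savez(path, data=mat.data, indices=mat.indices, indptr=mat.indptr, shape=mat.shape)

def load_csr_matrix_sketch(path):
    import numpy as np
    from scipy.sparse import csr_matrix
    loader = np.load(path)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape=tuple(loader['shape']))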
def get_processed_tdm(setting, intended_amount_of_text_tokens=None, intended_amount_of_formula_tokens=None):
    if not force_gen and all(os.path.isfile(filename) for filename in [
            "derived_data/" + setting_string(**setting) + "__processed_tdm.npz",
            "derived_data/" + setting_string(**setting) + "__processed_ids",
            "derived_data/" + setting_string(**setting) + "__processed_token2index_map.json"]):
        mat = load_csr_matrix("derived_data/" + setting_string(**setting) + "__processed_tdm.npz")

        ids = []
        with open("derived_data/" + setting_string(**setting) + "__processed_ids") as f:
            count = 0
            for line in f:
                x = line.split(";")
                if setting['granularity'] == 'paragraphs':
                    ids.append((count, (x[0], x[1].strip())))
                elif setting['granularity'] == 'documents':
                    ids.append((count, x[0].strip()))
                else:
                    raise ValueError("granularity must be either 'documents' or 'paragraphs'")
                count += 1
        row2id_map = dict(ids)

        with open("derived_data/" + setting_string(**setting) + "__processed_token2index_map.json") as f:
            token2index_map = json.load(f)
        column2token_map = {index: token for token, index in token2index_map.items()}

        return mat, row2id_map, column2token_map
    else:
        # retrieve best tf-idf terms
        raw_tdm, row2id_map, column2token_map = get_raw_tdm(setting)
        nz_row_indexes = non_zero_row_indexes(raw_tdm)
        raw_tdm = raw_tdm[nz_row_indexes, :]

        token2index_map = get_token2index_map(setting)
        text_token_scores, formula_token_scores = tf_idf_scores(raw_tdm, token2index_map)
        best_text_token_indexes, best_formula_token_indexes = select_best_tokens(
            text_token_scores, formula_token_scores,
            intended_amount_of_text_tokens, intended_amount_of_formula_tokens)

        text_tdm = raw_tdm[:, best_text_token_indexes]
        formula_tdm = raw_tdm[:, best_formula_token_indexes]

        if text_tdm.shape[1] == 0:
            processed_tdm = formula_tdm
        elif formula_tdm.shape[1] == 0:
            processed_tdm = text_tdm
        else:
            # rescale the formula block so its average row norm matches the text block
            float_text_tdm = element_wise_multiply(text_tdm, 1.0)
            pruned_formula_tdm = element_wise_multiply(formula_tdm, avg_row_norm(text_tdm) / avg_row_norm(formula_tdm))
            processed_tdm = vertically_append_matrix(float_text_tdm, pruned_formula_tdm)

        new_index2old_index_map = {new_index: old_index for new_index, old_index in enumerate(best_text_token_indexes)}
        new_index2old_index_map.update({new_index + len(best_text_token_indexes): old_index
                                        for new_index, old_index in enumerate(best_formula_token_indexes)})

        new_token2index_map = {}
        for new_index, old_index in new_index2old_index_map.items():
            new_token2index_map[column2token_map[old_index]] = new_index
        new_column2token_map = {index: token for token, index in new_token2index_map.items()}

        new_row2id_map = {}
        count = 0
        for index, id in row2id_map.items():
            if index in nz_row_indexes:
                new_row2id_map[count] = id
                count += 1

        # save processed tdm
        save_csr_matrix(processed_tdm, "derived_data/" + setting_string(**setting) + "__processed_tdm")

        # save respective ids
        with open("derived_data/" + setting_string(**setting) + "__processed_ids", "w") as f:
            for index, id in sorted(new_row2id_map.items(), key=lambda x: x[0]):
                if setting['granularity'] == 'paragraphs':
                    f.write(id[0] + ";" + id[1] + "\n")
                else:
                    f.write(id + "\n")

        # save token2index map
        with open("derived_data/" + setting_string(**setting) + "__processed_token2index_map.json", "w") as outfile:
            json.dump(new_token2index_map, outfile)

        return processed_tdm, new_row2id_map, new_column2token_map
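# avg_row_norm and element_wise_multiply are imported helpers that are not shown in
# this file. A sketch of the behaviour get_processed_tdm relies on (an assumption):
# the formula block is rescaled so that its average row norm matches the text block's.
def avg_row_norm_sketch(mat):
    import numpy as np
    from scipy.sparse.linalg import norm as sparse_norm
    return float(np.mean(sparse_norm(mat, axis=1)))  # mean L2 norm over all rows

def element_wise_multiply_sketch(mat, scalar):
    return mat.multiply(scalar).tocsr()  # scale every entry, keep CSR format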
import json
from scipy.sparse import csr_matrix
from util import save_csr_matrix, load_csr_matrix, get_dirpath, get_filenames_and_filepaths, DocumentParser, filesInDict
from util import connectToDb, bin2NumpyArr
import numpy as np
from time import time
from main.arffJson.ArffJsonCorpus import ArffJsonCorpus, ArffJsonDocument
from string import digits, ascii_letters
from os.path import isfile, join

dirpath = get_dirpath()
filenames, filepaths = get_filenames_and_filepaths("raw_data/ntcir_filenames")

tdm = load_csr_matrix("derived_data/zb_math_full_text_tdm2.npz")
translateMap = json.load(open("derived_data/zb_math_full_texts_tokens2IndexMap"))
row_number2fulltext_id_map = json.load(open("derived_data/row_number2fulltext_id_map.json"))

phrase = "theorem"
tokenizer = DocumentParser.TextTokenizer()
tokens = tokenizer.tokenize(phrase)
tokenIds = [translateMap[token] for token in tokens]

candidateIds = []
index = 0

# restrict the tdm to the columns of the phrase tokens; a row is a candidate
# when all of these columns are non-zero, i.e. the row contains every token
m = tdm[:, tokenIds]
candidateInd = []
currInd = 0
for i in range(len(m.indptr) - 1):
    diff = m.indptr[i + 1] - m.indptr[i]
    if diff == len(tokenIds):
        candidateInd.append(i)
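# Hedged continuation sketch: candidateInd now holds the rows in which every token of
# the phrase occurs at least once. JSON object keys are strings, so the row numbers
# are stringified before looking up the corresponding full-text ids.
for row_index in candidateInd:
    candidateIds.append(row_number2fulltext_id_map[str(row_index)])
print(len(candidateIds), "full texts contain all tokens of the phrase:", phrase)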
target_class = "81" ordered_document_assignments = map( lambda doc_id: doc2msc[str(doc_id)] if doc_id in doc2msc else None, document_ids) ordered_document_labels = map( lambda lab: None if lab is None else (1 if lab[:len(target_class)] == target_class else 0), ordered_document_assignments) test_doc_ind = indexes_in_list(document_ids, readFileLinewise2Array("raw_data/test_doc_ids")) train_doc_ind = indexes_in_list( document_ids, readFileLinewise2Array("raw_data/train_doc_ids")) mat = load_csr_matrix("derived_data/tfidf_theorem_tdm_grouped_by_docs.npz") train_mat = mat[train_doc_ind, :] train_labels = itemgetter(*train_doc_ind)(ordered_document_labels) svd = TruncatedSVD(n_components=1000) svd.fit(train_mat) test_mat = mat[test_doc_ind, :] test_labels = itemgetter(*test_doc_ind)(ordered_document_labels) clf = svm.LinearSVC() clf.fit(svd.transform(train_mat), train_labels) # eval results predictions = clf.predict(svd.transform(test_mat)).tolist()
# Save CSR-Matrix
"""c = corpus.toCsrMatrix()
save_csr_matrix(c, "abschlussbericht-csr")"""

# Save Labels
"""corpus = ArffJsonCorpus("../zb_math_cluster_experiments/raw_data/abschlussbericht-corpus.json")
labels_list = (doc.classes for doc in corpus)

with open("abschlussbericht-labels", "w") as f:
    for labels in labels_list:
        top_class_labels = set(map(lambda x: x[:2], labels))
        f.write(",".join(top_class_labels) + "\n")"""

tdm = load_csr_matrix("corpus.npz")
labels = read_labels("abschlussbericht-labels")

mats = {}


def get_transformed_mat(mat, transform_id, transformer_list, test_train):
    global mats

    if mats.get((transform_id, test_train)) is None:
        if transformer_list is not None and len(transformer_list) != 0:
            mat_copy = mat
            for transformer in transformer_list:
                mat_copy = transformer.transform(mat_copy)
            mats[(transform_id, test_train)] = mat_copy
        else:
            mats[(transform_id, test_train)] = mat

    return mats[(transform_id, test_train)]
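# Hedged usage sketch (an assumption, kept disabled like the blocks above): how the
# cache could be exercised with a fitted TruncatedSVD as the only transformer; the
# ids "lsi" and "train" are made up for illustration.
"""svd_example = TruncatedSVD(n_components=100)
svd_example.fit(tdm)
train_lsi = get_transformed_mat(tdm, "lsi", [svd_example], "train")"""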
document_ids = json.load(open("derived_data/theorem_tdm_grouped_by_docs_doc_ids"))

doc2msc = {}
with open("raw_data/doc2msc") as f:
    for line in f:
        x = line.split(";")
        doc2msc[str(x[0])] = x[1].strip()

target_class = "81"
# binary labels: 1 if the document's MSC assignment matches the target class,
# 0 otherwise, None if no MSC assignment is known
ordered_document_assignments = [doc2msc[str(doc_id)] if str(doc_id) in doc2msc else None
                                for doc_id in document_ids]
ordered_document_labels = [None if lab is None else (1 if lab[:len(target_class)] == target_class else 0)
                           for lab in ordered_document_assignments]

test_doc_ind = indexes_in_list(document_ids, readFileLinewise2Array("raw_data/test_doc_ids"))
train_doc_ind = indexes_in_list(document_ids, readFileLinewise2Array("raw_data/train_doc_ids"))

mat = load_csr_matrix("derived_data/tfidf_theorem_tdm_grouped_by_docs.npz")

train_mat = mat[train_doc_ind, :]
train_labels = itemgetter(*train_doc_ind)(ordered_document_labels)

svd = TruncatedSVD(n_components=1000)
svd.fit(train_mat)

test_mat = mat[test_doc_ind, :]
test_labels = itemgetter(*test_doc_ind)(ordered_document_labels)

clf = svm.LinearSVC()
clf.fit(svd.transform(train_mat), train_labels)

# eval results
predictions = clf.predict(svd.transform(test_mat)).tolist()
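# Hedged evaluation sketch (an assumption): the script ends right after computing the
# predictions, so a minimal scoring step is sketched here. Documents without a known
# MSC label (None) are dropped before scoring; sklearn.metrics is assumed available.
from sklearn.metrics import precision_recall_fscore_support

scored_pairs = [(truth, pred) for truth, pred in zip(test_labels, predictions) if truth is not None]
true_labels = [truth for truth, _ in scored_pairs]
pred_labels = [pred for _, pred in scored_pairs]
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average='binary')
print("precision=%.3f recall=%.3f f1=%.3f" % (precision, recall, f1))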
# build combined theorem tdm (disabled)
"""matrixList.append(combinedFeatures)
count += 1

theoremTDM = horizontally_combine_matrixes(matrixList)
save_csr_matrix(theoremTDM, "derived_data/combined_theorem_text_formula_tdm")"""

# train lsa
"""theoremTDM = load_csr_matrix("derived_data/combined_theorem_text_formula_tdm.npz")

svd = TruncatedSVD(n_components=250)
svd.fit(theoremTDM)
joblib.dump(svd, "models/combined_theorem_text_formula_lsi250_model")
"""

# perform clustering
theoremTDM = load_csr_matrix("derived_data/combined_theorem_text_formula_tdm.npz")
svd = joblib.load("models/combined_theorem_text_formula_lsi250_model")
LSI_TDM = svd.transform(theoremTDM)

ap = AffinityPropagation(
    damping=0.5,
    max_iter=200,
    convergence_iter=15,
    copy=True,
    preference=None,
    affinity='euclidean',
    verbose=False
)
ap.fit(LSI_TDM)
joblib.dump(ap, "models/combined_theorem_text_formula_ap_model")
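# Hedged follow-up sketch (an assumption): how the fitted AffinityPropagation model can
# be inspected; labels_ and cluster_centers_indices_ are standard sklearn attributes
# after fit.
import numpy as np

n_clusters = len(ap.cluster_centers_indices_)
print("found %d clusters over %d theorems" % (n_clusters, LSI_TDM.shape[0]))
cluster_ids, cluster_sizes = np.unique(ap.labels_, return_counts=True)
for cluster_id, size in zip(cluster_ids, cluster_sizes):
    print("cluster %d: %d members" % (cluster_id, size))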