def clusterDocument(title, abstract, tokenizer, token2indexMap, tfidf_model, lsi_model, gmm_model):
    tokens = tokenize(title + " " + abstract, tokenizer)
    tokenCounts = groupAndCount(tokens)
    matrix = build_csr_matrix(listOfMaps=[tokenCounts], token2IndexMap=token2indexMap)
    transformedMatrix = lsi_model.transform(tfidf_model.transform(matrix))
    prediction = gmm_model.predict(transformedMatrix)[0]
    return prediction
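# All of these snippets rely on a build_csr_matrix helper that is not included
# here (and whose keyword spelling varies between call sites, e.g. listOfMaps
# vs. list_of_maps). A minimal sketch of such a helper, assuming it turns a
# list of {token: count} dicts plus a token-to-column map into one CSR row per
# dict; this is an illustration, not the original implementation.
import numpy as np
from scipy.sparse import csr_matrix

def build_csr_matrix_sketch(list_of_maps, token2index_map):
    data, indices, indptr = [], [], [0]
    for token_counts in list_of_maps:
        for token, count in token_counts.items():
            col = token2index_map.get(token)
            if col is not None:  # tokens outside the vocabulary are dropped
                data.append(count)
                indices.append(col)
        indptr.append(len(data))  # one row boundary per input dict
    return csr_matrix((np.array(data), np.array(indices), np.array(indptr)),
                      shape=(len(list_of_maps), len(token2index_map)))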
def horizontally_combine_matrixes(matrixList):
    # Stacks the given CSR matrices row-wise; all inputs must have the same
    # number of columns (attributes).
    if len(matrixList) == 0:
        return build_csr_matrix(listOfMaps=[], numAttributes=0)

    newData = []
    newIndices = []
    newIndptr = [0]
    numNonZerosInMat = [len(matrix.data) for matrix in matrixList]
    numCols = matrixList[0].shape[1]
    numRows = 0
    count = 0
    offset = 0
    for matrix in matrixList:
        if matrix.shape[1] != numCols:
            raise ValueError("Num attributes of matrixes for horizontal combination must be identical")

        numRows += matrix.shape[0]
        newData.extend(matrix.data)
        newIndices.extend(matrix.indices)
        newIndptr.extend(map(lambda x: x + offset, matrix.indptr[1:]))
        offset += numNonZerosInMat[count]
        count += 1

    return csr_matrix((np.array(newData), np.array(newIndices), np.array(newIndptr)), shape=(numRows, numCols))
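# For reference, scipy ships a row-wise stacking helper that should yield the
# same result for CSR inputs as the function above; an illustrative check
# (m1 and m2 are placeholder CSR matrices with equal column counts):
# from scipy.sparse import vstack
# combined = horizontally_combine_matrixes([m1, m2])
# assert (combined != vstack([m1, m2], format="csr")).nnz == 0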
def doc2mat(cls, doc, tokenizer, token2index_map):
    tokens = zbMathTokenizer.doc2tokens(doc, tokenizer)
    feature_vector = group_and_count(tokens)
    mat = build_csr_matrix([feature_vector], token2index_map=token2index_map)
    return mat
def build_raw_csr_matrix(items, token2index_map):
    item_maps = []
    item_id_log = []
    item_count = 1
    for item_id, tokens in items:
        print str(item_id) + " (" + str(item_count) + ")"
        item_maps.append(group_and_count(tokens))
        item_id_log.append(item_id)
        item_count += 1

    m = build_csr_matrix(list_of_maps=item_maps, token_2_index_map=token2index_map)
    return m, item_id_log
def get_author_msc_matrix():
    if not force_gen and all(os.path.isfile(filename) for filename in [
            "derived_data/author_msc_map.npz",
            "derived_data/author_msc_map__row2author_name.json",
            "derived_data/author_msc_map__col2msc_code.json"]):
        mat = load_csr_matrix("derived_data/author_msc_map.npz")
        with open("derived_data/author_msc_map__row2author_name.json") as f:
            row2author_map = json.load(f)
        with open("derived_data/author_msc_map__col2msc_code.json") as f:
            col2msc_map = json.load(f)

        return mat, row2author_map, col2msc_map
    else:
        author2msc_map = defaultdict(lambda: defaultdict(int))

        cursor().execute("""SELECT display_name, msc, COUNT(*)
                            FROM authorship JOIN msc_assignment
                                ON authorship.document = msc_assignment.document
                            WHERE authorship.rank <= 2 AND msc_assignment.pos <= 3
                            GROUP BY display_name, msc
                            ORDER BY display_name""")
        for row in cursor():
            author2msc_map[row[0]][row[1][:2]] += row[2]

        author_names, msc_counts = zip(*author2msc_map.items())

        msc_code2index_map = dict(zip(msc_classes, range(len(msc_classes))))
        col2msc_map = {index: msc for msc, index in msc_code2index_map.items()}
        mat = build_csr_matrix(msc_counts, token2index_map=msc_code2index_map)
        save_csr_matrix(mat, "derived_data/author_msc_map")

        row2author_map = dict(zip(range(len(author_names)), author_names))
        with open("derived_data/author_msc_map__row2author_name.json", "w") as f:
            json.dump(row2author_map, f)
        with open("derived_data/author_msc_map__col2msc_code.json", "w") as f:
            json.dump(col2msc_map, f)

        return mat, row2author_map, col2msc_map
def get_author_theorem_matrix(setting):
    if not force_gen and all(os.path.isfile(filename) for filename in [
            "derived_data/" + setting_string(**setting) + "__raw_author_matrix.npz",
            "derived_data/" + setting_string(**setting) + "__raw_author_matrix_row2author.json",
            "derived_data/" + setting_string(**setting) + "__raw_author_matrix_col2item.json"]):
        mat = load_csr_matrix("derived_data/" + setting_string(**setting) + "__raw_author_matrix.npz")
        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_row2author.json") as f:
            row2author_map = json.load(f)
        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_col2item.json") as f:
            col2item_map = json.load(f)

        return mat, row2author_map, col2item_map
    else:
        author_set = set()
        item_id_set = set()
        with open("derived_data/" + setting_string(**setting) + "__ids_with_authors") as f:
            for line in f:
                x = line.split(";")

                if setting['granularity'] == 'documents':
                    item_id_set.add(x[0])
                    offset = 1
                elif setting['granularity'] == 'paragraphs':
                    item_id_set.add((x[0], x[1]))
                    offset = 2
                else:
                    raise ValueError("unknown granularity: " + str(setting['granularity']))

                for i in range(offset, len(x)):
                    author_set.add(x[i].strip())

        count = 0
        item2index_map = {}
        with open("derived_data/" + setting_string(**setting) + "__processed_ids") as f:
            for line in f:
                item2index_map[line.strip()] = count
                count += 1
        # item2index_map = dict(zip(sorted(list(item_id_set)), range(len(item_id_set))))
        author2index_map = dict(zip(sorted(list(author_set)), range(len(author_set))))

        author_item_indexes = [{} for _ in range(len(author2index_map))]
        with open("derived_data/" + setting_string(**setting) + "__ids_with_authors") as f:
            for line in f:
                x = line.split(";")

                if setting['granularity'] == 'documents':
                    item_index = item2index_map.get(x[0])
                    offset = 1
                elif setting['granularity'] == 'paragraphs':
                    item_index = item2index_map.get((x[0], x[1]))
                    offset = 2
                else:
                    raise ValueError("unknown granularity: " + str(setting['granularity']))

                if item_index is not None:
                    for i in range(offset, len(x)):
                        author_index = author2index_map[x[i].strip()]
                        author_item_indexes[author_index][item_index] = 1.0

        mat = build_csr_matrix(list_of_dicts=author_item_indexes, num_attributes=len(item2index_map))
        save_csr_matrix(mat, "derived_data/" + setting_string(**setting) + "__raw_author_matrix")

        row2author_map = {index: author for author, index in author2index_map.items()}
        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_row2author.json", "w") as f:
            json.dump(row2author_map, f)

        col2item_map = {index: item for item, index in item2index_map.items()}
        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_col2item.json", "w") as f:
            json.dump(col2item_map, f)

        return mat, row2author_map, col2item_map
def processFeatureCounts(featureCounts, token2Id, tfidfModel):
    m = build_csr_matrix(listOfMaps=[featureCounts], token2IndexMap=token2Id)
    return Normalizer().transform(tfidfModel.transform(m))
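# Hypothetical usage of processFeatureCounts: a single bag-of-words dict is
# turned into a 1 x vocabulary CSR row, tf-idf weighted, and L2-normalised
# (sklearn's Normalizer defaults to the l2 norm and is stateless, so no fit is
# needed). token2Id and tfidfModel are assumed to come from a training phase.
# row = processFeatureCounts({"group": 2, "algebra": 1}, token2Id, tfidfModel)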
import joblib
import numpy as np
from sklearn.cluster import KMeans
from sklearn.mixture import GMM

from util import build_csr_matrix

corpus = ArffJsonCorpus("raw_data/raw_vector.json")
TDM = corpus.toCsrMatrix(shapeCols=54334)

# build lsi model
"""lsi_model = TruncatedSVD(n_components=250)
lsi_model.fit(TDM)
joblib.dump(lsi_model, "models/raw_vector-lsi250_model")"""

# build gmm model
"""lsi_model = joblib.load("models/raw_vector-lsi250_model")
gmm_model = GMM(n_components=64)
gmm_model.fit(lsi_model.transform(TDM))
joblib.dump(gmm_model, "models/gmm-raw_vector-lsi250")"""

# cluster documents
f = open("results/clusters-gmm-raw_vector-lsi250", "w")

lsi_model = joblib.load("models/raw_vector-lsi250_model")
cl_model = joblib.load("models/gmm-raw_vector-lsi250")

count = 0
for doc in corpus:
    doc_vector = lsi_model.transform(build_csr_matrix(dict(doc.data), numAttributes=54334))
    f.write(doc.id + ";" + str(cl_model.predict(doc_vector)[0]) + "\n")
    count += 1

f.close()
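# Note: sklearn.mixture.GMM only exists in older scikit-learn releases; in
# current versions the equivalent estimator is GaussianMixture. A rough sketch
# of the same model-building step on a newer stack (same n_components as above):
# from sklearn.mixture import GaussianMixture
# gmm_model = GaussianMixture(n_components=64)
# gmm_model.fit(lsi_model.transform(TDM))
# joblib.dump(gmm_model, "models/gmm-raw_vector-lsi250")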