if __name__ == "__main__":
    # Assign every document of the raw corpus to a GMM cluster (model trained
    # on an LSI-250 projection) and write "<doc_id>;<cluster>" lines to a
    # results file.
    # NOTE(review): this fragment relies on names (joblib, ArffJsonCorpus,
    # sparseData2Matrix, ...) imported elsewhere in the file — confirm they
    # are in scope before running.
    corpusFilepath = "/home/simon/Projekte/MIRS/testing_java_ml_libraries/raw_vector.json"

    # Retired experiment, kept for reference: TF-IDF reweighting of the
    # full-text TDM followed by k-means with 63 clusters.
    """TDM_full_text = load_csr_matrix("derived_data/zb_math_full_text_tdm.npz") tfidf_trans = TfidfTransformer() tfidf_trans.fit(TDM_full_text) joblib.dump(tfidf_trans, "models/tfidf_full_text_model") TDM_full_text_reweighted = tfidf_trans.transform(TDM_full_text) km = KMeans(n_clusters=63, init='k-means++', max_iter=100, n_init=10) km.fit(TDM_full_text_reweighted) joblib.dump(km, "models/km63-full_text_tfidf")"""

    # g = fitGmmModel(getTDM())
    # joblib.dump(g, gmmModelFile)

    clModel = joblib.load("models/gmm-sklean_lsi250")
    corpus = ArffJsonCorpus("raw_data/raw_vector.json")
    lsi_model = joblib.load("models/lsi250-model")

    # Fix: use a context manager so the results file is closed even if
    # prediction raises part-way through the corpus (the original leaked the
    # handle on error).
    with open("results/clusters-gmm-sklean_lsi250", "w") as log:
        for doc in corpus:
            # 54334 = fixed vocabulary size used throughout these scripts.
            sparseDoc = sparseData2Matrix(doc, 54334)
            arr = lsi_model.transform(sparseDoc)
            log.write(doc.id + ";" + str(clModel.predict(arr)[0]) + "\n")
            log.flush()  # flush per document so progress is visible on disk
from sklearn.decomposition import TruncatedSVD
from main.arffJson.ArffJsonCorpus import ArffJsonCorpus, ArffJsonDocument
import joblib
import numpy as np
from sklearn.cluster import KMeans

# Cluster the zbMath corpus with k-means in LSI space:
# sparse term-document matrix -> 250-dim LSI projection -> 63 clusters.
corpusFilepath = "/home/simon/Projekte/zbMathClustering/raw_vector.json"
corpus = ArffJsonCorpus(corpusFilepath)
TDM = corpus.toCsrMatrix(shapeCols=54334)

# One-off training of the LSI model (kept for reference):
"""svd = TruncatedSVD(n_components=250) svd.fit(TDM) joblib.dump(svd, "lsi250-model")"""

# Project the TDM with the previously trained LSI model.
svd2 = joblib.load("lsi250-model")
LSI_TDM = svd2.transform(TDM)

# Train and persist the k-means model on the projected data.
kmeans_params = dict(n_clusters=63, init='k-means++', max_iter=100, n_init=10)
km = KMeans(**kmeans_params)
km.fit(LSI_TDM)
joblib.dump(km, "km63-sklean_lsi250")

# Earlier (incomplete) prediction loop, kept for reference:
"""clModel = joblib.load("km63-sklean_lsi250") # log = open("clusters-km63-sklearn_lsi250", "w") log = open("foo", "w") count = 0 for arr in LSI_TDM: # npArray = sparseData2Matrix(doc.data, len(index2chiIndex), index2chiIndex) log.write(doc.id + ";" + str(clModel.predict(npArray)[0]) + "\n") count += 1 log.close()"""
from sklearn.decomposition import TruncatedSVD from main.arffJson.ArffJsonCorpus import ArffJsonCorpus, ArffJsonDocument import joblib import numpy as np from sklearn.cluster import AffinityPropagation, MeanShift from sklearn.mixture import GMM import random random.seed(0) corpusFilepath = "raw_data/raw_vector.json" corpus = ArffJsonCorpus(corpusFilepath) TDM = corpus.toCsrMatrix(shapeCols = 54334, selection = lambda doc: True if random.random() < 0.1 else False) print "TDM shape: " + str(TDM.shape) svd2 = joblib.load("models/lsi250-model") LSI_TDM = svd2.transform(TDM) #ap = AffinityPropagation( # damping=0.95, # max_iter=200, # convergence_iter=15, # copy=True, # preference=None, # affinity='euclidean', # verbose=False #) # ap.fit(LSI_TDM) """ms = MeanShift(
from main.arffJson.ArffJsonCorpus import ArffJsonCorpus, ArffJsonDocument from sklearn.feature_selection import chi2 import math from sklearn.cluster import KMeans import numpy as np import joblib import json from util import get_index_to_word_map, dumpChiScores, chiSetGeq, readChiFile corpusFilepath = "/home/simon/Projekte/MIRS/testing_java_ml_libraries/raw_vector.json" corpus = ArffJsonCorpus(corpusFilepath) """TDM = corpus.toCsrMatrix() labelMatrix, classLabel2Number, classNumber2Label = initializeLabelMatrix(corpus) dumpChiScores(TDM, labelMatrix, classNumber2Label)""" """chiSet = chiSetGeq(2000.0) TDM, index2chiIndex = ArffJsonCorpus(corpusFilepath).toCsrMatrix(chiSet) f = open("index2chiIndex.json", "w") print >> f, json.dumps(index2chiIndex) f.close() km = KMeans(n_clusters=63, init='k-means++', max_iter=100, n_init=10) km.fit(TDM) joblib.dump(km, "km63-allchi2geq2000")""" # index2chiIndex = dict(map(lambda x: (int(x[0]), x[1]), json.load(open("derived_data/index2chiIndex.json")).items())) index2Word = get_index_to_word_map("../testing_java_ml_libraries/dict") print "00 - History: " + repr( map(lambda x: index2Word[x[0]],
from sklearn.decomposition import TruncatedSVD
from main.arffJson.ArffJsonCorpus import ArffJsonCorpus, ArffJsonDocument
import joblib
import numpy as np
from sklearn.cluster import KMeans
from sklearn.mixture import GMM
from util import build_csr_matrix

# Pipeline script: raw corpus -> sparse TDM -> LSI-250 projection -> GMM
# cluster assignment, one "<doc_id>;<cluster>" line per document.
corpus = ArffJsonCorpus("raw_data/raw_vector.json")
TDM = corpus.toCsrMatrix(shapeCols=54334)

# build lsi model
"""lsi_model = TruncatedSVD(n_components=250) lsi_model.fit(TDM) joblib.dump(lsi_model, "models/raw_vector-lsi250_model")"""

# build gmm model
"""lsi_model = joblib.load("models/raw_vector-lsi250_model") gmm_model = GMM(n_components=64) gmm_model.fit(lsi_model.transform(TDM)) joblib.dump(gmm_model, "models/gmm-raw_vector-lsi250")"""

#cluster documents
f = open("results/clusters-gmm-raw_vector-lsi250", "w")
lsi_model = joblib.load("models/raw_vector-lsi250_model")
cl_model = joblib.load("models/gmm-raw_vector-lsi250")
count = 0
# NOTE(review): the loop body appears truncated in this view — `f` is opened
# but never written to or closed here, and `count` is never used; the full
# script presumably writes predictions to `f` inside this loop. Confirm
# against the complete file before editing.
for doc in corpus:
    # 54334 must match the shapeCols used when the LSI model was trained.
    doc_vector = lsi_model.transform(
        build_csr_matrix(dict(doc.data), numAttributes=54334))
from sklearn.decomposition import TruncatedSVD
from main.arffJson.ArffJsonCorpus import ArffJsonCorpus, ArffJsonDocument
import joblib
import numpy as np
from sklearn.cluster import AffinityPropagation, MeanShift
from sklearn.mixture import GMM
import random

# Build a deterministic ~10% subsample of the corpus, project it with the
# pre-trained LSI-250 model, for clustering experiments (AffinityPropagation
# attempt kept commented out below).
random.seed(0)  # fixed seed so the subsample is reproducible across runs

corpusFilepath = "raw_data/raw_vector.json"
corpus = ArffJsonCorpus(corpusFilepath)
# Fix: `True if ... else False` was redundant — the comparison already
# yields a bool.
TDM = corpus.toCsrMatrix(shapeCols=54334,
                         selection=lambda doc: random.random() < 0.1)

# Fix: parenthesized print behaves identically under Python 2 for a single
# string argument, and also works under Python 3.
print("TDM shape: " + str(TDM.shape))

svd2 = joblib.load("models/lsi250-model")
LSI_TDM = svd2.transform(TDM)

#ap = AffinityPropagation(
#    damping=0.95,
#    max_iter=200,
#    convergence_iter=15,
#    copy=True,
#    preference=None,
#    affinity='euclidean',
#    verbose=False
#)
# ap.fit(LSI_TDM)
from sklearn.decomposition import TruncatedSVD
from main.arffJson.ArffJsonCorpus import ArffJsonCorpus, ArffJsonDocument
import joblib
import numpy as np
from sklearn.cluster import KMeans

# k-means (63 clusters) over the corpus after projecting the sparse
# term-document matrix into the 250-dimensional LSI space.
corpusFilepath = "/home/simon/Projekte/zbMathClustering/raw_vector.json"

# Load the pre-trained LSI model up front; it is independent of the corpus
# conversion below.
svd2 = joblib.load("lsi250-model")

corpus = ArffJsonCorpus(corpusFilepath)
TDM = corpus.toCsrMatrix(shapeCols=54334)

# Retired LSI training step, kept for reference:
"""svd = TruncatedSVD(n_components=250) svd.fit(TDM) joblib.dump(svd, "lsi250-model")"""

LSI_TDM = svd2.transform(TDM)

km = KMeans(
    n_clusters=63,
    init="k-means++",
    max_iter=100,
    n_init=10,
)
km.fit(LSI_TDM)
joblib.dump(km, "km63-sklean_lsi250")

# Earlier (incomplete) prediction loop, kept for reference:
"""clModel = joblib.load("km63-sklean_lsi250") # log = open("clusters-km63-sklearn_lsi250", "w") log = open("foo", "w") count = 0 for arr in LSI_TDM: # npArray = sparseData2Matrix(doc.data, len(index2chiIndex), index2chiIndex) log.write(doc.id + ";" + str(clModel.predict(npArray)[0]) + "\n") count += 1 log.close()"""