# Esempio n. 1 (Example no. 1)
# 0
if __name__ == "__main__":
    # NOTE(review): unused here — the corpus below is opened from a different,
    # hard-coded relative path; kept for reference only.
    corpusFilepath = "/home/simon/Projekte/MIRS/testing_java_ml_libraries/raw_vector.json"

    # --- One-off model-training steps, already run; kept for reference ---
    # TDM_full_text = load_csr_matrix("derived_data/zb_math_full_text_tdm.npz")
    # tfidf_trans = TfidfTransformer()
    # tfidf_trans.fit(TDM_full_text)
    # joblib.dump(tfidf_trans, "models/tfidf_full_text_model")
    # TDM_full_text_reweighted = tfidf_trans.transform(TDM_full_text)
    # km = KMeans(n_clusters=63, init='k-means++', max_iter=100, n_init=10)
    # km.fit(TDM_full_text_reweighted)
    # joblib.dump(km, "models/km63-full_text_tfidf")

    # g = fitGmmModel(getTDM())
    # joblib.dump(g, gmmModelFile)

    # Assign every corpus document to a cluster of the pre-trained GMM
    # (fitted on the 250-dim LSI projection) and log "<doc id>;<cluster>".
    clModel = joblib.load("models/gmm-sklean_lsi250")  # "sklean" typo is part of the on-disk name
    corpus = ArffJsonCorpus("raw_data/raw_vector.json")
    lsi_model = joblib.load("models/lsi250-model")

    # "with" guarantees the log file is closed even if transform/predict raises
    # (the original leaked the handle on any exception).
    with open("results/clusters-gmm-sklean_lsi250", "w") as log:
        for doc in corpus:
            # 54334 = vocabulary size used when the models were trained
            sparseDoc = sparseData2Matrix(doc, 54334)
            arr = lsi_model.transform(sparseDoc)
            log.write(doc.id + ";" + str(clModel.predict(arr)[0]) + "\n")
            log.flush()  # flush per document so progress is visible while running
from sklearn.decomposition import TruncatedSVD
from main.arffJson.ArffJsonCorpus import ArffJsonCorpus, ArffJsonDocument
import joblib
import numpy as np
from sklearn.cluster import KMeans

# Fit a 63-cluster k-means model on the 250-dim LSI projection of the zbMath
# term-document matrix and persist it with joblib.
corpusFilepath = "/home/simon/Projekte/zbMathClustering/raw_vector.json"
corpus = ArffJsonCorpus(corpusFilepath)
TDM = corpus.toCsrMatrix(shapeCols=54334)

# One-off: LSI model training (already run; kept verbatim for reference).
"""svd = TruncatedSVD(n_components=250)
svd.fit(TDM)
joblib.dump(svd, "lsi250-model")"""

# Project the sparse term-document matrix into the 250-dim LSI space.
lsi_model = joblib.load("lsi250-model")
reduced_tdm = lsi_model.transform(TDM)

clusterer = KMeans(n_clusters=63, init="k-means++", max_iter=100, n_init=10)
clusterer.fit(reduced_tdm)
joblib.dump(clusterer, "km63-sklean_lsi250")

# Dead experiment code, retained verbatim for reference.
"""clModel = joblib.load("km63-sklean_lsi250")
# log = open("clusters-km63-sklearn_lsi250", "w")
log = open("foo", "w")
count = 0
for arr in LSI_TDM:
    # npArray = sparseData2Matrix(doc.data, len(index2chiIndex), index2chiIndex)
    log.write(doc.id + ";" + str(clModel.predict(npArray)[0]) + "\n")
    count += 1
log.close()"""
from sklearn.decomposition import TruncatedSVD
from main.arffJson.ArffJsonCorpus import ArffJsonCorpus, ArffJsonDocument
import joblib
import numpy as np
from sklearn.cluster import AffinityPropagation, MeanShift
from sklearn.mixture import GMM
import random

# Draw a reproducible ~10% sample of the corpus and project it into the
# 250-dim LSI space, groundwork for clustering experiments below.
random.seed(0)

corpusFilepath = "raw_data/raw_vector.json"
corpus = ArffJsonCorpus(corpusFilepath)
# selection keeps each document with probability 0.1 (seed above makes the
# sample stable across runs); 54334 = vocabulary size.
TDM = corpus.toCsrMatrix(shapeCols = 54334, selection = lambda doc: True if random.random() < 0.1 else False)
print "TDM shape: " + str(TDM.shape)

# Load the previously trained LSI (TruncatedSVD) model and reduce the TDM.
svd2 = joblib.load("models/lsi250-model")
LSI_TDM = svd2.transform(TDM)

# AffinityPropagation experiment, never enabled:
#ap = AffinityPropagation(
#    damping=0.95, 
#    max_iter=200, 
#    convergence_iter=15, 
#    copy=True, 
#    preference=None, 
#    affinity='euclidean', 
#    verbose=False
#)

# ap.fit(LSI_TDM)

# NOTE(review): fragment appears truncated here — the string literal below is
# unterminated within this view (MeanShift experiment cut off).
"""ms = MeanShift(
from main.arffJson.ArffJsonCorpus import ArffJsonCorpus, ArffJsonDocument
from sklearn.feature_selection import chi2
import math
from sklearn.cluster import KMeans
import numpy as np
import joblib
import json
from util import get_index_to_word_map, dumpChiScores, chiSetGeq, readChiFile

# Inspect chi-square-selected features per class using a precomputed
# index-to-word dictionary.
corpusFilepath = "/home/simon/Projekte/MIRS/testing_java_ml_libraries/raw_vector.json"
corpus = ArffJsonCorpus(corpusFilepath)
# One-off: chi-square score computation (already run; kept for reference).
"""TDM = corpus.toCsrMatrix()
labelMatrix, classLabel2Number, classNumber2Label = initializeLabelMatrix(corpus)

dumpChiScores(TDM, labelMatrix, classNumber2Label)"""
# One-off: k-means on features with chi-square score >= 2000 (kept for reference).
"""chiSet = chiSetGeq(2000.0)
TDM, index2chiIndex = ArffJsonCorpus(corpusFilepath).toCsrMatrix(chiSet)

f = open("index2chiIndex.json", "w")
print >> f, json.dumps(index2chiIndex)
f.close()

km = KMeans(n_clusters=63, init='k-means++', max_iter=100, n_init=10)
km.fit(TDM)
joblib.dump(km, "km63-allchi2geq2000")"""

# index2chiIndex = dict(map(lambda x: (int(x[0]), x[1]), json.load(open("derived_data/index2chiIndex.json")).items()))
# Map feature indices back to the words they represent.
index2Word = get_index_to_word_map("../testing_java_ml_libraries/dict")

# "00" is presumably the MSC class code for History — TODO confirm.
# NOTE(review): fragment is truncated here — the call below is incomplete
# within this view.
print "00 - History: " + repr(
    map(lambda x: index2Word[x[0]],
# Esempio n. 5 (Example no. 5)
# 0
from sklearn.decomposition import TruncatedSVD
from main.arffJson.ArffJsonCorpus import ArffJsonCorpus, ArffJsonDocument
import joblib
import numpy as np
from sklearn.cluster import KMeans
from sklearn.mixture import GMM
from util import build_csr_matrix

# Assign corpus documents to clusters of a GMM trained on the LSI-250
# projection of the raw-vector corpus.
corpus = ArffJsonCorpus("raw_data/raw_vector.json")
TDM = corpus.toCsrMatrix(shapeCols=54334)

# build lsi model (one-off, already run; kept for reference)
"""lsi_model = TruncatedSVD(n_components=250)
lsi_model.fit(TDM)
joblib.dump(lsi_model, "models/raw_vector-lsi250_model")"""

# build gmm model (one-off, already run; kept for reference)
"""lsi_model = joblib.load("models/raw_vector-lsi250_model")
gmm_model = GMM(n_components=64)
gmm_model.fit(lsi_model.transform(TDM))
joblib.dump(gmm_model, "models/gmm-raw_vector-lsi250")"""

#cluster documents
f = open("results/clusters-gmm-raw_vector-lsi250", "w")
lsi_model = joblib.load("models/raw_vector-lsi250_model")
cl_model = joblib.load("models/gmm-raw_vector-lsi250")

count = 0
for doc in corpus:
    # Re-vectorize each document (54334 = vocabulary size) and project it
    # into the LSI space before cluster assignment.
    doc_vector = lsi_model.transform(
        build_csr_matrix(dict(doc.data), numAttributes=54334))
    # NOTE(review): fragment is truncated here — the loop body (prediction and
    # write to f) is cut off within this view.
from sklearn.decomposition import TruncatedSVD
from main.arffJson.ArffJsonCorpus import ArffJsonCorpus, ArffJsonDocument
import joblib
import numpy as np
from sklearn.cluster import AffinityPropagation, MeanShift
from sklearn.mixture import GMM
import random

# Draw a reproducible ~10% sample of the corpus and project it into the
# 250-dim LSI space (groundwork for the AffinityPropagation experiment below).
random.seed(0)

corpusFilepath = "raw_data/raw_vector.json"
corpus = ArffJsonCorpus(corpusFilepath)
# Keep each document with probability 0.1; the fixed seed above makes the
# sample stable across runs. 54334 = vocabulary size.
TDM = corpus.toCsrMatrix(
    shapeCols=54334,
    selection=lambda doc: random.random() < 0.1,
)
print("TDM shape: " + str(TDM.shape))

# Reduce the sampled term-document matrix with the pre-trained LSI model.
lsi_model = joblib.load("models/lsi250-model")
LSI_TDM = lsi_model.transform(TDM)

# Candidate clusterer, never enabled:
# ap = AffinityPropagation(
#     damping=0.95,
#     max_iter=200,
#     convergence_iter=15,
#     copy=True,
#     preference=None,
#     affinity='euclidean',
#     verbose=False,
# )
# ap.fit(LSI_TDM)
from sklearn.decomposition import TruncatedSVD
from main.arffJson.ArffJsonCorpus import ArffJsonCorpus, ArffJsonDocument
import joblib
import numpy as np
from sklearn.cluster import KMeans

# Fit a 63-cluster k-means model on the 250-dim LSI projection of the zbMath
# term-document matrix and persist it with joblib.
corpusFilepath = "/home/simon/Projekte/zbMathClustering/raw_vector.json"
corpus = ArffJsonCorpus(corpusFilepath)
# 54334 = vocabulary size used when the LSI model was trained.
TDM = corpus.toCsrMatrix(shapeCols=54334)

# One-off: LSI model training (already run; kept for reference).
"""svd = TruncatedSVD(n_components=250)
svd.fit(TDM)
joblib.dump(svd, "lsi250-model")"""

# Project the sparse term-document matrix into the 250-dim LSI space.
svd2 = joblib.load("lsi250-model")
LSI_TDM = svd2.transform(TDM)

km = KMeans(n_clusters=63, init="k-means++", max_iter=100, n_init=10)
km.fit(LSI_TDM)
# "sklean" typo is part of the established on-disk model name — do not fix.
joblib.dump(km, "km63-sklean_lsi250")

# Dead experiment code, retained for reference.
"""clModel = joblib.load("km63-sklean_lsi250")
# log = open("clusters-km63-sklearn_lsi250", "w")
log = open("foo", "w")
count = 0
for arr in LSI_TDM:
    # npArray = sparseData2Matrix(doc.data, len(index2chiIndex), index2chiIndex)
    log.write(doc.id + ";" + str(clModel.predict(npArray)[0]) + "\n")
    count += 1
log.close()"""