def clusterDocument(title, abstract, tokenizer, token2indexMap, tfidf_model, lsi_model, gmm_model):
    tokens = tokenize(title + " " + abstract, tokenizer)
    tokenCounts = groupAndCount(tokens)
    matrix = build_csr_matrix(listOfMaps=[tokenCounts], token2IndexMap=token2indexMap)
    transformedMatrix = lsi_model.transform(tfidf_model.transform(matrix))
    prediction = gmm_model.predict(transformedMatrix)[0]
    return prediction
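# build_csr_matrix is a project helper used throughout these examples but never defined in
# them (the keyword spellings listOfMaps/list_of_maps, token2IndexMap/token2index_map etc.
# vary between snippets). A minimal sketch of what such a helper could look like, assuming
# each map is a {token: count} dict and the index map assigns a column to each known token;
# this is an illustration, not the project's actual implementation:
import numpy as np
from scipy.sparse import csr_matrix

def build_csr_matrix_sketch(listOfMaps, token2IndexMap=None, numAttributes=None):
    if token2IndexMap is not None:
        numAttributes = len(token2IndexMap)
    data, indices, indptr = [], [], [0]
    for tokenCounts in listOfMaps:
        row = {}
        for token, count in tokenCounts.items():
            # Tokens without a column are silently skipped; with no index map the keys are
            # assumed to already be column indices.
            col = token2IndexMap.get(token) if token2IndexMap is not None else int(token)
            if col is not None and 0 <= col < numAttributes:
                row[col] = row.get(col, 0) + count
        for col in sorted(row):
            indices.append(col)
            data.append(row[col])
        indptr.append(len(data))
    return csr_matrix((np.array(data), np.array(indices), np.array(indptr)),
                      shape=(len(listOfMaps), numAttributes))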
import numpy as np
from scipy.sparse import csr_matrix


def horizontally_combine_matrixes(matrixList):
    # Stacks the given CSR matrices row-wise; every input must have the same number of columns.
    if len(matrixList) == 0:
        return build_csr_matrix(listOfMaps=[], numAttributes=0)

    assert len(matrixList) >= 1

    newData = []
    newIndices = []
    newIndptr = [0]

    numNonZerosInMat = [len(matrix.data) for matrix in matrixList]
    numCols = matrixList[0].shape[1]
    numRows = 0

    count = 0
    offset = 0
    for matrix in matrixList:
        if matrix.shape[1] != numCols:
            raise ValueError("Num attributes of matrices for horizontal combination must be identical")
        numRows += matrix.shape[0]

        newData.extend(matrix.data)
        newIndices.extend(matrix.indices)
        newIndptr.extend(map(lambda x: x + offset, matrix.indptr[1:]))

        offset += numNonZerosInMat[count]
        count += 1

    return csr_matrix((np.array(newData), np.array(newIndices), np.array(newIndptr)), shape=(numRows, numCols))
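# A minimal usage sketch for horizontally_combine_matrixes. Despite the name it stacks its
# inputs row-wise (all inputs share the same column count), so the result matches
# scipy.sparse.vstack; the toy matrices below are illustrative only.
from scipy.sparse import csr_matrix, vstack

a = csr_matrix([[1, 0, 2], [0, 0, 3]])
b = csr_matrix([[0, 4, 0]])
combined = horizontally_combine_matrixes([a, b])
assert combined.shape == (3, 3)
assert (combined.toarray() == vstack([a, b]).toarray()).all()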
# Excerpted classmethod (shown here without its surrounding class); converts a single
# document into a one-row CSR feature matrix.
def doc2mat(cls, doc, tokenizer, token2index_map):
    tokens = zbMathTokenizer.doc2tokens(doc, tokenizer)
    feature_vector = group_and_count(tokens)

    mat = build_csr_matrix([feature_vector], token2index_map=token2index_map)

    return mat
def build_raw_csr_matrix(items, token2index_map):
    item_maps = []
    item_id_log = []

    item_count = 1
    for item_id, tokens in items:
        print(str(item_id) + " (" + str(item_count) + ")")

        item_maps.append(group_and_count(tokens))
        item_id_log.append(item_id)
        item_count += 1

    m = build_csr_matrix(list_of_maps=item_maps, token_2_index_map=token2index_map)
    return m, item_id_log
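# group_and_count (spelled groupAndCount in some snippets above) is another project helper
# that is used but never shown. A minimal sketch, assuming it simply tallies token
# occurrences into a {token: count} dict:
from collections import Counter

def group_and_count_sketch(tokens):
    # ["a", "b", "a"] -> {"a": 2, "b": 1}
    return dict(Counter(tokens))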
def get_author_msc_matrix():
    if not force_gen and all(os.path.isfile(filename) for filename in [
            "derived_data/author_msc_map.npz",
            "derived_data/author_msc_map__row2author_name.json",
            "derived_data/author_msc_map__col2msc_code.json"]):

        mat = load_csr_matrix("derived_data/author_msc_map.npz")
        with open("derived_data/author_msc_map__row2author_name.json") as f:
            row2author_map = json.load(f)

        with open("derived_data/author_msc_map__col2msc_code.json") as f:
            col2msc_map = json.load(f)

        return mat, row2author_map, col2msc_map
    else:
        author2msc_map = defaultdict(lambda: defaultdict(int))
        cursor().execute("""SELECT display_name, msc, COUNT(*) FROM authorship
                            JOIN msc_assignment ON authorship.document = msc_assignment.document
                        WHERE authorship.rank <= 2 AND msc_assignment.pos <= 3
                        GROUP BY display_name, msc
                        ORDER BY display_name""")

        for row in cursor():
            author2msc_map[row[0]][row[1][:2]] += row[2]

        author_names, msc_counts = zip(*author2msc_map.items())

        msc_code2index_map = dict(zip(msc_classes, range(len(msc_classes))))
        col2msc_map = {index: msc for msc, index in msc_code2index_map.items()}

        mat = build_csr_matrix(msc_counts, token2index_map=msc_code2index_map)
        save_csr_matrix(mat, "derived_data/author_msc_map")

        row2author_map = dict(zip(range(len(author_names)), author_names))
        with open("derived_data/author_msc_map__row2author_name.json", "w") as f:
            json.dump(row2author_map, f)

        with open("derived_data/author_msc_map__col2msc_code.json", "w") as f:
            json.dump(col2msc_map, f)

        return mat, row2author_map, col2msc_map
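# save_csr_matrix / load_csr_matrix persist a scipy CSR matrix as an .npz file. A minimal
# sketch of a compatible pair, assuming the data/indices/indptr/shape arrays are stored
# directly (the project's actual file layout may differ):
import numpy as np
from scipy.sparse import csr_matrix

def save_csr_matrix_sketch(mat, filename):
    # np.savez appends ".npz" to the filename, matching the load calls above.
    np.savez(filename, data=mat.data, indices=mat.indices, indptr=mat.indptr,
             shape=np.array(mat.shape))

def load_csr_matrix_sketch(filename):
    loader = np.load(filename)
    return csr_matrix((loader["data"], loader["indices"], loader["indptr"]),
                      shape=tuple(loader["shape"]))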
def get_author_theorem_matrix(setting):
    if not force_gen and all(os.path.isfile(filename) for filename in [
            "derived_data/" + setting_string(**setting) + "__raw_author_matrix.npz",
            "derived_data/" + setting_string(**setting) + "__raw_author_matrix_row2author.json",
            "derived_data/" + setting_string(**setting) + "__raw_author_matrix_col2item.json"]):

        mat = load_csr_matrix("derived_data/" + setting_string(**setting) + "__raw_author_matrix.npz")
        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_row2author.json") as f:
            row2author_map = json.load(f)

        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_col2item.json") as f:
            col2item_map = json.load(f)

        return mat, row2author_map, col2item_map
    else:
        author_set = set()
        item_id_set = set()
        with open("derived_data/" + setting_string(**setting) + "__ids_with_authors") as f:
            for line in f:
                x = line.split(";")
                if setting['granularity'] == 'documents':
                    item_id_set.add(x[0])
                    offset = 1
                elif setting['granularity'] == 'paragraphs':
                    item_id_set.add((x[0], x[1]))
                    offset = 2
                else:
                    raise ValueError("Unknown granularity: " + str(setting['granularity']))

                for i in range(offset, len(x)):
                    author_set.add(x[i].strip())

        count = 0
        item2index_map = {}
        with open("derived_data/" + setting_string(**setting) + "__processed_ids") as f:
            for line in f:
                item2index_map[line.strip()] = count
                count += 1

        # item2index_map = dict(zip(sorted(list(item_id_set)), range(len(item_id_set))))
        author2index_map = dict(zip(sorted(list(author_set)), range(len(author_set))))

        author_item_indexes = [{} for _ in range(len(author2index_map))]
        with open("derived_data/" + setting_string(**setting) + "__ids_with_authors") as f:
            for line in f:
                x = line.split(";")
                if setting['granularity'] == 'documents':
                    item_index = item2index_map.get(x[0])
                    offset = 1
                elif setting['granularity'] == 'paragraphs':
                    item_index = item2index_map.get((x[0], x[1]))
                    offset = 2
                else:
                    raise ValueError("Unknown granularity: " + str(setting['granularity']))

                if item_index is not None:
                    for i in range(offset, len(x)):
                        author_index = author2index_map[x[i].strip()]
                        author_item_indexes[author_index][item_index] = 1.0

        mat = build_csr_matrix(list_of_dicts=author_item_indexes, num_attributes=len(item2index_map))
        save_csr_matrix(mat, "derived_data/" + setting_string(**setting) + "__raw_author_matrix")

        row2author_map = {index: author for author, index in author2index_map.items()}
        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_row2author.json", "w") as f:
            json.dump(row2author_map, f)

        col2item_map = {index: item for item, index in item2index_map.items()}
        with open("derived_data/" + setting_string(**setting) + "__raw_author_matrix_col2item.json", "w") as f:
            json.dump(col2item_map, f)

        return mat, row2author_map, col2item_map
from sklearn.preprocessing import Normalizer


def processFeatureCounts(featureCounts, token2Id, tfidfModel):
    m = build_csr_matrix(listOfMaps=[featureCounts], token2IndexMap=token2Id)
    return Normalizer().transform(tfidfModel.transform(m))
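# A minimal usage sketch for processFeatureCounts, assuming tfidfModel is a fitted sklearn
# TfidfTransformer, token2Id maps tokens to column indices, and a build_csr_matrix helper
# like the one sketched near the top is available. The tiny vocabulary below is
# illustrative only.
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfTransformer

token2Id = {"alpha": 0, "beta": 1, "gamma": 2}
trainingCounts = csr_matrix([[2, 0, 1], [0, 3, 1]])
tfidfModel = TfidfTransformer().fit(trainingCounts)

vector = processFeatureCounts({"alpha": 1, "gamma": 2}, token2Id, tfidfModel)
print(vector.shape)  # (1, 3): tf-idf weighted and L2-normalized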
import joblib
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.mixture import GMM  # GaussianMixture in current scikit-learn versions
from util import build_csr_matrix
# ArffJsonCorpus is a project-specific corpus reader; its import is omitted in the original.

corpus = ArffJsonCorpus("raw_data/raw_vector.json")
TDM = corpus.toCsrMatrix(shapeCols=54334)

# build lsi model
"""lsi_model = TruncatedSVD(n_components=250)
lsi_model.fit(TDM)
joblib.dump(lsi_model, "models/raw_vector-lsi250_model")"""

# build gmm model
"""lsi_model = joblib.load("models/raw_vector-lsi250_model")
gmm_model = GMM(n_components=64)
gmm_model.fit(lsi_model.transform(TDM))
joblib.dump(gmm_model, "models/gmm-raw_vector-lsi250")"""

#cluster documents
f = open("results/clusters-gmm-raw_vector-lsi250", "w")
lsi_model = joblib.load("models/raw_vector-lsi250_model")
cl_model = joblib.load("models/gmm-raw_vector-lsi250")

count = 0
for doc in corpus:
    doc_vector = lsi_model.transform(
        build_csr_matrix(dict(doc.data), numAttributes=54334))
    f.write(doc.id + ";" + str(cl_model.predict(doc_vector)[0]) + "\n")
    count += 1
f.close()