def getArts(fname, minFreq):
    """Load English single-class articles from `fname`, limited to frequent MSC classes.

    Opens the article database at ``common.dataFile(fname)`` and keeps the
    records whose language is 'eng'.  Every article's MSC codes are reduced to
    the distinct values of ``mscs.niceMSC(code, prefix=2)[0]``; only articles
    that end up with exactly one such class, have a non-empty body and whose
    ``id_int`` occurs in ``ipyutils.rdocids`` (loaded from the 'gensim_eng'
    dictionaries) are retained.  Body and title are decoded from UTF-8,
    translated through ``BAD_MAP`` and encoded back to UTF-8.  Finally only
    articles whose class is in ``mscs.getFreqMSCs(arts, minFreq,
    useMains=True)`` survive; the per-class article counts are logged.

    fname   -- database file name, resolved through common.dataFile
    minFreq -- frequency threshold handed to mscs.getFreqMSCs

    Returns the list of remaining Article objects.
    """
    db = ArticleDB.ArticleDB(common.dataFile(fname), mode='open')
    import ipyutils
    language = 'eng'
    ipyutils.loadDicts(prefix='gensim_' + language)

    def _shortMsc(code):
        # normalised form of one MSC code as used throughout this function
        return mscs.niceMSC(code, prefix=2)[0]

    def _sanitize(text):
        # utf8 bytes -> unicode -> BAD_MAP translation -> utf8 bytes
        return text.decode('utf8').translate(BAD_MAP).encode('utf8')

    arts = []
    for rec in db.db:
        if rec['language'] == language:
            arts.append(Article.Article(rec))
    for art in arts:
        art.msc = list(set(_shortMsc(code) for code in art.msc))
    logging.info('loaded %i articles from %s' % (len(arts), fname))

    kept = []
    for art in arts:
        if len(art.msc) != 1:
            continue
        if not art.body:
            continue
        if art.id_int not in ipyutils.rdocids:
            continue
        kept.append(art)
    arts = kept
    logging.info('extracted %i articles with exactly one MSC and non-empty body' % len(arts))

    # the frequent classes are determined before the text clean-up below
    okmsc = mscs.getFreqMSCs(arts, minFreq, useMains=True)
    mscs.printStats(arts)
    for art in arts:
        art.body = _sanitize(art.body)
        if art.title:
            art.title = _sanitize(art.title)
        art.msc = [_shortMsc(art.msc[0])]
    arts = [art for art in arts if art.msc[0] in okmsc]

    allmsc, _mainmsc = mscs.getMSCCnts(arts)
    total = 0
    for msc, mscarts in allmsc.iteritems():
        logging.info("class %s: %i articles" % (msc, len(mscarts)))
        total += len(mscarts)
    logging.info("======================")
    logging.info("sum: %i articles" % total)

    logging.debug('using %i articles from all %s msc classes that are covered by at least %i articles' % (len(arts), sorted(okmsc), minFreq))
    return arts
Beispiel #2
0
def buildMscOverlapMatrix(lang):
    """Build, save and return the MSC class co-occurrence ("overlap") matrix.

    Articles are taken from ``docsim.getArts(ARTS_FILE, acceptNoBody=True,
    acceptNoMsc=False)`` and restricted to `lang` (all of them when `lang` is
    'any').  For every article each ordered pair of its MSC codes (a code
    paired with itself included) increments the matching cell of an integer
    count matrix indexed through ``loadMsc2Id(lang)``.  Each row is then
    rescaled with ``log(1 + 100 * count / rowMax) / log(101)`` so that the
    row maximum maps to 1.0 and a zero count to 0.0.

    Rows of classes that occur in no article have ``rowMax == 0``; they are
    left at 0.0 instead of being filled with NaN from a division by zero.

    The float32 result is written (dense) to
    ``common.matrixFile("mscs_overlap_<lang>.mm")`` and returned.
    """
    logging.info("building MSC overlap matrix")

    arts = [
        art for art in docsim.getArts(
            ARTS_FILE, acceptNoBody=True, acceptNoMsc=False)
        if art.language == lang or lang == 'any'
    ]
    cats, rcats = loadMsc2Id(lang)  # from buildPure

    logging.info("computing MSC matrices")

    size = len(cats)
    overlap = numpy.zeros(
        (size, size), dtype=int
    )  # binary msc similarity (with fixed msc hierarchy = identity)
    for art in arts:
        # resolve each code to its matrix index once per article
        mscIds = [rcats[mscs.niceMSC(msc)[0]] for msc in art.msc]
        for id1 in mscIds:
            for id2 in mscIds:
                overlap[id1, id2] += 1

    resultOverlap = numpy.zeros((size, size), dtype=numpy.float32)
    logBase = math.log(101)
    for i in xrange(resultOverlap.shape[0]):
        rowMax = numpy.max(overlap[i])
        if rowMax == 0:
            # class without any article: keep the row at 0.0 rather than NaN
            continue
        for j in xrange(resultOverlap.shape[1]):
            resultOverlap[i, j] = math.log(
                1.0 + 100.0 * overlap[i, j] / rowMax) / logBase

    matutils.saveMatrix(resultOverlap,
                        common.matrixFile("mscs_overlap_%s.mm" % lang),
                        sparse=False)
    return resultOverlap
def buildMscOverlapMatrix(lang):
    """Compute the row-normalised MSC co-occurrence matrix, save it and return it.

    Source articles: ``docsim.getArts(ARTS_FILE, acceptNoBody=True,
    acceptNoMsc=False)``, filtered by ``art.language == lang`` unless `lang`
    is "any".  ``overlap[a, b]`` counts how many times classes a and b appear
    together in one article's MSC list (all ordered pairs, a class with itself
    included); class indices come from ``loadMsc2Id(lang)``.

    Every row is mapped through ``log(1 + 100 * count / rowMax) / log(101)``.
    A row whose maximum is 0 (class never seen) stays all-zero; without this
    guard the division by zero produced NaN entries in the saved matrix.

    The float32 matrix is saved (dense) as
    ``common.matrixFile("mscs_overlap_<lang>.mm")`` and returned.
    """
    logging.info("building MSC overlap matrix")

    arts = [
        art
        for art in docsim.getArts(ARTS_FILE, acceptNoBody=True, acceptNoMsc=False)
        if art.language == lang or lang == "any"
    ]
    cats, rcats = loadMsc2Id(lang)  # from buildPure

    logging.info("computing MSC matrices")

    overlap = numpy.zeros(
        (len(cats), len(cats)), dtype=int
    )  # binary msc similarity (with fixed msc hierarchy = identity)
    for art in arts:
        indices = [rcats[mscs.niceMSC(msc)[0]] for msc in art.msc]
        for first in indices:
            for second in indices:
                overlap[first, second] += 1

    resultOverlap = numpy.zeros((len(cats), len(cats)), dtype=numpy.float32)
    normaliser = math.log(101)
    for i, counts in enumerate(overlap):
        rowMax = counts.max()
        if not rowMax:
            # nothing to scale and dividing by zero would yield NaN
            continue
        for j, count in enumerate(counts):
            resultOverlap[i, j] = math.log(1.0 + 100.0 * count / rowMax) / normaliser

    matutils.saveMatrix(resultOverlap, common.matrixFile("mscs_overlap_%s.mm" % lang), sparse=False)
    return resultOverlap
def buildDocDoc(arts, type, language):
    """Build, save and return a dense document-by-document similarity matrix.

    The 'gensim_<language>' dictionaries are loaded and `arts` is reduced to
    the articles whose ``id_int`` is in ``ipyutils.rdocids``; the function
    asserts that this leaves exactly ``len(ipyutils.rdocids)`` articles.  The
    MSC similarity matrix is read from ``common.matrixFile("mscs_<type>.mm")``.
    The score of two documents is the largest matrix entry over all pairs of
    their MSC ids, never below 0.0.  Rows and columns of the result are
    addressed by the values stored in ``ipyutils.rdocids``.

    arts     -- iterable of articles (needs ``id_int`` and ``msc``)
    type     -- name of the MSC similarity matrix to use
    language -- language used for the dictionaries and the msc<->id mapping

    The matrix is saved to ``docdoc_<language>_<type>.mm`` and returned.
    """
    ipyutils.loadDicts(prefix="gensim_" + language)
    arts = [art for art in arts if art.id_int in ipyutils.rdocids]
    assert len(arts) == len(ipyutils.rdocids)

    logging.info("loading msc<->id mapping")
    cats, rcats = loadMsc2Id(language)

    mscsFile = common.matrixFile("mscs_%s.mm" % type)
    matMsc = matutils.loadMatrix(mscsFile)

    # one pass: remember each article's matrix index and its MSC ids
    docIds = []
    mscDict = {}
    for art in arts:
        artId = ipyutils.rdocids[art.id_int]
        mscDict[artId] = [rcats[mscs.niceMSC(code)[0]] for code in art.msc]
        docIds.append(artId)

    logging.info("computing doc*doc similarity matrix based on %s" % mscsFile)
    total = len(arts)
    docdoc = numpy.zeros((total, total), numpy.float32)

    for i, art1Id in enumerate(docIds):
        if i % 100 == 0:
            logging.info("PROGRESS: %i/%i" % (i, total))
        mscs1 = mscDict[art1Id]
        # only the upper triangle is computed; the value is mirrored below
        for j in xrange(i, total):
            art2Id = docIds[j]
            mscs2 = mscDict[art2Id]
            bestScore = 0.0
            for msc1Id in mscs1:
                for msc2Id in mscs2:
                    bestScore = max(bestScore, matMsc[msc1Id, msc2Id])
            docdoc[art1Id, art2Id] = docdoc[art2Id, art1Id] = bestScore

    outFile = common.matrixFile("docdoc_" + language + "_%s.mm" % type)
    matutils.saveMatrix(docdoc, outFile, sparse=False)
    return docdoc
def getArtsWithMsc(mscCode):
    """Return the ``id_int`` of every article in `arts` filed under `mscCode`.

    An article matches when at least one of its MSC codes, normalised with
    ``mscs.niceMSC(msc, prefix=len(mscCode))[0]``, equals `mscCode`.  `arts`
    is the module-level article collection.

    Each article is tested directly.  The previous version first built a dict
    keyed by ``id_int`` for the whole collection, so articles sharing an id
    were all judged by the codes of the last one.
    """
    import mscs
    prefixLen = len(mscCode)
    matching = []
    for art in arts:
        if any(mscs.niceMSC(msc, prefix=prefixLen)[0] == mscCode
               for msc in art.msc):
            matching.append(art.id_int)
    return matching
Beispiel #6
0
def getArts(fname, minFreq):
    """Return the English articles of `fname` that carry one frequent MSC class.

    Steps, in order:
      1. open ``common.dataFile(fname)`` as an ArticleDB and load the
         'gensim_eng' dictionaries through ipyutils;
      2. wrap every record with language 'eng' in an Article and replace its
         MSC list by the distinct ``mscs.niceMSC(msc, prefix=2)[0]`` values;
      3. keep articles with exactly one class, a non-empty body and an
         ``id_int`` found in ``ipyutils.rdocids``;
      4. decode body/title from UTF-8, translate them through ``BAD_MAP`` and
         encode them back to UTF-8;
      5. keep articles whose class is in ``mscs.getFreqMSCs(arts, minFreq,
         useMains=True)`` and log the number of articles per class.

    fname   -- database file name (resolved through common.dataFile)
    minFreq -- frequency threshold passed on to mscs.getFreqMSCs
    """
    db = ArticleDB.ArticleDB(common.dataFile(fname), mode='open')
    import ipyutils
    language = 'eng'
    ipyutils.loadDicts(prefix='gensim_' + language)

    arts = [Article.Article(rec) for rec in db.db if rec['language'] == language]
    for art in arts:
        distinct = set()
        for msc in art.msc:
            distinct.add(mscs.niceMSC(msc, prefix=2)[0])
        art.msc = list(distinct)
    logging.info('loaded %i articles from %s' % (len(arts), fname))

    def isUsable(candidate):
        # exactly one class, some body text, and known to the loaded dictionaries
        return (len(candidate.msc) == 1 and candidate.body
                and candidate.id_int in ipyutils.rdocids)

    arts = [art for art in arts if isUsable(art)]
    logging.info(
        'extracted %i articles with exactly one MSC and non-empty body' %
        len(arts))

    okmsc = mscs.getFreqMSCs(arts, minFreq, useMains=True)
    mscs.printStats(arts)

    for art in arts:
        bodyText = art.body.decode('utf8')
        art.body = bodyText.translate(BAD_MAP).encode('utf8')
        if art.title:
            titleText = art.title.decode('utf8')
            art.title = titleText.translate(BAD_MAP).encode('utf8')
        onlyMsc = art.msc[0]
        art.msc = [mscs.niceMSC(onlyMsc, prefix=2)[0]]

    arts = [art for art in arts if art.msc[0] in okmsc]

    allmsc, mainmsc = mscs.getMSCCnts(arts)
    for msc, mscarts in allmsc.iteritems():
        logging.info("class %s: %i articles" % (msc, len(mscarts)))
    logging.info("======================")
    articleSum = sum(len(mscarts) for mscarts in allmsc.itervalues())
    logging.info("sum: %i articles" % articleSum)

    usedClasses = sorted(okmsc)
    logging.debug(
        'using %i articles from all %s msc classes that are covered by at least %i articles'
        % (len(arts), usedClasses, minFreq))
    return arts
def toNLTK(_mat, prefix=2):
    """Turn the columns of `_mat` into a list of ``(features, label)`` pairs.

    Column ``i`` describes the document ``docids[i]``.  ``features`` maps the
    row index of every non-zero entry in that column to its value; ``label``
    is ``mscs.niceMSC(first_msc, prefix=prefix)[0]`` for the first MSC code of
    the matching article in the module-level `arts`.

    _mat   -- dense array or scipy sparse matrix (sparse input is converted
              to CSC for column access)
    prefix -- forwarded to mscs.niceMSC

    Raises Exception when a document id does not match exactly one article or
    when the matching article has no MSC code.
    """
    import mscs
    isSparse = scipy.sparse.issparse(_mat)
    mat = _mat.tocsc() if isSparse else _mat

    # index the articles once instead of scanning `arts` for every column
    artsById = {}
    for art in arts:
        artsById.setdefault(art.id_int, []).append(art)

    result = []
    for i in xrange(mat.shape[1]):
        column = mat[:, i].toarray() if isSparse else mat[:, i]
        nnind = column.nonzero()[0]
        nnvals = column.take(nnind)
        features = dict(zip(nnind, nnvals))
        docId = docids[i]
        okarts = artsById.get(docId, [])
        if len(okarts) != 1:
            # both placeholders need a value; the old single argument made
            # this line fail with a TypeError instead of the intended message
            raise Exception('%i articles with id=%s' % (len(okarts), repr(docId)))
        labels = okarts[0].msc
        if len(labels) < 1:
            raise Exception('no msc for %s' % (docId,))
        result.append((features, mscs.niceMSC(labels[0], prefix=prefix)[0]))
    return result
Beispiel #8
0
def buildDocDoc(arts, type, language):
    """Compute the doc*doc similarity matrix induced by an MSC similarity matrix.

    After loading the 'gensim_<language>' dictionaries, only articles known to
    ``ipyutils.rdocids`` are used (an assertion checks that their number
    equals ``len(ipyutils.rdocids)``).  For each pair of documents the result
    holds the maximum of ``matMsc[m1, m2]`` over the MSC ids m1, m2 of the two
    documents, with 0.0 as lower bound; ``matMsc`` is loaded from
    ``common.matrixFile("mscs_<type>.mm")``.  Matrix positions are the values
    of ``ipyutils.rdocids``.

    The float32 matrix is saved (dense) to ``docdoc_<language>_<type>.mm``
    and returned.
    """
    ipyutils.loadDicts(prefix='gensim_' + language)
    known = []
    for art in arts:
        if art.id_int in ipyutils.rdocids:
            known.append(art)
    arts = known
    assert (len(arts) == len(ipyutils.rdocids))

    logging.info("loading msc<->id mapping")
    cats, rcats = loadMsc2Id(language)

    mscsFile = common.matrixFile("mscs_%s.mm" % type)
    matMsc = matutils.loadMatrix(mscsFile)
    mscDict = {}
    for art in arts:
        mscDict[ipyutils.rdocids[art.id_int]] = [
            rcats[mscs.niceMSC(msc)[0]] for msc in art.msc
        ]

    logging.info("computing doc*doc similarity matrix based on %s" % mscsFile)
    count = len(arts)
    docdoc = numpy.zeros((count, count), numpy.float32)

    for i in xrange(count):
        if i % 100 == 0:
            logging.info("PROGRESS: %i/%i" % (i, count))
        firstId = ipyutils.rdocids[arts[i].id_int]
        firstMscs = mscDict[firstId]
        # pair article i with itself and every later article; mirror the value
        for other in arts[i:]:
            secondId = ipyutils.rdocids[other.id_int]
            candidates = [
                matMsc[m1, m2] for m1 in firstMscs for m2 in mscDict[secondId]
            ]
            # leading 0.0 is the floor used when no candidate is larger
            bestScore = max([0.0] + candidates)
            docdoc[firstId, secondId] = docdoc[secondId, firstId] = bestScore

    matutils.saveMatrix(docdoc,
                        common.matrixFile("docdoc_" + language +
                                          "_%s.mm" % type),
                        sparse=False)
    return docdoc
def toNLTK(_mat, prefix = 2):
    """Convert a feature-by-document matrix into ``(features, label)`` tuples.

    For column ``i`` of `_mat` the non-zero entries become a dict
    ``{row index: value}``.  The label is taken from the single article in the
    module-level `arts` whose ``id_int`` equals ``docids[i]``: its first MSC
    code passed through ``mscs.niceMSC(..., prefix=prefix)[0]``.

    _mat   -- dense array or scipy sparse matrix (sparse input is converted
              to CSC)
    prefix -- forwarded to mscs.niceMSC

    Raises Exception if zero or several articles share the document id, or if
    the article has an empty MSC list.
    """
    import mscs
    if scipy.sparse.issparse(_mat):
        mat = _mat.tocsc()
    else:
        mat = _mat
    matIsSparse = scipy.sparse.issparse(mat)  # does not change inside the loop
    result = []
    for i in xrange(mat.shape[1]):
        if matIsSparse:
            a0 = mat[:, i].toarray()
        else:
            a0 = mat[:, i]
        nnind = a0.nonzero()[0]
        nnvals = a0.take(nnind)
        features = dict(zip(nnind, nnvals))
        docId = docids[i]
        okarts = [a for a in arts if a.id_int == docId]
        if len(okarts) != 1:
            # the format string has two placeholders: article count and id
            raise Exception('%i articles with id=%s' % (len(okarts), repr(docId)))
        labels = okarts[0].msc
        if len(labels) < 1:
            raise Exception('no msc for %s' % (docId,))
        result.append((features, mscs.niceMSC(labels[0], prefix = prefix)[0]))
    return result
    matLum = getLum(mat).astype(numpy.uint8)
    fname = "%s_sim_lum%i_%s.png" % (SIM_TYPE, LUM_SCALE, id)
    logging.debug("saving msc=%s similarity luminescence matrix to %s" %
                  (id, fname))
    logging.info("saving to %s" % fname)
    i = Image.fromarray(matLum, 'L')
    i.save(fname)


# Script section: load the articles and build `art2msc`, a sorted list of
# (full MSC list, internal id, reduced MSC tuple) for the English articles
# that end up with exactly one reduced MSC class.
logging.info("loading articles")
# alternative input, kept for reference:
#arts = docsim.getArts(common.dbFile('serial_eng', '1msc'))
arts = docsim.getArts('/home/radim/workspace/data/dml/results/serial_msc.pdl')

for art in arts:
    # keep a copy of the original codes before art.msc is replaced
    art.fullmsc = art.msc[:]
    # removeDup is defined elsewhere; presumably it drops repeated classes -- confirm
    art.msc = tuple(removeDup([mscs.niceMSC(msc)[0] for msc in art.msc]))
# keep only articles known to ipyutils.rdocids, in English, with a single class
arts = [
    art for art in arts if art.id_int in ipyutils.rdocids
    and art.language == "eng" and len(art.msc) == 1
]

art2msc = [(art.fullmsc, art.id_int, art.msc) for art in arts]
# tuple ordering: sorted by full MSC list first, then id, then reduced MSC
art2msc.sort()

# the article objects are not needed once art2msc has been built
del arts

logging.info("len(art2msc)=%i" % len(art2msc))
print "first ten art2msc:", art2msc[:10]
print "last ten art2msc:", art2msc[-10:]

# shallow copy of the list in its current three-field form
art2mscOld = art2msc[:]
Beispiel #11
0
def buildMscCentroidMatrix(language):
    """Build, save and return the cosine-similarity matrix of MSC class centroids.

    Articles come from ``docsim.getArts(ARTS_FILE, acceptNoBody=False,
    acceptNoMsc=False)``, restricted to `language` (all of them when
    `language` is 'any').  For every MSC class the TFIDF rows of its articles
    are averaged into a centroid; an article contributes once to each distinct
    ``mscs.niceMSC(msc)[0]`` class it has.  The result contains
    ``matutils.cossim`` for every pair of centroids, with non-finite values
    stored as 0.0.  It is saved (dense) to
    ``common.matrixFile("mscs_centroid_<language>.mm")`` and returned.

    Raises Exception when the number of articles disagrees with the loaded
    dictionaries or with the number of rows of the TFIDF matrix.
    """
    logging.info("building MSC centroid matrix from %s" % ARTS_FILE)
    arts = [
        art for art in docsim.getArts(
            ARTS_FILE, acceptNoBody=False, acceptNoMsc=False)
        if art.language == language or language == 'any'
    ]
    prefix = 'mscs_serial_%s_' % language
    matFile = common.matrixFile(prefix + 'TFIDF_T.mm')
    # NOTE(review): the matrix obtained here is replaced further down by the
    # 'gensim_<language>TFIDF_T.mm' matrix before it is ever read.  The step is
    # kept because buildTFIDFMatrices may be what produces the dictionaries
    # loaded right after it -- confirm before removing.
    if os.path.exists(matFile):
        logging.warning(
            'SKIPPING creating TFIDF matrix for %s (file %s present). Is this what you wanted?'
            % (language, matFile))
        tfidf = matutils.loadMatrix(matFile).tocsr()
    else:
        logging.info('creating TFIDF matrix for %s to %s' %
                     (language, matFile))
        tfidf = docsim.buildTFIDFMatrices(arts,
                                          prefix=prefix,
                                          saveMatrices=False).tocsr()

    ipyutils.loadDicts(prefix=prefix)
    arts = [
        art for art in arts if art.id_int in ipyutils.rdocids
    ]  # remove articles that had empty body (according to their tfidf vector)
    if len(ipyutils.rdocids) != len(arts):
        logging.error(
            "no. of TFIDF document = %i, but there are %i documents in the database (mismatch)"
            % (len(ipyutils.rdocids), len(arts)))
        raise Exception(
            "different size of database/dictionary; version mismatch?")

    cats, rcats = loadMsc2Id(language)  # from buildPure

    logging.info("loading tfidf collection matrix (for centroids)")
    tfidf = matutils.loadMatrix(
        common.matrixFile('gensim_' + language + 'TFIDF_T.mm')).tocsr()
    logging.debug("loaded %ix%i matrix" % tfidf.shape)

    logging.info("computing centroids")
    # builtin float/int select the same dtypes as the numpy.float/numpy.int
    # aliases, which no longer exist in current NumPy releases
    centroids = numpy.zeros((len(cats), tfidf.shape[1]), float)
    num = numpy.zeros((len(cats), ), int)
    artCnt = 0
    # every article here is in ipyutils.rdocids (filtered above), so no
    # membership re-check is needed inside the loop
    for art in arts:
        artCnt += 1
        artId = ipyutils.rdocids[art.id_int]
        # only count each top-level class once per article (e.g. 30H55 and
        # 30.13 both count as class 30, a single time)
        tops = set(mscs.niceMSC(msc)[0] for msc in art.msc)
        if tops:
            # densify the article's row once, not once per class
            vec = tfidf[artId].toarray().ravel()
            for top in tops:
                mscId = rcats[top]
                centroids[mscId] += vec
                num[mscId] += 1
        if artCnt < 10 or artCnt % 1000 == 0:
            logging.debug(
                "sanity check - article %s has id %i and has mscs=%s, mscsIds=%s"
                % (art.id_int, artId, art.msc,
                   [rcats[mscs.niceMSC(msc)[0]] for msc in art.msc]))
    if artCnt != tfidf.shape[0]:
        raise Exception("not all articles used; database/matrix mismatch?")
    for i, vec in enumerate(centroids):
        logging.info(
            "centroid for msc %s (id %i) is an average of %i vectors" %
            (cats[i], i, num[i]))
        if numpy.sum(numpy.abs(vec)) == 0:
            logging.warning("empty centroid for msc %s (msc int id %i)" %
                            (cats[i], i))
    # classes without articles divide 0 by 0 here; the resulting non-finite
    # similarities are replaced by 0.0 when the matrix is filled below
    for mscId in cats.iterkeys():
        centroids[mscId] /= num[mscId]
    logging.info(
        "used %i articles for %i vectors (articles may have more than one msc and so can be counted more than once)"
        % (artCnt, sum(num)))

    logging.info("computing MSC centroid matrix")
    resultCentroid = numpy.zeros((len(cats), len(cats)), dtype=numpy.float32)
    for idi, cati in cats.iteritems():
        for idj, catj in cats.iteritems():
            sim = matutils.cossim(centroids[idi], centroids[idj])
            resultCentroid[idi, idj] = sim if numpy.isfinite(sim) else 0.0

    matutils.saveMatrix(resultCentroid,
                        common.matrixFile("mscs_centroid_%s.mm" % language),
                        sparse=False)
    # returned for parity with buildMscOverlapMatrix; earlier callers that
    # ignored the (None) result are unaffected
    return resultCentroid
	print mat
	matLum = getLum(mat).astype(numpy.uint8)
	fname = "%s_sim_lum%i_%s.png" % (SIM_TYPE, LUM_SCALE, id)
	logging.debug("saving msc=%s similarity luminescence matrix to %s" % (id, fname))
	logging.info("saving to %s" % fname)
	i = Image.fromarray(matLum, 'L')
	i.save(fname)


# Script section: load the articles, reduce their MSC codes and build
# `art2msc`, first as sorted (full MSC list, internal id, reduced MSC tuple)
# triples and finally as (reduced MSC tuple, internal id) pairs.
logging.info("loading articles")
# alternative input, kept for reference:
#arts = docsim.getArts(common.dbFile('serial_eng', '1msc'))
arts = docsim.getArts('/home/radim/workspace/data/dml/results/serial_msc.pdl')

for art in arts:
	# keep a copy of the original codes before art.msc is replaced
	art.fullmsc = art.msc[:]
	# removeDup is defined elsewhere; presumably it drops repeated classes -- confirm
	art.msc = tuple(removeDup([mscs.niceMSC(msc)[0] for msc in art.msc]))
# keep only articles known to ipyutils.rdocids, in English, with a single class
arts = [art for art in arts if art.id_int in ipyutils.rdocids and art.language == "eng" and len(art.msc) == 1]

art2msc = [(art.fullmsc, art.id_int, art.msc) for art in arts]
# tuple ordering: sorted by full MSC list first, then id, then reduced MSC
art2msc.sort()

# the article objects are not needed once art2msc has been built
del arts

logging.info("len(art2msc)=%i" % len(art2msc))
print "first ten art2msc:", art2msc[:10]
print "last ten art2msc:", art2msc[-10:]

# keep the three-field version, then drop fullmsc while preserving the order
art2mscOld = art2msc[:]
art2msc = [(msc, id_int) for fullmsc, id_int, msc in art2msc]

# initialised here; it is not read anywhere in this section
oldMsc = None
def buildMscCentroidMatrix(language):
    """Compute, save and return cosine similarities between MSC class centroids.

    Input articles: ``docsim.getArts(ARTS_FILE, acceptNoBody=False,
    acceptNoMsc=False)``, filtered by ``art.language == language`` unless
    `language` is "any".  A class centroid is the mean of the TFIDF rows of
    the articles carrying that class, where each article counts once per
    distinct ``mscs.niceMSC(msc)[0]`` value.  Entry ``[i, j]`` of the result
    is ``matutils.cossim(centroid_i, centroid_j)``, or 0.0 when that value is
    not finite.  The float32 matrix is written (dense) to
    ``common.matrixFile("mscs_centroid_<language>.mm")`` and returned.

    Raises Exception if the article count does not match the loaded
    dictionaries or the row count of the TFIDF matrix.
    """
    logging.info("building MSC centroid matrix from %s" % ARTS_FILE)
    arts = [
        art
        for art in docsim.getArts(ARTS_FILE, acceptNoBody=False, acceptNoMsc=False)
        if art.language == language or language == "any"
    ]
    prefix = "mscs_serial_%s_" % language
    matFile = common.matrixFile(prefix + "TFIDF_T.mm")
    # NOTE(review): whatever is assigned to `tfidf` in this if/else is
    # overwritten by the "gensim_<language>TFIDF_T.mm" load below without
    # being used.  Left in place because buildTFIDFMatrices may create the
    # dictionaries that loadDicts(prefix) expects -- confirm before removing.
    if os.path.exists(matFile):
        logging.warning(
            "SKIPPING creating TFIDF matrix for %s (file %s present). Is this what you wanted?" % (language, matFile)
        )
        tfidf = matutils.loadMatrix(matFile).tocsr()
    else:
        logging.info("creating TFIDF matrix for %s to %s" % (language, matFile))
        tfidf = docsim.buildTFIDFMatrices(arts, prefix=prefix, saveMatrices=False).tocsr()

    ipyutils.loadDicts(prefix=prefix)
    arts = [
        art for art in arts if art.id_int in ipyutils.rdocids
    ]  # remove articles that had empty body (according to their tfidf vector)
    if len(ipyutils.rdocids) != len(arts):
        logging.error(
            "no. of TFIDF document = %i, but there are %i documents in the database (mismatch)"
            % (len(ipyutils.rdocids), len(arts))
        )
        raise Exception("different size of database/dictionary; version mismatch?")

    cats, rcats = loadMsc2Id(language)  # from buildPure

    logging.info("loading tfidf collection matrix (for centroids)")
    tfidf = matutils.loadMatrix(common.matrixFile("gensim_" + language + "TFIDF_T.mm")).tocsr()
    logging.debug("loaded %ix%i matrix" % tfidf.shape)

    logging.info("computing centroids")
    numCats = len(cats)
    # float/int are what the removed numpy.float/numpy.int aliases stood for
    centroids = numpy.zeros((numCats, tfidf.shape[1]), float)
    num = numpy.zeros((numCats,), int)
    artCnt = 0
    # `arts` was filtered against ipyutils.rdocids above, so every lookup
    # below succeeds and the former in-loop membership test could never fire
    for art in arts:
        artCnt += 1
        artId = ipyutils.rdocids[art.id_int]
        # a set, so that each top-level class is counted once per article
        tops = set(mscs.niceMSC(msc)[0] for msc in art.msc)
        if tops:
            # one dense copy of the article's row, shared by all its classes
            vec = tfidf[artId].toarray().ravel()
            for top in tops:
                mscId = rcats[top]
                centroids[mscId] += vec
                num[mscId] += 1
        if artCnt < 10 or artCnt % 1000 == 0:
            logging.debug(
                "sanity check - article %s has id %i and has mscs=%s, mscsIds=%s"
                % (art.id_int, artId, art.msc, [rcats[mscs.niceMSC(msc)[0]] for msc in art.msc])
            )
    if artCnt != tfidf.shape[0]:
        raise Exception("not all articles used; database/matrix mismatch?")

    for i, vec in enumerate(centroids):
        logging.info("centroid for msc %s (id %i) is an average of %i vectors" % (cats[i], i, num[i]))
        if numpy.sum(numpy.abs(vec)) == 0:
            logging.warning("empty centroid for msc %s (msc int id %i)" % (cats[i], i))
    # a class with num == 0 becomes 0/0 here; its non-finite similarities are
    # mapped to 0.0 in the final loop
    for mscId in cats.iterkeys():
        centroids[mscId] /= num[mscId]
    logging.info(
        "used %i articles for %i vectors (articles may have more than one msc and so can be counted more than once)"
        % (artCnt, sum(num))
    )

    logging.info("computing MSC centroid matrix")
    resultCentroid = numpy.zeros((numCats, numCats), dtype=numpy.float32)
    for idi, cati in cats.iteritems():
        for idj, catj in cats.iteritems():
            sim = matutils.cossim(centroids[idi], centroids[idj])
            if not numpy.isfinite(sim):
                sim = 0.0
            resultCentroid[idi, idj] = sim

    matutils.saveMatrix(resultCentroid, common.matrixFile("mscs_centroid_%s.mm" % language), sparse=False)
    # handed back like buildMscOverlapMatrix does; callers that ignored the
    # previous implicit None keep working
    return resultCentroid
def getArtsWithMsc(mscCode):
    """List the ``id_int`` of all articles in `arts` that belong to `mscCode`.

    `arts` is the module-level article collection.  An article belongs to
    `mscCode` when ``mscs.niceMSC(msc, prefix=len(mscCode))[0]`` equals
    `mscCode` for at least one of its MSC codes.

    The check is made per article.  The earlier implementation went through an
    intermediate dict keyed by ``id_int``, which used the codes of the last
    article for every article sharing that id.
    """
    import mscs
    width = len(mscCode)

    def hasCode(art):
        # stop at the first code that normalises to mscCode
        for msc in art.msc:
            if mscs.niceMSC(msc, prefix = width)[0] == mscCode:
                return True
        return False

    return [art.id_int for art in arts if hasCode(art)]