def getArts(fname, minFreq):
    """Load English articles from the database file `fname` and return those
    suitable for single-label MSC classification.

    Keeps only articles that (a) are English, (b) have exactly one MSC code
    after truncation to the 2-character top-level prefix, (c) have a
    non-empty body, (d) are present in the gensim docid mapping, and
    (e) belong to an MSC class with at least `minFreq` such articles.
    Article bodies/titles are scrubbed through the BAD_MAP character table.
    """
    db = ArticleDB.ArticleDB(common.dataFile(fname), mode = 'open')
    import ipyutils
    language = 'eng'
    # loads ipyutils.rdocids (internal id -> matrix doc id) for this corpus
    ipyutils.loadDicts(prefix = 'gensim_' + language)
    arts = [Article.Article(rec) for rec in db.db if rec['language'] == language]
    # collapse each article's MSC codes to their unique top-level (prefix-2) form
    for art in arts:
        art.msc = list(set(mscs.niceMSC(msc, prefix = 2)[0] for msc in art.msc))
    logging.info('loaded %i articles from %s' % (len(arts), fname))
    arts = [art for art in arts if len(art.msc)==1 and art.body and art.id_int in ipyutils.rdocids]
    logging.info('extracted %i articles with exactly one MSC and non-empty body' % (len(arts)))
    # MSC classes frequent enough to keep (computed before text scrubbing)
    okmsc = mscs.getFreqMSCs(arts, minFreq, useMains = True)
    mscs.printStats(arts)
    for art in arts:
        # scrub undesirable characters: decode, translate via BAD_MAP, re-encode
        art.body = art.body.decode('utf8')
        # art.body = art.body.decode('ascii', 'ignore')
        art.body = art.body.translate(BAD_MAP).encode('utf8')
        if art.title:
            art.title = art.title.decode('utf8')
            # art.title = art.title.decode('ascii', 'ignore')
            art.title = art.title.translate(BAD_MAP).encode('utf8')
        # normalize the (single) MSC code once more; idempotent after the set() pass above
        art.msc = [mscs.niceMSC(art.msc[0], prefix = 2)[0]]
    # drop articles from rare MSC classes
    arts = [art for art in arts if art.msc[0] in okmsc]
    allmsc, mainmsc = mscs.getMSCCnts(arts)
    for msc, mscarts in allmsc.iteritems():
        logging.info("class %s: %i articles" % (msc, len(mscarts)))
    logging.info("======================")
    logging.info("sum: %i articles" % sum(len(mscarts) for mscarts in allmsc.itervalues()))
    logging.debug('using %i articles from all %s msc classes that are covered by at least %i articles' % (len(arts), sorted(list(okmsc)), minFreq))
    return arts
def buildMscOverlapMatrix(lang):
    """Build, save and return an MSC x MSC co-occurrence similarity matrix.

    Counts, over all articles in `lang` (or all languages if lang == 'any'),
    how often each pair of top-level MSC codes is assigned to the same
    article, then log-scales each row into [0, 1] relative to its row
    maximum. The result is saved to "mscs_overlap_<lang>.mm" (dense) and
    returned as a numpy.float32 array.
    """
    logging.info("building MSC overlap matrix")
    arts = [art for art in docsim.getArts(ARTS_FILE, acceptNoBody=True, acceptNoMsc=False)
            if art.language == lang or lang == 'any']
    cats, rcats = loadMsc2Id(lang)  # from buildPure
    logging.info("computing MSC matrices")
    # raw counts: overlap[i, j] = number of articles tagged with both MSC i and MSC j
    overlap = numpy.zeros((len(cats), len(cats)), dtype=int)
    for art in arts:
        for msc1 in art.msc:
            for msc2 in art.msc:
                overlap[rcats[mscs.niceMSC(msc1)[0]], rcats[mscs.niceMSC(msc2)[0]]] += 1
    # log-scale each row into [0, 1], normalized by the row maximum
    resultOverlap = numpy.zeros((len(cats), len(cats)), dtype=numpy.float32)
    for i in xrange(resultOverlap.shape[0]):
        rowMax = numpy.max(overlap[i])  # renamed from `max`: don't shadow the builtin
        if rowMax == 0:
            # BUGFIX: a category with no articles would divide by zero and
            # fill the row with nan/inf; an all-zero row is the correct result.
            continue
        for j in xrange(resultOverlap.shape[1]):
            resultOverlap[i, j] = math.log(1.0 + 100.0 * overlap[i, j] / rowMax) / math.log(101)
    matutils.saveMatrix(resultOverlap, common.matrixFile("mscs_overlap_%s.mm" % lang), sparse=False)
    return resultOverlap
def buildMscOverlapMatrix(lang):
    """Compute and persist the MSC co-occurrence matrix for language `lang`
    ('any' accepts every language). Each row of the returned float32 matrix
    holds log-scaled co-assignment counts, normalized by the row maximum,
    and is also written to "mscs_overlap_<lang>.mm" as a dense matrix."""
    logging.info("building MSC overlap matrix")
    candidates = docsim.getArts(ARTS_FILE, acceptNoBody=True, acceptNoMsc=False)
    arts = [a for a in candidates if lang in ("any", a.language)]
    cats, rcats = loadMsc2Id(lang)  # from buildPure
    logging.info("computing MSC matrices")
    numCats = len(cats)
    # binary msc similarity (with fixed msc hierarchy = identity)
    counts = numpy.zeros((numCats, numCats), dtype=int)
    for a in arts:
        catIds = [rcats[mscs.niceMSC(code)[0]] for code in a.msc]
        for ci in catIds:
            for cj in catIds:
                counts[ci, cj] += 1
    result = numpy.zeros((numCats, numCats), dtype=numpy.float32)
    denom = math.log(101)  # scaling constant, hoisted out of the loop
    for row in xrange(numCats):
        peak = numpy.max(counts[row])
        for col in xrange(numCats):
            result[row, col] = math.log(1.0 + 100.0 * counts[row, col] / peak) / denom
    matutils.saveMatrix(result, common.matrixFile("mscs_overlap_%s.mm" % lang), sparse=False)
    return result
def buildDocDoc(arts, type, language):
    """Build and save a symmetric document x document similarity matrix.

    For each pair of documents, the similarity is the maximum entry of the
    MSC x MSC matrix "mscs_<type>.mm" over all combinations of the two
    documents' MSC category ids. The result is saved to
    "docdoc_<language>_<type>.mm" (dense) and returned as numpy.float32.
    """
    ipyutils.loadDicts(prefix="gensim_" + language)
    # keep only articles known to the docid mapping; the two collections must
    # then match exactly, otherwise the matrix row indices would be wrong
    arts = [art for art in arts if art.id_int in ipyutils.rdocids]
    assert len(arts) == len(ipyutils.rdocids)
    logging.info("loading msc<->id mapping")
    cats, rcats = loadMsc2Id(language)
    mscsFile = common.matrixFile("mscs_%s.mm" % type)
    matMsc = matutils.loadMatrix(mscsFile)
    # map matrix row id -> list of integer MSC category ids for that document
    mscDict = {}
    for art in arts:
        artId = ipyutils.rdocids[art.id_int]
        mscIds = [rcats[mscs.niceMSC(msc)[0]] for msc in art.msc]
        mscDict[artId] = mscIds
    logging.info("computing doc*doc similarity matrix based on %s" % mscsFile)
    docdoc = numpy.zeros((len(arts), len(arts)), numpy.float32)
    for i in xrange(len(arts)):
        if i % 100 == 0:
            logging.info("PROGRESS: %i/%i" % (i, len(arts)))
        art1Id = ipyutils.rdocids[arts[i].id_int]
        for j in xrange(i, len(arts)):  # j >= i: both triangles filled at once below
            art2Id = ipyutils.rdocids[arts[j].id_int]
            # best (maximum) MSC-to-MSC similarity over all code pairs; 0.0 floor
            bestScore = 0.0
            for msc1Id in mscDict[art1Id]:
                for msc2Id in mscDict[art2Id]:
                    bestScore = max(bestScore, matMsc[msc1Id, msc2Id])
            docdoc[art1Id, art2Id] = docdoc[art2Id, art1Id] = bestScore
    matutils.saveMatrix(docdoc, common.matrixFile("docdoc_" + language + "_%s.mm" % type), sparse=False)
    return docdoc
def getArtsWithMsc(mscCode):
    """Return the internal ids of all articles (from the global `arts`) whose
    MSC codes, truncated to len(mscCode) characters, include `mscCode`."""
    import mscs
    width = len(mscCode)
    # internal id -> this article's MSC codes truncated to the query's length
    truncated = {}
    for a in arts:
        truncated[a.id_int] = [mscs.niceMSC(code, prefix=width)[0] for code in a.msc]
    return [a.id_int for a in arts if mscCode in truncated[a.id_int]]
def getArts(fname, minFreq):
    """Load English articles from database `fname` and keep those usable for
    single-label MSC classification: exactly one top-level (prefix-2) MSC
    code, a non-empty body, a known doc id, and an MSC class containing at
    least `minFreq` such articles. Bodies and titles are sanitized through
    the BAD_MAP character table. Returns the filtered article list."""
    import ipyutils
    database = ArticleDB.ArticleDB(common.dataFile(fname), mode='open')
    language = 'eng'
    ipyutils.loadDicts(prefix='gensim_' + language)
    arts = []
    for rec in database.db:
        if rec['language'] == language:
            arts.append(Article.Article(rec))
    # collapse every article's MSC codes to their unique prefix-2 form
    for art in arts:
        unique = set()
        for code in art.msc:
            unique.add(mscs.niceMSC(code, prefix=2)[0])
        art.msc = list(unique)
    logging.info('loaded %i articles from %s' % (len(arts), fname))
    def acceptable(a):
        # exactly one MSC class, non-empty body, known to the docid mapping
        return len(a.msc) == 1 and a.body and a.id_int in ipyutils.rdocids
    arts = [a for a in arts if acceptable(a)]
    logging.info('extracted %i articles with exactly one MSC and non-empty body' % (len(arts)))
    okmsc = mscs.getFreqMSCs(arts, minFreq, useMains=True)
    mscs.printStats(arts)
    for art in arts:
        # scrub undesirable characters from the body (and title, when present)
        art.body = art.body.decode('utf8').translate(BAD_MAP).encode('utf8')
        if art.title:
            art.title = art.title.decode('utf8').translate(BAD_MAP).encode('utf8')
        art.msc = [mscs.niceMSC(art.msc[0], prefix=2)[0]]
    arts = [art for art in arts if art.msc[0] in okmsc]
    allmsc, mainmsc = mscs.getMSCCnts(arts)
    for msc, mscarts in allmsc.iteritems():
        logging.info("class %s: %i articles" % (msc, len(mscarts)))
    logging.info("======================")
    total = sum(len(group) for group in allmsc.itervalues())
    logging.info("sum: %i articles" % total)
    logging.debug('using %i articles from all %s msc classes that are covered by at least %i articles' % (len(arts), sorted(list(okmsc)), minFreq))
    return arts
def toNLTK(_mat, prefix=2):
    """Convert a term x document matrix into NLTK training pairs.

    For each column (document) of `_mat`, builds a feature dict of
    {term index: value} over the non-zero entries and pairs it with the
    document's first MSC code truncated to `prefix` characters. Uses the
    module-level `docids` and `arts` globals to map columns to articles.
    Returns a list of (features, label) tuples.

    Raises Exception if a column's docid does not match exactly one article,
    or if that article has no MSC codes.
    """
    import mscs
    if scipy.sparse.issparse(_mat):
        mat = _mat.tocsc()  # CSC makes the per-column slicing below efficient
    else:
        mat = _mat
    result = []
    for i in xrange(mat.shape[1]):
        if scipy.sparse.issparse(mat):
            a0 = mat[:, i].toarray()
        else:
            a0 = mat[:, i]
        nnind = a0.nonzero()[0]
        nnvals = a0.take(nnind)
        features = dict(zip(nnind, nnvals))
        docId = docids[i]  # renamed from `id`: don't shadow the builtin
        okarts = [a for a in arts if a.id_int == docId]
        if len(okarts) != 1:
            # BUGFIX: the format string has two placeholders but was given only
            # repr(id), which made the raise itself fail with a TypeError
            raise Exception('%i articles with id=%s' % (len(okarts), repr(docId)))
        labels = okarts[0].msc
        if len(labels) < 1:
            raise Exception('no msc for %s' % docId)
        result.append((features, mscs.niceMSC(labels[0], prefix=prefix)[0]))
    return result
def buildDocDoc(arts, type, language):
    """Compute, persist and return the symmetric document-by-document
    similarity matrix derived from the MSC matrix "mscs_<type>.mm": each
    document pair scores the best similarity found among all combinations
    of their MSC categories (floored at 0.0). Saved as
    "docdoc_<language>_<type>.mm", dense, dtype float32."""
    ipyutils.loadDicts(prefix='gensim_' + language)
    arts = [a for a in arts if a.id_int in ipyutils.rdocids]
    assert (len(arts) == len(ipyutils.rdocids))
    logging.info("loading msc<->id mapping")
    cats, rcats = loadMsc2Id(language)
    mscsFile = common.matrixFile("mscs_%s.mm" % type)
    matMsc = matutils.loadMatrix(mscsFile)
    # matrix row id -> integer MSC category ids of that document
    docMscIds = {}
    for a in arts:
        rowId = ipyutils.rdocids[a.id_int]
        docMscIds[rowId] = [rcats[mscs.niceMSC(code)[0]] for code in a.msc]
    logging.info("computing doc*doc similarity matrix based on %s" % mscsFile)
    n = len(arts)
    docdoc = numpy.zeros((n, n), numpy.float32)
    for i in xrange(n):
        if i % 100 == 0:
            logging.info("PROGRESS: %i/%i" % (i, n))
        rowI = ipyutils.rdocids[arts[i].id_int]
        for j in xrange(i, n):
            rowJ = ipyutils.rdocids[arts[j].id_int]
            pairScores = [matMsc[m1, m2] for m1 in docMscIds[rowI] for m2 in docMscIds[rowJ]]
            best = max([0.0] + pairScores)  # 0.0 floor, as before
            docdoc[rowI, rowJ] = docdoc[rowJ, rowI] = best
    matutils.saveMatrix(docdoc, common.matrixFile("docdoc_" + language + "_%s.mm" % type), sparse=False)
    return docdoc
def toNLTK(_mat, prefix = 2):
    """Turn a term x document matrix into (features, label) pairs for NLTK.

    Each matrix column yields a sparse feature dict {term index: value}
    labelled with the corresponding article's first MSC code truncated to
    `prefix` characters; column-to-article resolution goes through the
    module-level `docids` and `arts`.

    Raises Exception when a docid matches zero or multiple articles, or
    when the matched article carries no MSC codes.
    """
    import mscs
    if scipy.sparse.issparse(_mat):
        mat = _mat.tocsc()  # column slices are cheap on CSC
    else:
        mat = _mat
    result = []
    for i in xrange(mat.shape[1]):
        if scipy.sparse.issparse(mat):
            a0 = mat[:, i].toarray()
        else:
            a0 = mat[:, i]
        nnind = a0.nonzero()[0]
        nnvals = a0.take(nnind)
        features = dict(zip(nnind, nnvals))
        artId = docids[i]  # renamed from `id` to avoid shadowing the builtin
        okarts = [a for a in arts if a.id_int == artId]
        if len(okarts) != 1:
            # BUGFIX: '%i ... %s' needs two values; the original passed only
            # repr(id) and the raise died with a TypeError instead
            raise Exception('%i articles with id=%s' % (len(okarts), repr(artId)))
        labels = okarts[0].msc
        if len(labels) < 1:
            raise Exception('no msc for %s' % artId)
        result.append((features, mscs.niceMSC(labels[0], prefix = prefix)[0]))
    return result
# Render the current similarity matrix `mat` (from earlier in the script) as a
# grayscale PNG named after the similarity type, luminance scale and msc id.
matLum = getLum(mat).astype(numpy.uint8)  # PIL mode 'L' needs 8-bit values
fname = "%s_sim_lum%i_%s.png" % (SIM_TYPE, LUM_SCALE, id)
logging.debug("saving msc=%s similarity luminescence matrix to %s" % (id, fname))
logging.info("saving to %s" % fname)
i = Image.fromarray(matLum, 'L')  # 'L' = single-channel 8-bit grayscale
i.save(fname)
# Load articles and build a sorted (full msc list, internal id, top-level msc)
# list for the single-MSC English subset.
logging.info("loading articles")
#arts = docsim.getArts(common.dbFile('serial_eng', '1msc'))
arts = docsim.getArts('/home/radim/workspace/data/dml/results/serial_msc.pdl')
for art in arts:
    art.fullmsc = art.msc[:]  # keep a copy of the original codes before truncation
    # reduce codes to their top-level class, dropping duplicates but keeping order
    art.msc = tuple(removeDup([mscs.niceMSC(msc)[0] for msc in art.msc]))
# keep only English articles with a known doc id and exactly one top-level MSC
arts = [art for art in arts if art.id_int in ipyutils.rdocids and art.language == "eng" and len(art.msc) == 1]
art2msc = [(art.fullmsc, art.id_int, art.msc) for art in arts]
art2msc.sort()
del arts  # no longer needed; only art2msc is used from here on
logging.info("len(art2msc)=%i" % len(art2msc))
print "first ten art2msc:", art2msc[:10]
print "last ten art2msc:", art2msc[-10:]
art2mscOld = art2msc[:]  # snapshot before any further transformation
def buildMscCentroidMatrix(language):
    """Build and save an MSC x MSC similarity matrix of TFIDF centroids.

    Averages the TFIDF vectors of all articles per top-level MSC category
    into one centroid per category, then fills a matrix with the pairwise
    cosine similarities of those centroids (non-finite similarities become
    0.0). The result is saved to "mscs_centroid_<language>.mm" (dense).

    Raises Exception on a database/dictionary size mismatch or when not
    every TFIDF row was consumed.
    """
    logging.info("building MSC centroid matrix from %s" % ARTS_FILE)
    arts = [art for art in docsim.getArts(ARTS_FILE, acceptNoBody=False, acceptNoMsc=False)
            if art.language == language or language == 'any']
    prefix = 'mscs_serial_%s_' % language
    matFile = common.matrixFile(prefix + 'TFIDF_T.mm')
    # reuse a previously saved TFIDF matrix when present, otherwise build it
    if os.path.exists(matFile):
        logging.warning('SKIPPING creating TFIDF matrix for %s (file %s present). Is this what you wanted?' % (language, matFile))
        tfidf = matutils.loadMatrix(matFile).tocsr()
    else:
        logging.info('creating TFIDF matrix for %s to %s' % (language, matFile))
        tfidf = docsim.buildTFIDFMatrices(arts, prefix=prefix, saveMatrices=False).tocsr()
    ipyutils.loadDicts(prefix=prefix)
    arts = [art for art in arts if art.id_int in ipyutils.rdocids]  # remove articles that had empty body (according to their tfidf vector)
    if len(ipyutils.rdocids) != len(arts):
        logging.error("no. of TFIDF document = %i, but there are %i documents in the database (mismatch)" % (len(ipyutils.rdocids), len(arts)))
        raise Exception("different size of database/dictionary; version mismatch?")
    cats, rcats = loadMsc2Id(language)  # from buildPure
    # print "mscs:", cats
    logging.info("loading tfidf collection matrix (for centroids)")
    # NOTE(review): this load overwrites the tfidf matrix prepared above with
    # the 'gensim_<language>' one — confirm this is intentional
    tfidf = matutils.loadMatrix(common.matrixFile('gensim_' + language + 'TFIDF_T.mm')).tocsr()
    logging.debug("loaded %ix%i matrix" % tfidf.shape)
    logging.info("computing centroids")
    # centroids: one summed (later averaged) TFIDF vector per MSC category;
    # num: how many article vectors were added into each centroid
    centroids = numpy.zeros((len(cats), tfidf.shape[1]), numpy.float)
    # print "centroids.shape =", centroids.shape
    num = numpy.zeros((len(cats), ), numpy.int)
    artCnt = 0
    for art in arts:
        if not art.id_int in ipyutils.rdocids:
            logging.warning("article not found among docids: %s" % art)
            continue
        artCnt += 1
        artId = ipyutils.rdocids[art.id_int]
        tops = [mscs.niceMSC(msc)[0] for msc in art.msc]
        tops = set(tops)  # only count each top-level once (comment out this line to count e.g. 30H55 and 30.13 twice for this article, as cat. 30)
        for top in tops:
            mscId = rcats[top]
            vec = tfidf[artId].toarray()
            vec.shape = (vec.size, )  # flatten the 1 x terms row to a 1-D vector
            # print "vec.shape = ", vec.shape
            centroids[mscId] += vec
            num[mscId] += 1
        if artCnt < 10 or artCnt % 1000 == 0:
            logging.debug("sanity check - article %s has id %i and has mscs=%s, mscsIds=%s" % (art.id_int, artId, art.msc, [rcats[mscs.niceMSC(msc)[0]] for msc in art.msc]))
    if not artCnt == tfidf.shape[0]:
        raise Exception("not all articles used; database/matrix mismatch?")
    for i, vec in enumerate(centroids):
        logging.info("centroid for msc %s (id %i) is an average of %i vectors" % (cats[i], i, num[i]))
        if numpy.sum(numpy.abs(vec)) == 0:
            logging.warning("empty centroid for msc %s (msc int id %i)" % (cats[i], i))
    # turn sums into averages
    # NOTE(review): a category with num == 0 divides by zero here (nan row);
    # the warning above flags it but does not prevent the division — confirm
    for mscId in cats.iterkeys():
        centroids[mscId] /= num[mscId]
    logging.info("used %i articles for %i vectors (articles may have more than one msc and so can be counted more than once)" % (artCnt, sum(num)))
    logging.info("computing MSC centroid matrix")
    resultCentroid = numpy.zeros((len(cats), len(cats)), dtype=numpy.float32)
    for idi, cati in cats.iteritems():
        for idj, catj in cats.iteritems():
            # print idi, cati, idj, catj
            sim = matutils.cossim(centroids[idi], centroids[idj])
            if numpy.isfinite(sim):
                resultCentroid[idi, idj] = sim
            else:
                resultCentroid[idi, idj] = 0.0  # e.g. cosine against an empty centroid
    matutils.saveMatrix(resultCentroid, common.matrixFile("mscs_centroid_%s.mm" % language), sparse=False)
print mat
# Render the current similarity matrix `mat` (from earlier in the script) as a
# grayscale PNG named after the similarity type, luminance scale and msc id.
matLum = getLum(mat).astype(numpy.uint8)  # PIL mode 'L' needs 8-bit values
fname = "%s_sim_lum%i_%s.png" % (SIM_TYPE, LUM_SCALE, id)
logging.debug("saving msc=%s similarity luminescence matrix to %s" % (id, fname))
logging.info("saving to %s" % fname)
i = Image.fromarray(matLum, 'L')  # 'L' = single-channel 8-bit grayscale
i.save(fname)
# Load articles and build a sorted (full msc list, internal id, top-level msc)
# list for the single-MSC English subset.
logging.info("loading articles")
#arts = docsim.getArts(common.dbFile('serial_eng', '1msc'))
arts = docsim.getArts('/home/radim/workspace/data/dml/results/serial_msc.pdl')
for art in arts:
    art.fullmsc = art.msc[:]  # keep a copy of the original codes before truncation
    # reduce codes to their top-level class, dropping duplicates but keeping order
    art.msc = tuple(removeDup([mscs.niceMSC(msc)[0] for msc in art.msc]))
# keep only English articles with a known doc id and exactly one top-level MSC
arts = [art for art in arts if art.id_int in ipyutils.rdocids and art.language == "eng" and len(art.msc) == 1]
art2msc = [(art.fullmsc, art.id_int, art.msc) for art in arts]
art2msc.sort()
del arts  # no longer needed; only art2msc is used from here on
logging.info("len(art2msc)=%i" % len(art2msc))
print "first ten art2msc:", art2msc[:10]
print "last ten art2msc:", art2msc[-10:]
art2mscOld = art2msc[:]  # snapshot before the reshaping below
# drop the fullmsc component, keeping (top-level msc, internal id) pairs
art2msc = [(msc, id_int) for fullmsc, id_int, msc in art2msc]
oldMsc = None  # sentinel for the (not shown) grouping loop that follows
def buildMscCentroidMatrix(language):
    """Build and save an MSC x MSC similarity matrix of TFIDF centroids.

    Averages the TFIDF vectors of all articles belonging to each top-level
    MSC category into a per-category centroid, then records the pairwise
    cosine similarity of every centroid pair (non-finite values stored as
    0.0) into "mscs_centroid_<language>.mm" (dense).

    Raises Exception on a database/dictionary size mismatch or when not
    every TFIDF row was consumed.
    """
    logging.info("building MSC centroid matrix from %s" % ARTS_FILE)
    arts = [art for art in docsim.getArts(ARTS_FILE, acceptNoBody=False, acceptNoMsc=False)
            if art.language == language or language == "any"]
    prefix = "mscs_serial_%s_" % language
    matFile = common.matrixFile(prefix + "TFIDF_T.mm")
    # reuse a previously saved TFIDF matrix when present, otherwise build it
    if os.path.exists(matFile):
        logging.warning("SKIPPING creating TFIDF matrix for %s (file %s present). Is this what you wanted?" % (language, matFile))
        tfidf = matutils.loadMatrix(matFile).tocsr()
    else:
        logging.info("creating TFIDF matrix for %s to %s" % (language, matFile))
        tfidf = docsim.buildTFIDFMatrices(arts, prefix=prefix, saveMatrices=False).tocsr()
    ipyutils.loadDicts(prefix=prefix)
    arts = [art for art in arts if art.id_int in ipyutils.rdocids]  # remove articles that had empty body (according to their tfidf vector)
    if len(ipyutils.rdocids) != len(arts):
        logging.error("no. of TFIDF document = %i, but there are %i documents in the database (mismatch)" % (len(ipyutils.rdocids), len(arts)))
        raise Exception("different size of database/dictionary; version mismatch?")
    cats, rcats = loadMsc2Id(language)  # from buildPure
    # print "mscs:", cats
    logging.info("loading tfidf collection matrix (for centroids)")
    # NOTE(review): this load replaces the tfidf matrix prepared above with the
    # "gensim_<language>" one — confirm this is intentional
    tfidf = matutils.loadMatrix(common.matrixFile("gensim_" + language + "TFIDF_T.mm")).tocsr()
    logging.debug("loaded %ix%i matrix" % tfidf.shape)
    logging.info("computing centroids")
    # centroids: per-category summed (later averaged) TFIDF vector;
    # num: number of article vectors accumulated into each centroid
    centroids = numpy.zeros((len(cats), tfidf.shape[1]), numpy.float)
    # print "centroids.shape =", centroids.shape
    num = numpy.zeros((len(cats),), numpy.int)
    artCnt = 0
    for art in arts:
        if not art.id_int in ipyutils.rdocids:
            logging.warning("article not found among docids: %s" % art)
            continue
        artCnt += 1
        artId = ipyutils.rdocids[art.id_int]
        tops = [mscs.niceMSC(msc)[0] for msc in art.msc]
        tops = set(tops)  # only count each top-level once (comment out this line to count e.g. 30H55 and 30.13 twice for this article, as cat. 30)
        for top in tops:
            mscId = rcats[top]
            vec = tfidf[artId].toarray()
            vec.shape = (vec.size,)  # flatten the 1 x terms row to a 1-D vector
            # print "vec.shape = ", vec.shape
            centroids[mscId] += vec
            num[mscId] += 1
        if artCnt < 10 or artCnt % 1000 == 0:
            logging.debug("sanity check - article %s has id %i and has mscs=%s, mscsIds=%s" % (art.id_int, artId, art.msc, [rcats[mscs.niceMSC(msc)[0]] for msc in art.msc]))
    if not artCnt == tfidf.shape[0]:
        raise Exception("not all articles used; database/matrix mismatch?")
    for i, vec in enumerate(centroids):
        logging.info("centroid for msc %s (id %i) is an average of %i vectors" % (cats[i], i, num[i]))
        if numpy.sum(numpy.abs(vec)) == 0:
            logging.warning("empty centroid for msc %s (msc int id %i)" % (cats[i], i))
    # turn sums into averages
    # NOTE(review): categories with num == 0 divide by zero here (nan row);
    # only warned about above, not prevented — confirm acceptable
    for mscId in cats.iterkeys():
        centroids[mscId] /= num[mscId]
    logging.info("used %i articles for %i vectors (articles may have more than one msc and so can be counted more than once)" % (artCnt, sum(num)))
    logging.info("computing MSC centroid matrix")
    resultCentroid = numpy.zeros((len(cats), len(cats)), dtype=numpy.float32)
    for idi, cati in cats.iteritems():
        for idj, catj in cats.iteritems():
            # print idi, cati, idj, catj
            sim = matutils.cossim(centroids[idi], centroids[idj])
            if numpy.isfinite(sim):
                resultCentroid[idi, idj] = sim
            else:
                resultCentroid[idi, idj] = 0.0  # e.g. cosine against an empty centroid
    matutils.saveMatrix(resultCentroid, common.matrixFile("mscs_centroid_%s.mm" % language), sparse=False)
def getArtsWithMsc(mscCode):
    """Return the internal ids of all articles (from the global `arts`) whose
    MSC codes, truncated to len(mscCode) characters, include `mscCode`."""
    import mscs
    prefix = len(mscCode)  # hoisted loop invariant
    # test membership directly per article instead of first materializing an
    # id -> truncated-mscs dict over the whole collection (same result, no
    # intermediate dict, and any() stops at the first match)
    return [art.id_int for art in arts
            if any(mscs.niceMSC(msc, prefix=prefix)[0] == mscCode for msc in art.msc)]