import logging
import sys
import os.path
import gc

import common
import iddb
import docsim


if __name__ == '__main__':
    logging.basicConfig(level=common.PRINT_LEVEL)
    logging.root.level = common.PRINT_LEVEL
    logging.info("running %s" % ' '.join(sys.argv))

    program = os.path.basename(sys.argv[0])

    # check and process input arguments
    if len(sys.argv) < 2:
        print globals()['__doc__'] % (program)
        sys.exit(1)
    language = sys.argv[1]
    inputs = common.INPUT_PATHS
    prefix = common.PREFIX
    
    # merge databases into one, keeping only articles in the specified language (or 'any' to keep all languages)
    iddb.merge(inputs, prefix, language)
    
    # build and store tfidf matrix
    docsim.buildTFIDFMatrices(dbFile=common.dbFile(prefix, language),
                              prefix=prefix + '_' + language,
                              contentType='alphanum_nohtml',
                              saveMatrices=False)
    
    logging.info("finished running %s" % program)
Example #2
def buildMscCentroidMatrix(language):
    logging.info("building MSC centroid matrix from %s" % ARTS_FILE)
    arts = [
        art for art in docsim.getArts(
            ARTS_FILE, acceptNoBody=False, acceptNoMsc=False)
        if art.language == language or language == 'any'
    ]
    prefix = 'mscs_serial_%s_' % language
    matFile = common.matrixFile(prefix + 'TFIDF_T.mm')
    if os.path.exists(matFile):
        logging.warning(
            'SKIPPING creating TFIDF matrix for %s (file %s present). Is this what you wanted?'
            % (language, matFile))
        tfidf = matutils.loadMatrix(matFile).tocsr()
    else:
        logging.info('creating TFIDF matrix for %s to %s' %
                     (language, matFile))
        tfidf = docsim.buildTFIDFMatrices(arts,
                                          prefix=prefix,
                                          saveMatrices=False).tocsr()

    ipyutils.loadDicts(prefix=prefix)
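    # loadDicts (above) populates ipyutils.rdocids which, judging from its use
    # below, maps database article ids (art.id_int) to TFIDF matrix row indices;
    # articles without a matrix row are filtered out next.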
    arts = [
        art for art in arts if art.id_int in ipyutils.rdocids
    ]  # remove articles that had empty body (according to their tfidf vector)
    if len(ipyutils.rdocids) != len(arts):
        logging.error(
            "no. of TFIDF document = %i, but there are %i documents in the database (mismatch)"
            % (len(ipyutils.rdocids), len(arts)))
        raise Exception(
            "different size of database/dictionary; version mismatch?")

    cats, rcats = loadMsc2Id(language)  # from buildPure
    #    print "mscs:", cats

    logging.info("loading tfidf collection matrix (for centroids)")
    tfidf = matutils.loadMatrix(
        common.matrixFile('gensim_' + language + 'TFIDF_T.mm')).tocsr()
    logging.debug("loaded %ix%i matrix" % tfidf.shape)

    logging.info("computing centroids")
    centroids = numpy.zeros((len(cats), tfidf.shape[1]), numpy.float)
    #    print "centroids.shape =", centroids.shape
    num = numpy.zeros((len(cats), ), numpy.int)
    artCnt = 0
    for art in arts:
        if art.id_int not in ipyutils.rdocids:
            logging.warning("article not found among docids: %s" % art)
            continue
        artCnt += 1
        artId = ipyutils.rdocids[art.id_int]
        tops = [mscs.niceMSC(msc)[0] for msc in art.msc]
        tops = set(
            tops
        )  # only count each top-level once (comment out this line to count e.g. 30H55 and 30.13 twice for this article, as cat. 30)
        for top in tops:
            mscId = rcats[top]
            vec = tfidf[artId].toarray()
            vec.shape = (vec.size, )
            #            print "vec.shape = ", vec.shape
            centroids[mscId] += vec
            num[mscId] += 1
        if artCnt < 10 or artCnt % 1000 == 0:
            logging.debug(
                "sanity check - article %s has id %i and has mscs=%s, mscsIds=%s"
                % (art.id_int, artId, art.msc,
                   [rcats[mscs.niceMSC(msc)[0]] for msc in art.msc]))
    if artCnt != tfidf.shape[0]:
        raise Exception("not all articles used; database/matrix mismatch?")
    for i, vec in enumerate(centroids):
        logging.info(
            "centroid for msc %s (id %i) is an average of %i vectors" %
            (cats[i], i, num[i]))
        if numpy.sum(numpy.abs(vec)) == 0:
            logging.warning("empty centroid for msc %s (msc int id %i)" %
                            (cats[i], i))
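    # Turn the summed vectors into means. A category with no contributing
    # articles (num == 0) divides by zero here and produces a NaN row; the
    # cossim step below maps any resulting non-finite similarity to 0.0.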
    for mscId in cats.iterkeys():
        centroids[mscId] /= num[mscId]
    logging.info(
        "used %i articles for %i vectors (articles may have more than one msc and so can be counted more than once)"
        % (artCnt, sum(num)))

    logging.info("computing MSC centroid matrix")
    resultCentroid = numpy.zeros((len(cats), len(cats)), dtype=numpy.float32)
    for idi, cati in cats.iteritems():
        for idj, catj in cats.iteritems():
            #            print idi, cati, idj, catj
            sim = matutils.cossim(centroids[idi], centroids[idj])
            if numpy.isfinite(sim):
                resultCentroid[idi, idj] = sim
            else:
                resultCentroid[idi, idj] = 0.0

    matutils.saveMatrix(resultCentroid,
                        common.matrixFile("mscs_centroid_%s.mm" % language),
                        sparse=False)
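For context, here is a minimal sketch of how buildMscCentroidMatrix might be driven from the command line, in the same style as the build script in the first example; the argument handling below is an assumption for illustration and is not part of the original source.
if __name__ == '__main__':
    # hypothetical driver (a sketch, not taken from the original project)
    logging.basicConfig(level=common.PRINT_LEVEL)
    logging.root.level = common.PRINT_LEVEL
    if len(sys.argv) < 2:
        # expects the target language as the single argument ('any' keeps all languages)
        sys.exit("usage: %s language" % os.path.basename(sys.argv[0]))
    buildMscCentroidMatrix(sys.argv[1])
    logging.info("finished; centroid matrix saved to %s"
                 % common.matrixFile("mscs_centroid_%s.mm" % sys.argv[1]))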