Example #1
Builds a binary MSC (Mathematics Subject Classification) similarity matrix over the frequent MSC codes found in a document collection; with a fixed MSC hierarchy this similarity is just the identity. The examples below are Python 2 and assume module-level imports of os, logging, numpy and scipy, plus the project modules docsim, mscs, utils_dml, matutils and common, and the module-level ARTS_FILE, createMscsDb and saveMsc2Id.
def buildMscPureMatrix(lang):
    if os.path.exists(ARTS_FILE):
        logging.warning(
            "SKIPPING creating MSC database from meta.xml files (using old %s); is this what you want?"
            % ARTS_FILE)
    else:
        logging.info("creating MSC database from meta.xml files")
        # only run this when the collection changes (test for file existence / delete the file explicitly?)
        createMscsDb()
    arts = [art for art in docsim.getArts(ARTS_FILE, acceptNoBody=True, acceptNoMsc=False)
            if art.language == lang or lang == 'any']

    cats = dict(enumerate(mscs.getFreqMSCs(arts, minFreq=1, useMains=False)))
    rcats = utils_dml.reverseMap(cats)  # inverse mapping: MSC code -> id (not used further in this snippet)
    saveMsc2Id(cats, lang)

    logging.info("building MSC binary matrix")
    # binary MSC similarity (with the fixed MSC hierarchy this is the identity);
    # numpy.eye is equivalent to the original element-by-element double loop
    resultBin = numpy.eye(len(cats), dtype=numpy.float32)
    matutils.saveMatrix(resultBin,
                        common.matrixFile("mscs_bin_%s.mm" % lang),
                        sparse=False)
    return resultBin
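
utils_dml.reverseMap is used throughout these examples but its implementation is not shown. From its uses (inverting the id -> MSC code and token -> id dictionaries) it just swaps keys and values; a minimal sketch, assuming the mapping is one-to-one:

def reverseMap(mapping):
    """Return the inverse of a dict, swapping keys and values.

    Assumes the mapping is injective (no two keys share a value);
    otherwise later keys silently overwrite earlier ones.
    """
    return dict((value, key) for key, value in mapping.items())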
Example #2
Builds the raw bag-of-words count matrix for a collection, then logs which documents and terms came out empty and how dense the matrix is.

    def getBOWMatrix(self):  # FIXME not sparse atm; scipy.sparse is crap
        """return collection as a dense term-by-document matrix (index order is array[term, document] = frequency)"""

        dictionary = self.getDictionary()
        docs = self.getDocs()
        numterms = max(dictionary.values()) + 1
        logging.info("constructing %ix%i BOW matrix" % (numterms, len(docs)))

        # build the matrix, one document (column) at a time
        result = numpy.empty((numterms, len(docs)), dtype=int)
        for i in xrange(len(docs)):
            result[:, i] = docs[i].getFull(length=numterms)

        # print some stats
        reverseDictionary = utils_dml.reverseMap(dictionary)
        docids = self.getDocIds()
        marginal = [numpy.sum(result, axis=ax) for ax in range(2)]  # marginal sums along either axis
        empty = [numpy.compress(marginal[ax] == 0, range(len(marginal[ax]))) for ax in range(2)]  # indices of empty columns/rows
        logging.info("%i empty BOW document vectors: " % len(empty[0]) + str(zip(empty[0], map(docids.__getitem__, empty[0]))))
        logging.info("%i empty BOW term vectors: " % len(empty[1]) + str(zip(empty[1], map(reverseDictionary.get, empty[1]))))
        nonZeroElems = len(result.nonzero()[0])  # count of non-zero entries, i.e. the matrix fill
        logging.info("BOW sparsity: %i/%i = %.3f%%" % (nonZeroElems, result.size, 100.0 * nonZeroElems / result.size))
        return result
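
The FIXME above notes the matrix is not actually sparse. A minimal sketch of the same term-by-document count matrix built with scipy.sparse instead, assuming each document exposes getTokenIds() as in the examples here (COO triplets, then CSC for efficient column slicing):

import scipy.sparse

def buildSparseBOW(docs, numterms):
    """Build a sparse term-by-document count matrix from token-id lists."""
    rows, cols, vals = [], [], []
    for docno, doc in enumerate(docs):
        counts = {}
        for tokenid in doc.getTokenIds():
            counts[tokenid] = counts.get(tokenid, 0) + 1
        for tokenid, freq in counts.items():
            rows.append(tokenid)
            cols.append(docno)
            vals.append(freq)
    # coo_matrix would sum duplicate (row, col) pairs; here each pair is unique
    return scipy.sparse.coo_matrix((vals, (rows, cols)),
                                   shape=(numterms, len(docs)),
                                   dtype=int).tocsc()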
Example #3
Compacts a token dictionary after filtering: ids that no longer occur in the database are dropped, and the surviving ids are renumbered to close the gaps, both in the dictionary and inside every document.

    def rebuildDictionary(self):
        """remove entries which have no corresponding ids in the database, shrink resulting gaps"""
        logging.debug("rebuilding dictionary, shrinking gaps")
        uids = sorted(self.getUniqueTokens())  # unique token ids actually present in the database
        newids = range(len(uids))  # contiguous new ids
        revold = utils_dml.reverseMap(self.getDictionary())  # old id -> token
        idmap = dict(zip(uids, newids))  # old id -> new id
        newdict = {}
        for uid in uids:  # generate the new dictionary
            newdict[revold[uid]] = idmap[uid]
        for doc in self.getDocs():  # update ids in documents to reflect the new dictionary
            doc.setTokenIds([idmap[i] for i in doc.getTokenIds()])
        self.dictionary = newdict
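
A self-contained toy run of the same gap-shrinking remap, with illustrative data rather than the original Collection API, to make the id bookkeeping concrete:

# toy dictionary token -> id; ids 0 and 3 are no longer used by any document
dictionary = {'human': 1, 'interface': 2, 'computer': 4}
usedIds = sorted(dictionary.values())  # [1, 2, 4]
idmap = dict(zip(usedIds, range(len(usedIds))))  # {1: 0, 2: 1, 4: 2}
newdict = dict((token, idmap[oldid]) for token, oldid in dictionary.items())
print(newdict)  # -> {'human': 0, 'interface': 1, 'computer': 2}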
Example #4
Builds the tf*idf matrix for the collection, either as a scipy.sparse CSC matrix or as a dense numpy array. Term counts are normalized per document, then scaled by the inverse document frequency idf(t) = log2(numDocs / docFreq(t)).

    def getTFIDFMatrix(self, sparse=False):
        """construct tf*idf collection matrix"""

        docs = self.getDocs()
        dictionary = self.getDictionary()
        if not dictionary:
            raise Exception("cannot build matrix from an empty collection; chances are something went wrong!")
        numterms = max(dictionary.values()) + 1

        logging.info("constructing %ix%i TFIDF matrix, sparse = %s" % (numterms, len(docs), sparse))

        if sparse:
            result = scipy.sparse.lil_matrix((numterms, len(docs)), dtype=numpy.float32)
            for i in xrange(len(docs)):
                if (i + 1) % 1000 == 0:
                    logging.info(" progress: at vector #%i/%i" % (i, len(docs)))
                vec, total = utils_dml.vect2bow_sorted(docs[i].getTokenIds())
                if total > 0:
                    itotal = 1.0 / total
                    for termid, termfreq in vec:
                        result.rows[termid].append(i)  # vec must be sorted by id in order for this to work!
                        result.data[termid].append(itotal * termfreq)
            nnz = numpy.array([len(result.data[j]) for j in xrange(result.shape[0])])  # document frequency of each term
            idfs = numpy.log((1.0 * result.shape[1]) / nnz) / numpy.log(2)  # log2(numdocs / docfreq)
            idfs[numpy.isinf(idfs)] = 0.0  # HACK replace INFs (terms in no document) with 0.0 - is this ok?
            self.setIDFs(idfs)
            for iterm in xrange(result.shape[0]):  # multiply by IDF
                if (iterm + 1) % 10000 == 0:
                    logging.info(" progress: idf#%i" % iterm)
                result.data[iterm] = [val * idfs[iterm] for val in result.data[iterm]]
            result = result.tocsc()
            logging.info("TFIDF sparsity: %i/%i = %.3f%%" % (result.getnnz(), result.shape[0] * result.shape[1], 100.0 * result.getnnz() / (result.shape[0] * result.shape[1])))

        else:
            result = numpy.empty((numterms, len(docs)), dtype=numpy.float32)
            for i in xrange(len(docs)):
                if (i + 1) % 10000 == 0:
                    logging.debug(" progress: vec#%i" % i)
                vec = docs[i].getFull(numterms)
                sm = 1.0 * numpy.sum(vec)
                if sm == 0:
                    result[:, i] = 0.0  # HACK: don't scale zero vectors (avoids INFs)
                else:
                    result[:, i] = vec / sm

            idfs = self.getIDFs(result)
            self.setIDFs(idfs)

            for iterm in xrange(result.shape[0]):  # multiply by IDF
                if (iterm + 1) % 10000 == 0:
                    logging.debug(" progress: idf#%i" % iterm)
                result[iterm, :] *= idfs[iterm]

            # print some stats
            reverseDictionary = utils_dml.reverseMap(self.getDictionary())
            docids = self.getDocIds()
            marginal = [numpy.sum(result, axis=ax) for ax in range(2)]  # marginal sums along both axes
            empty = [numpy.compress(marginal[ax] == 0, range(len(marginal[ax]))) for ax in range(2)]  # indices of empty columns/rows
            logging.debug("%i empty TFIDF document vectors: " % len(empty[0]) + str(zip(empty[0], map(docids.__getitem__, empty[0]))))
            logging.debug("%i empty TFIDF term vectors: " % len(empty[1]) + str(zip(empty[1], map(reverseDictionary.get, empty[1]))))
            nonZeroElems = len(result.nonzero()[0])
            significantElems = len(result.compress(result.flat >= 1e-3))  # entries at or above 1e-3; the original log mislabeled these as "< 1e-3"
            logging.info("TFIDF sparsity: %i/%i = %.3f%% (%.3f%% >= 1e-3)" % (nonZeroElems, result.size, 100.0 * nonZeroElems / result.size, 100.0 * significantElems / result.size))
        return result
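
getIDFs, called in the dense branch, is not shown in these examples. A minimal sketch consistent with the formula the sparse branch uses, idf(t) = log2(numDocs / docFreq(t)), with infinities from absent terms zeroed out:

import numpy

def getIDFs(bow):
    """Compute per-term IDF weights from a term-by-document matrix.

    idf(t) = log2(numDocs / docFreq(t)); terms occurring in no document
    would give an infinite idf, so those entries are reset to 0.0.
    """
    numdocs = bow.shape[1]
    docfreq = numpy.sum(bow > 0, axis=1).astype(numpy.float64)  # docs containing each term
    with numpy.errstate(divide='ignore'):
        idfs = numpy.log2(numdocs / docfreq)
    idfs[numpy.isinf(idfs)] = 0.0
    return idfs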
Example #5
A demo script exercising the collection pipeline end to end: add documents, build the dictionary and BOW matrix, assemble a stoplist from fixed words plus low-document-frequency ids, filter those ids out, and rebuild the dictionary to close the gaps.

    coll.addDocument(df.createDocument("minors graph eps trees system computer survey user human time interface response.", "full_doc"))
    if not coll.docExists('m1'):
        coll.addDocument(df.createDocument(texts['m1'], 'brekeke'))

    coll.createDictionary()

    for doc in coll.getDocs():
        print "dc1: %s (%i):" % (doc.getId(), len(doc.getTokenIds())), doc.getTokenIds()
    print coll.getDictionary()

    mat = coll.getBOWMatrix()
    dfm = coll.getDocFreqMap()
    stopList = ['a', 'and', 'of', 'the', ':', 'totallyirrelevant']  # fixed stoplist
    stopIds = utils_dml.text2vect(stopList, coll.getDictionary())
    stopIds.extend(coll.freqRange(dfm, 0, 2))  # extend stopIds with ids that have 0 <= document frequency < 2
    print 'stoplist = ', map(utils_dml.reverseMap(coll.getDictionary()).__getitem__, stopIds)

    for doc in coll.getDocs():
        print "before filter: %s (%i):" % (doc.getId(), len(doc.getTokenIds())), zip(doc.getTokenIds(), doc.getTokens())

    coll.filterIds(stopIds)  # remove unwanted tokens

    for doc in coll.getDocs():
        print "after filter, before rebuild: %s (%i):" % (doc.getId(), len(doc.getTokenIds())), zip(doc.getTokenIds(), doc.getTokens())

    coll.rebuildDictionary()
    for doc in coll.getDocs():
        print "after rebuild: %s (%i):" % (doc.getId(), len(doc.getTokenIds())), zip(doc.getTokenIds(), doc.getTokens())

    coll.filterExtremes(1)
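
text2vect and freqRange are project helpers not shown here. Judging from the calls above, text2vect maps tokens to dictionary ids (presumably skipping unknown tokens such as 'totallyirrelevant'), and freqRange collects ids whose document frequency lies in a half-open range; minimal sketches under those assumptions:

def text2vect(tokens, dictionary):
    """Map tokens to their dictionary ids, skipping unknown tokens."""
    return [dictionary[token] for token in tokens if token in dictionary]

def freqRange(docFreqMap, low, high):
    """Return token ids whose document frequency f satisfies low <= f < high."""
    return [tokenid for tokenid, freq in docFreqMap.items() if low <= freq < high]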