def buildMscPureMatrix(lang):
    if os.path.exists(ARTS_FILE):
        logging.warning("SKIPPING creating MSC database from meta.xml files (using old %s); is this what you want?" % ARTS_FILE)
    else:
        logging.info("creating MSC database from meta.xml files")
        createMscsDb()  # only run this when collection changes (test for file existence / delete file explicitly?)
    arts = [art for art in docsim.getArts(ARTS_FILE, acceptNoBody=True, acceptNoMsc=False)
            if art.language == lang or lang == 'any']
    cats = dict(enumerate(mscs.getFreqMSCs(arts, minFreq=1, useMains=False)))
    rcats = utils_dml.reverseMap(cats)
    saveMsc2Id(cats, lang)

    logging.info("building MSC binary matrix")
    resultBin = numpy.zeros((len(cats), len(cats)), dtype=numpy.float32)  # binary msc similarity (with fixed msc hierarchy = identity)
    for idi, cati in cats.iteritems():
        for idj, catj in cats.iteritems():
            # print idi, cati, idj, catj
            if idi == idj:
                resultBin[idi, idj] = 1.0
            else:
                resultBin[idi, idj] = 0.0
    matutils.saveMatrix(resultBin, common.matrixFile("mscs_bin_%s.mm" % lang), sparse=False)
    return resultBin
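def _demoMscBinEquivalence():
    # Minimal sketch, not part of the original pipeline: `_demoMscBinEquivalence`
    # is a hypothetical helper added for illustration only. It shows that,
    # because the fixed MSC hierarchy reduces to identity, the double loop in
    # buildMscPureMatrix produces the same binary matrix as a single numpy.eye call.
    cats = dict(enumerate(['03B05', '03B10', '68T50']))  # toy id -> MSC code map
    loop = numpy.zeros((len(cats), len(cats)), dtype=numpy.float32)
    for idi in cats:
        for idj in cats:
            loop[idi, idj] = 1.0 if idi == idj else 0.0
    assert numpy.allclose(loop, numpy.eye(len(cats), dtype=numpy.float32))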
def getBOWMatrix(self):  # FIXME not sparse atm; scipy.sparse is crap
    """return collection as dense term-by-document matrix (index order is array[term,document]=frequency)"""
    dictionary = self.getDictionary()
    docs = self.getDocs()
    numterms = max(dictionary.values()) + 1
    logging.info("constructing %ix%i BOW matrix" % (numterms, len(docs)))

    # build the matrix
    result = numpy.empty((numterms, len(docs)), dtype=int)
    for i in xrange(len(docs)):
        result[:, i] = docs[i].getFull(length=numterms)
##    result = numpy.column_stack([doc.getFull() for doc in self.getDocs()])

    # print some stats
    reverseDictionary = utils_dml.reverseMap(dictionary)
    docids = self.getDocIds()
    marginal = [numpy.sum(result, axis=ax) for ax in range(2)]  # marginal sums along either axis
    empty = [numpy.compress(marginal[ax] == 0, range(len(marginal[ax]))) for ax in range(2)]  # indices of empty columns/rows
    logging.info("%i empty BOW document vectors: " % len(empty[0]) + str(zip(empty[0], map(docids.__getitem__, empty[0]))))
    logging.info("%i empty BOW term vectors: " % len(empty[1]) + str(zip(empty[1], map(reverseDictionary.get, empty[1]))))
    nonzeroElems = len(result.nonzero()[0])
    logging.info("BOW sparsity: %i/%i = %.3f%%" % (nonzeroElems, result.size, 100.0 * nonzeroElems / result.size))
    return result
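def _demoEmptyVectors():
    # Hedged sketch (`_demoEmptyVectors` is a hypothetical helper, not part of
    # the collection API): illustrates how the stats code in getBOWMatrix finds
    # empty rows/columns via numpy.compress on the marginal sums of a toy
    # term-by-document matrix.
    result = numpy.array([[0, 2, 0],
                          [0, 0, 0],   # term 1 occurs in no document -> empty term vector
                          [0, 3, 1]])
    marginal = [numpy.sum(result, axis=ax) for ax in range(2)]
    empty = [numpy.compress(marginal[ax] == 0, range(len(marginal[ax]))) for ax in range(2)]
    # axis=0 sums over terms -> per-document totals; document 0 is empty
    assert list(empty[0]) == [0]
    # axis=1 sums over documents -> per-term totals; term 1 is empty
    assert list(empty[1]) == [1]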
def rebuildDictionary(self):
    """remove entries which have no corresponding ids in the database, shrink resulting gaps"""
    logging.debug("rebuilding dictionary, shrinking gaps")
    uids = sorted(self.getUniqueTokens())  # get unique ids present in the database
    newids = range(len(uids))  # new ids
    revold = utils_dml.reverseMap(self.getDictionary())
    idmap = dict(zip(uids, newids))
    newdict = {}
    for uid in uids:  # generate new dictionary
        newdict[revold[uid]] = idmap[uid]
    for doc in self.getDocs():  # update ids in documents to reflect the new dictionary
        doc.setTokenIds([idmap[i] for i in doc.getTokenIds()])
    self.dictionary = newdict
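def _demoShrinkGaps():
    # Hedged sketch with a toy dictionary (`_demoShrinkGaps` is a hypothetical
    # helper added for illustration): shows the renumbering performed by
    # rebuildDictionary -- ids surviving a filter ({0, 3, 7} here) are
    # compacted into the contiguous range 0..2.
    olddict = {'human': 0, 'graph': 3, 'trees': 7}  # token -> id, with gaps
    uids = sorted(olddict.values())
    idmap = dict(zip(uids, range(len(uids))))               # old id -> new id
    revold = dict((v, k) for k, v in olddict.iteritems())   # id -> token
    newdict = dict((revold[uid], idmap[uid]) for uid in uids)
    assert newdict == {'human': 0, 'graph': 1, 'trees': 2}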
def getTFIDFMatrix(self, sparse=False):
    """construct tf*idf collection matrix"""
    docs = self.getDocs()
    dictionary = self.getDictionary()
    if not dictionary:
        raise Exception("cannot build matrix from an empty collection; chances are something went wrong!")
    numterms = max(dictionary.values()) + 1
    logging.info("constructing %ix%i TFIDF matrix, sparse = %s" % (numterms, len(docs), sparse))
    if sparse:
        result = scipy.sparse.lil_matrix((numterms, len(docs)), dtype=numpy.float32)
        for i in xrange(len(docs)):
            if (i + 1) % 1000 == 0:
                logging.info(" progress: at vector #%i/%i" % (i, len(docs)))
            vec, total = utils_dml.vect2bow_sorted(docs[i].getTokenIds())
            if total > 0:
                itotal = 1.0 / total
                for id, idfreq in vec:
                    result.rows[id].append(i)  # vec must be sorted by id in order for this to work!
                    result.data[id].append(itotal * idfreq)
        nnz = numpy.array([len(result.data[j]) for j in xrange(result.shape[0])])
        idfs = numpy.log((1.0 * result.shape[1]) / nnz) / numpy.log(2)
        idfs[numpy.isinf(idfs)] = 0.0  # HAX replace INFs with 0.0 - is this ok?
        self.setIDFs(idfs)
        for iterm in xrange(result.shape[0]):  # multiply by IDF
            if (iterm + 1) % 10000 == 0:
                logging.info(" progress: idf#%i" % iterm)
            result.data[iterm] = [val * idfs[iterm] for val in result.data[iterm]]
        result = result.tocsc()
        logging.info("TFIDF sparsity: %i/%i = %.3f%%" %
                     (result.getnnz(), result.shape[0] * result.shape[1],
                      100.0 * result.getnnz() / (result.shape[0] * result.shape[1])))
    else:
        result = numpy.empty((numterms, len(docs)), dtype=numpy.float32)
        for i in xrange(len(docs)):
            if (i + 1) % 10000 == 0:
                logging.debug(" progress: vec#%i" % i)
            vec = docs[i].getFull(numterms)
            sm = 1.0 * numpy.sum(vec)
            if sm == 0:
                result[:, i] = 0.0  # HACK: don't scale zero vectors (INF bad?)
            else:
                result[:, i] = vec / sm
        idfs = self.getIDFs(result)
        self.setIDFs(idfs)
        for iterm in xrange(result.shape[0]):  # multiply by IDF
            if (iterm + 1) % 10000 == 0:
                logging.debug(" progress: idf#%i" % iterm)
            result[iterm, :] *= idfs[iterm]

        # print some stats (dense branch only; result.flat/compress need a dense array)
        reverseDictionary = utils_dml.reverseMap(self.getDictionary())
        docids = self.getDocIds()
        marginal = [numpy.sum(result, axis=ax) for ax in range(2)]  # marginal sums along both axes
        empty = [numpy.compress(marginal[ax] == 0, range(len(marginal[ax]))) for ax in range(2)]  # indices of empty columns/rows
        logging.debug("%i empty TFIDF document vectors: " % len(empty[0]) + str(zip(empty[0], map(docids.__getitem__, empty[0]))))
        logging.debug("%i empty TFIDF term vectors: " % len(empty[1]) + str(zip(empty[1], map(reverseDictionary.get, empty[1]))))
        nonzeroElems = len(result.nonzero()[0])
        nearZeroElems = len(result.compress(result.flat >= 1e-3))
        logging.info("TFIDF sparsity: %i/%i = %.3f%% (%.3f%% >= 1e-3)" %
                     (nonzeroElems, result.size,
                      100.0 * nonzeroElems / result.size, 100.0 * nearZeroElems / result.size))
    return result
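def _demoTfIdfWeighting():
    # Hedged sketch (`_demoTfIdfWeighting` is a hypothetical helper): the dense
    # branch of getTFIDFMatrix normalizes each document column to relative
    # frequencies, then scales each term row by idf = log2(numDocs / docFreq),
    # matching the formula used in the sparse branch. Shown here on a toy
    # 2-term, 2-document matrix.
    bow = numpy.array([[2.0, 1.0],   # term 0 appears in both documents
                       [2.0, 0.0]])  # term 1 appears only in document 0
    tf = bow / bow.sum(axis=0)       # column-normalize: [[0.5, 1.0], [0.5, 0.0]]
    docFreq = numpy.sum(bow > 0, axis=1)             # [2, 1]
    idfs = numpy.log(2.0 / docFreq) / numpy.log(2)   # [log2(1)=0, log2(2)=1]
    tfidf = tf * idfs[:, numpy.newaxis]              # scale each term row by its idf
    assert numpy.allclose(tfidf, [[0.0, 0.0], [0.5, 0.0]])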
coll.addDocument(df.createDocument("minors graph eps trees system computer survey user human time interface response.", "full_doc"))
if not coll.docExists('m1'):
    coll.addDocument(df.createDocument(texts['m1'], 'brekeke'))
coll.createDictionary()
for doc in coll.getDocs():
    print "dc1: %s (%i):" % (doc.getId(), len(doc.getTokenIds())), doc.getTokenIds()
print coll.getDictionary()

mat = coll.getBOWMatrix()
dfm = coll.getDocFreqMap()
stopList = ['a', 'and', 'of', 'the', ':', 'totallyirrelevant']  # fixed stoplist
stopIds = utils_dml.text2vect(stopList, coll.getDictionary())
stopIds.extend(coll.freqRange(dfm, 0, 2))  # extend stopIds with ids that have 0 <= document frequency < 2
print 'stoplist = ', map(utils_dml.reverseMap(coll.getDictionary()).__getitem__, stopIds)

for doc in coll.getDocs():
    print "before filter: %s (%i):" % (doc.getId(), len(doc.getTokenIds())), zip(doc.getTokenIds(), doc.getTokens())
coll.filterIds(stopIds)  # remove unwanted tokens
for doc in coll.getDocs():
    print "after filter, before rebuild: %s (%i):" % (doc.getId(), len(doc.getTokenIds())), zip(doc.getTokenIds(), doc.getTokens())
coll.rebuildDictionary()
for doc in coll.getDocs():
    print "after rebuild: %s (%i):" % (doc.getId(), len(doc.getTokenIds())), zip(doc.getTokenIds(), doc.getTokens())
coll.filterExtremes(1)