Esempio n. 1
0
 def __init__(self):
     """Initialise separators, the cleaning regexp, the term map and the two helper objects."""
     self.PAGESEPARATOR = "&"  # safe since only numbers used
     self.POSITIONSEPARATOR = "|"  # safe since only numbers separating
     # Matches the punctuation characters ( ) , : ;
     self.CLEANREGEXP = re.compile(r"(\(|\)|\,|\:|\;)")
     self.termindex = dict()
     # Tokenizer loaded from the NLTK data path below.
     self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
     self.sdbindex = SimpleDBIndex()
Esempio n. 2
0
 def setUp(self):
     """Create the SimpleDBIndex under test, using the "atbroxtest" domain prefix."""
     index_under_test = SimpleDBIndex(domainprefix="atbroxtest")
     self.sdbindex = index_under_test
Esempio n. 3
0
class TestSimpleDBIndex(unittest.TestCase):
    """Tests for SimpleDBIndex, created with the "atbroxtest" domain prefix.

    NOTE(review): ``when``, ``verify`` and ``any`` are presumably mock helpers
    (mockito-style) imported elsewhere in this file -- confirm. Several tests
    delete and recreate the SimpleDB domains through the index object, so
    they may hit a real backend.
    """

    def setUp(self):
        self.sdbindex = SimpleDBIndex(domainprefix="atbroxtest")

    def _resetDomains(self):
        """Delete all domains and create them again so a test starts clean."""
        self.sdbindex._warningthisdeletesallsimpledbdomains()
        self.sdbindex._createsimpledbdomains()

    def testGetTermLine(self):
        i = 1 + 13 * self.sdbindex.MAXITEMSIZE
        origterm = "gettermline"
        term, _termLine = self.sdbindex._getTermLine(origterm, i)
        expected = origterm + self.sdbindex.ITEMINDEXSEPARATOR + str(13)
        self.assertEqual(expected, term)

    def testTermLineSize(self):
        # 15 == len("termlinesize") + len("456")
        termLine = {"termlinesize": "456"}
        self.assertEqual(15, self.sdbindex._termLineSize(termLine))

    def testFlushCache(self):
        when(self.sdbindex.sdb).batch_put_attributes(any(str), any()).thenReturn("")

        cacheentry = ("testflushcache", {"1": "abc", "2": "def"})
        self.sdbindex.batchcache.append(cacheentry)
        self.sdbindex._flushcache()
        self.assertEqual(
            None,
            verify(self.sdbindex.sdb, times=1).batch_put_attributes(any(str), any()))

    def testStore(self):
        when(self.sdbindex.sdb).batch_put_attributes(any(str), any()).thenReturn("")
        term = "teststoreterm"
        # trigger autoflushing
        # Floor division keeps the repeat count an int on Python 3 as well;
        # "a" * <float> raises TypeError.
        termLine = {"0": "a" * (self.sdbindex.MAXITEMSIZE // 10)}
        # and a forced flush with flush=True
        self.sdbindex._store(term, termLine, flush=True)
        # NOTE(review): the original comment here said "flushed twice", but
        # the assertion expects exactly one batch_put_attributes call --
        # confirm which one is intended.
        self.assertEqual(
            None,
            verify(self.sdbindex.sdb, times=1).batch_put_attributes(any(str), any()))

    def testAddInvertedFileEntry(self):
        self._resetDomains()
        term = "termtoput"
        vector = "thistermvectorshouldbeputandstored"
        self.sdbindex.addInvertedFileEntry(term, vector)
        self.assertIn("termtoput", self.sdbindex.domain.keys())
        self.assertEqual(1, len(self.sdbindex.domain.keys()))

    def testGetInvertedFileEntry(self):
        self._resetDomains()
        term = "termtoget"
        putvector = "valuetoget"
        self.sdbindex.addInvertedFileEntry(term, putvector)
        self.sdbindex._flushcache()
        # Presumably gives the backend time to make the write readable --
        # TODO confirm the delay is still needed.
        time.sleep(4)
        getvector = self.sdbindex.getInvertedFileEntry(term)
        self.assertEqual(getvector, putvector)

    def testAddAndHashUrl(self):
        self._resetDomains()
        url = "http://atbrox.com"
        urlhash = self.sdbindex.addAndHashUrl(url)
        self.assertEqual(str(hash(url)), urlhash)

    def testGetUrl(self):
        self._resetDomains()
        url = "http://www.atbrox.com"
        urlhash = self.sdbindex.addAndHashUrl(url)
        self.assertEqual(url, self.sdbindex.getUrl(urlhash))
Esempio n. 4
0
class SimpleDBSearch:
    """Build a positional inverted index in memory and query it via SimpleDBIndex.

    ``index()`` accumulates, per term, a string of page entries.  Each page
    entry starts with PAGESEPARATOR and holds the url hash followed by the
    term's positions, all joined with POSITIONSEPARATOR, e.g.
    ``"&<urlhash>|0|7&<otherhash>|3"``.
    """

    def __init__(self):
        # Sentence tokenizer loaded from the NLTK data path below.
        self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        # term -> serialized page entries, filled by index()
        self.termindex = {}
        self.sdbindex = SimpleDBIndex()
        self.PAGESEPARATOR = """&""" # safe since only numbers used
        self.POSITIONSEPARATOR="""|""" # safe since only numbers separating
        # Strips the punctuation characters ( ) , : ; from terms.
        self.CLEANREGEXP = re.compile(r"(\(|\)|\,|\:|\;)")

    def _getSentences(self, document):
        """Return the sentences of *document* as produced by the tokenizer."""
        return self.sent_detector.tokenize(document)

    def _getAllTermsInOrder(self, document):
        """Return every whitespace-separated token of *document*, in order."""
        allterms = []
        for sentence in self._getSentences(document):
            allterms.extend(sentence.split())
        return allterms

    def _normalizeTerm(self, term):
        """Return *term* with CLEANREGEXP matches removed, lowercased and stripped."""
        return self.CLEANREGEXP.sub("", term).lower().strip()

    def _getTermsWithPositions(self, document):
        """Map each normalized term to the list of its positions (as strings).

        Positions count all tokens of the document, including tokens that
        normalize to the empty string and are therefore not indexed.
        """
        termswithpos = {}
        for pos, term in enumerate(self._getAllTermsInOrder(document)):
            indexterm = self._normalizeTerm(term)
            if indexterm:
                termswithpos.setdefault(indexterm, []).append(str(pos))
        return termswithpos

    def index(self, url, document):
        """Add *document* (found at *url*) to the in-memory term index.

        Returns the whole accumulated term index (the same dict object that
        is kept on the instance), not only the entries for this document.
        """
        urlhash = self.sdbindex.addAndHashUrl(url)
        termswithpositions = self._getTermsWithPositions(document)
        for term, positions in termswithpositions.items():
            # first item is the hash of the url, rest is the term positions
            pageentry = self.POSITIONSEPARATOR.join([urlhash] + positions)
            self.termindex[term] = (
                self.termindex.get(term, "") + self.PAGESEPARATOR + pageentry)
        return self.termindex

    def writeIndexToSDB(self):
        """Send every in-memory term entry to the SimpleDB index, then flush it."""
        numterms = len(self.termindex)
        for i, term in enumerate(self.termindex):
            # Single-argument print() emits the same text on Python 2 and 3.
            print("%s  of  %s  adding tv for term: '%s' to SDB" % (i, numterms, term))
            self.sdbindex.addInvertedFileEntry(term, self.termindex[term])
        print("flushing cache")
        self.sdbindex._flushcache()

    def extractUrlHashListFromInvertedFileEntry(self, invertedFileEntry):
        """Return the sorted url hashes contained in a serialized term entry.

        An empty or ``None`` entry yields an empty list.
        """
        # TODO: extract positions
        if not invertedFileEntry:
            return []
        return sorted(
            page.split(self.POSITIONSEPARATOR)[0]
            for page in invertedFileEntry.split(self.PAGESEPARATOR)
            if page != "")

    def query(self, query):
        """Return the urls of pages that contain at least one term of *query*.

        Query terms are normalized the same way as indexed terms.  Results
        are ordered by the number of matching query terms (most first); ties
        are ordered by url hash so the output is deterministic.
        """
        urlhashforterms = {}
        for rawterm in query.split():
            term = self._normalizeTerm(rawterm)
            if not term:
                continue
            t0 = time.time()
            invertedFileEntry = self.sdbindex.getInvertedFileEntry(term)
            t1 = time.time()
            print("fetchtime|%f" % (t1 - t0))
            urlhashes = self.extractUrlHashListFromInvertedFileEntry(invertedFileEntry)
            for urlhash in urlhashes:
                urlhashforterms.setdefault(urlhash, []).append(term)

        ranked = sorted(
            ((len(terms), urlhash) for urlhash, terms in urlhashforterms.items()),
            key=lambda pair: (-pair[0], pair[1]))
        return [self.sdbindex.getUrl(urlhash) for _matches, urlhash in ranked]