def main(argv): from minds import proxy proxy.init('') print if len(argv) <= 1: print __doc__ sys.exit(-1) option = argv[1] if option == '-q': print 'getQueueStatus numIndexed %s numQueued %s' % getQueueStatus() elif option == '-t': logdir = cfg.getPath('logs') qlogs = _getQueuedLogs(logdir) transformed, discarded = TransformProcess().run(logdir, qlogs) print transformed, discarded elif option == '-i': logdir = cfg.getPath('logs') qtxts = _getQueuedText(logdir) indexed, discarded = IndexProcess().run(logdir, qtxts) print indexed, discarded elif option == '-b': messagelog.mlog.lastRequest = datetime.datetime(1990,1,1) # enable _shouldTransform result = backgroundIndexTask() print result
def getQueueStatus():
    """Return (number of docs indexed, number of docs queued).

    The indexed count is cached in the module-level ``totalIndexed``
    and is read from the Lucene archive index only while the cached
    value is negative (i.e. on the first call).
    """
    from minds import lucene_logic
    global totalIndexed
    if totalIndexed < 0:
        # lazily read the document count from the archive index
        index_path = cfg.getPath('archiveindex')
        idx_reader = lucene_logic.Reader(index_path)
        totalIndexed = idx_reader.reader.numDocs()
        idx_reader.close()
    log_dir = cfg.getPath('logs')
    pending = len(_getQueuedText(log_dir)) + len(_getQueuedLogs(log_dir))
    return totalIndexed, pending
def _open(self):
    """Open the archive index for both writing and searching."""
    from minds import lucene_logic
    index_path = cfg.getPath('archiveindex')
    self.writer = lucene_logic.Writer(index_path)
    self.searcher = lucene_logic.Searcher(pathname=index_path)
def _save(self, filename):
    """Atomically write self.buf's contents to <logs>/<filename>.

    The data is written to a '.tmp' sibling first and then renamed into
    place, so readers never observe a partially written file.
    """
    pathname = os.path.join(cfg.getPath('logs'), filename)
    tmppathname = pathname + '.tmp'
    fp = file(tmppathname, 'wb')
    try:
        fp.write(self.buf.getvalue())
    finally:
        # close even when the write raises, so the handle isn't leaked
        fp.close()
    # write to a tmp file first and then rename to make it more atomic.
    os.rename(tmppathname, pathname)
def parseId(id): """ Return arc_path, filename represents by id. e.g. id=123456789 -> $archive/123456.zip/789 Raises KeyError if malformed """ if not id.isdigit() or len(id) != 9: raise KeyError, 'Invalid id: %s' % str(id) apath = cfg.getPath('archive') return os.path.join(apath, id[:6]+'.zip'), id[6:]
def _findIdRange(self):
    """Scan the $archive directory for zip files to find the begin and end id."""
    apath = cfg.getPath('archive')
    archives = [name for name in os.listdir(apath) if self.arc_pattern.match(name)]
    if not archives:
        self._beginId = 0
        self._endId = 0
        return
    lo_arc = min(archives)
    hi_arc = max(archives)
    lo_suffix = self._findId(os.path.join(apath, lo_arc), min)
    hi_suffix = self._findId(os.path.join(apath, hi_arc), max)
    # a 9-digit id: 6 digits from the archive name + 3 from within it
    self._beginId = int(lo_arc[:6] + lo_suffix)
    self._endId = int(hi_arc[:6] + hi_suffix) + 1   # end id is exclusive
def backgroundIndexTask(forceIndex=False):
    """Main task of qmsg_processor; runs in two phases.

    I.  Transform phase -- parse *.qlog, filter out unwanted docs,
        transform them into *.qtxt and add them to the archive.
        Suspends while the user is actively using the proxy.

    II. Index phase -- add *.qtxt files to the index, then optimize.
        Optimizing blocks searching.  (12/03/04 note: due to the GIL
        and the PyLucene implementation it actually blocks everything,
        including the proxy.)

    Returns (transformed, indexed, discarded).
    """
    interval = cfg.getint('indexing', 'interval', 3)
    logdir = cfg.getPath('logs')
    now = datetime.datetime.now()

    transformed = discarded_t = 0
    indexed = discarded_i = 0

    qlogs = _getQueuedLogs(logdir)
    if forceIndex or _shouldTransform(now, interval):
        transformed, discarded_t = TransformProcess().run(logdir, qlogs)

    qtxts = _getQueuedText(logdir)
    # _shouldTransform is re-checked first: it tells whether there has
    # been any new proxy activity recently.
    if forceIndex or (_shouldTransform(now, interval) and
                      _shouldIndex(now, logdir, qtxts)):
        indexed, discarded_i = IndexProcess().run(logdir, qtxts)

    return transformed, indexed, discarded_t + discarded_i
def main(argv): if len(argv) < 2: print __doc__ sys.exit(-1) from minds import proxy proxy.init(proxy.CONFIG_FILENAME) index_path = argv[1] shutil.rmtree(index_path, True) starttime = datetime.datetime.now() dbdoc = cfg.getPath('archive') idc = docarchive.idCounter idc._findIdRange() beginId = idc.beginId endId = idc.endId print 'Reindex %s(#%d-%d) -> %s' % (dbdoc, beginId, endId, index_path) reindex(dbdoc, beginId, endId, index_path) print 'Reindex finished:', datetime.datetime.now() - starttime
def forwardTmpl(wfile, env, tmpl, renderMod, *args):
    """Render template `tmpl` with renderMod.render(*args) and write to wfile.

    `tmpl` is a path relative to the configured docBase,
    e.g. tmpl='tmpl/home.html'.
    """
    tmplPathname = os.path.join(cfg.getPath("docBase"), tmpl)
    # read the template source; close the handle even if read() raises
    fp = file(tmplPathname)
    try:
        tmpl_source = fp.read()
    finally:
        fp.close()
    # Reload the render module so edits show up without a restart
    # (development convenience).
    try:
        reload(renderMod)
    except Exception:
        # HACK: reload does not work in the py2exe service version,
        # but it is OK not to reload there.
        pass
    template = HTMLTemplate.Template(renderMod.render, tmpl_source)
    wfile.write(template.render(*args))
def openDomainFp(*args):
    """Open the domain data file under the logs directory.

    Extra positional args are passed straight through to file()
    (mode, buffering, ...).
    """
    log_dir = cfg.getPath('logs')
    return file(os.path.join(log_dir, DOMAINFILE), *args)