def main(argv):
    if len(argv) <= 1:
        print __doc__
        sys.exit(-1)

    option = argv[1]
    if option == '-q':
        print 'getQueueStatus numIndexed %s date %s numQueued %s' % getQueueStatus()
    elif option == '-t':
        logpath = cfg.getpath('logs')
        qlogs = _getQueuedLogs(logpath)
        transformed, discarded = TransformProcess().run(logpath, qlogs)
        print transformed, discarded
    elif option == '-i':
        logpath = cfg.getpath('logs')
        qtxts = _getQueuedText(logpath)
        indexed, discarded = IndexProcess().run(logpath, qtxts)
        print indexed, discarded
    elif option == '-b':
        # backdate lastRequest to enable _shouldTransform
        messagelog.mlog.lastRequest = datetime.datetime(1990, 1, 1)
        result = backgroundIndexTask()
        print result
def __init__(self):
    self.lock = threading.RLock()
    self.pathname = cfg.getpath("weblib") / self.DEFAULT_FILENAME
    self.wlib = weblib.WebLibrary(self)
    self.writer = None
    self.reset()
def search(query, start, end):
    # search
    indexpath = cfg.getpath('archiveindex')
    searcher = lucene_logic.Searcher(pathname=indexpath)
    query = query.rewrite(searcher.reader.reader)
    hits = searcher.search(query)
    hitList = sortHits(hits, searcher.reader.maxDoc() + 2000)

    # prepare the highlighter
    formatter = SimpleHTMLFormatter("<span class='highlight'>", "</span>")
    highlighter = Highlighter(formatter, QueryScorer(query))
    highlighter.setTextFragmenter(SimpleFragmenter(50))
    analyzer = StandardAnalyzer()

    # build a MatchItem list
    result = []
    for i in xrange(start, end):
        if i >= hits.length():
            break
        item = MatchItem(hitList, i)
        try:
            item.highlight(analyzer, highlighter)
        except Exception, e:
            log.exception('Error highlighting %s' % item)
        #item.explaination = str(searcher.explain(query, item.id))
        result.append(item)
    return result
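# A rough usage sketch for search() (assumed, not from this module): build a
# Lucene query and fetch the first page of MatchItems. The field name
# 'content' and the QueryParser import are assumptions about the PyLucene
# version in use.
#
#   from PyLucene import QueryParser, StandardAnalyzer
#   q = QueryParser('content', StandardAnalyzer()).parse('python indexing')
#   for match in search(q, 0, 10):     # hits 0..9
#       print match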
def main(argv): """ Helper to show structure of template """ filename = argv[1] pathname = cfg.getpath('docBase')/filename print print 'File:', pathname print 'Date:',str(datetime.datetime.now())[:19] fp = file(pathname,'rb') template = HTMLTemplate.Template(None, fp.read()) print template.structure()
def parseId(id):
    """ Return the arc_path, filename represented by id.
        e.g. id=123456789 -> $archive/123456.zip/789
        Raises KeyError if malformed.
    """
    if not id.isdigit() or len(id) != 9:
        raise KeyError, 'Invalid id: %s' % str(id)
    arc_path = cfg.getpath('archive') / (id[:6] + '.zip')
    return arc_path, id[6:]
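# Worked example of the id -> (arc_path, filename) split above; $archive is
# whatever cfg.getpath('archive') resolves to:
#
#   parseId('123456789')  ->  ($archive/123456.zip, '789')
#   parseId('12345')      ->  raises KeyError (not a 9 digit string)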
def getQueueStatus():
    """ Return (totalIndexed, archive_date, numQueued): the number of docs
        indexed, the date of the oldest archived doc found, and the number
        of docs queued.
    """
    from minds import lucene_logic
    global totalIndexed, archive_date

    if totalIndexed < 0:
        indexpath = cfg.getpath('archiveindex')
        reader = lucene_logic.Reader(indexpath)
        totalIndexed = reader.reader.numDocs()
        # find archive_date from the first 10 documents
        for i in range(1, min(11, totalIndexed)):
            doc = reader.reader.document(i)
            d = doc.get('date')
            if d:
                archive_date = d
                break
        reader.close()

    logpath = cfg.getpath('logs')
    numQueued = len(_getQueuedText(logpath)) + len(_getQueuedLogs(logpath))
    return totalIndexed, archive_date, numQueued
def forwardTmpl(wfile, env, tmpl, renderMod, *args):
    # e.g. SCRIPT_NAME='/admin/snoop.py', tmpl='tmpl/home.html'
    #scriptname = env.get('SCRIPT_NAME','')                          # '/admin/snoop.py'
    #scriptpath, scriptfile = os.path.split(scriptname.lstrip('/'))  # 'admin', 'snoop'

    # invoke tmpl's render() method
    fp = file(cfg.getpath('docBase') / tmpl)
    template = HTMLTemplate.Template(renderMod.render, fp.read())
    wfile.write(template.render(*args))
def main():
    import PyLucene
    setup()

    # log some system info
    platform = sys.platform
    if 'win32' in sys.platform:
        platform += str(sys.getwindowsversion())
    log.info('-' * 70)
    log.info('%s %s', cfg.application_name, cfg.get('version.number'))
    log.info('Python %s', sys.version)
    log.info('  Platform %s', platform)
    log.info('  pwd: %s, defaultencoding: %s', os.getcwd(), sys.getdefaultencoding())
    log.info('PyLucene %s Lucene %s LOCK_DIR %s',
        PyLucene.VERSION, PyLucene.LUCENE_VERSION, PyLucene.FSDirectory.LOCK_DIR)

    # show index version
    import lucene_logic
    dbindex = cfg.getpath('archiveindex')
    reader = lucene_logic.Reader(pathname=dbindex)
    version = reader.getVersion()
    reader.close()
    log.info('  Index version %s', version)

    proxyThread = threading.Thread(target=proxyMain, name='proxy')
    #proxyThread.setDaemon(True)
    proxyThread.start()

    adminThread = PyLucene.Thread(runnable(adminMain))
    #adminThread.setDaemon(True)
    adminThread.start()

    # time.sleep(3)
    indexThread = PyLucene.Thread(runnable(indexMain))
    indexThread.start()

    # main thread sleeps until shutdown is signalled
    _shutdownEvent.wait()

    # shutdown
    indexThread.join()
    log.fatal('indexThread terminated.')
    adminThread.join()
    log.fatal('adminThread terminated.')
    proxyThread.join()
    log.fatal('proxyThread terminated.')
    log.fatal('End of main thread.')
def output(self, *args):
    # generate the content first
    self.content_text = self.template.render(*args)
    self.style_block, self.script_block, self.content_text = \
        _split_style_script_block(self.content_text)

    # render the layout frame; insert content inside
    tpath = cfg.getpath('docBase') / self.LAYOUT_TMPL
    fp = tpath.open('rb')
    try:
        tmpl = fp.read()
    finally:
        fp.close()
    layoutTemplate = HTMLTemplate.Template(self.render_layout, tmpl)
    output = layoutTemplate.render()
    self.out.write(output)
def doSnapshot(wfile, form, str_rid, item):
    url = form.getfirst('url')
    if not url and item:
        url = item.url

    shot = snapshot.Snapshot()
    shot.fetch(url)

    spath = cfg.getpath('weblibsnapshot') / ('%s.mhtml' % str_rid)
    fp = spath.open('wb')
    try:
        shot.generate(fp)
    finally:
        fp.close()

    if item:
        t = datetime.datetime.now()
        item.cached = str(t)[:10]

    response.redirect(wfile, '../snapshotFrame')
def doShowSnapshot(wfile, rid, rid_path):
    # rid_path is really for the user's information only;
    # rid alone determines where to go.
    wlib = store.getWeblib()
    item = wlib.webpages.getById(rid)
    if not item:
        wfile.write('404 not found\r\n\r\n%s not found' % rid)
        return

    filename = rid == -1 and '_.mhtml' or '%s.mhtml' % rid
    # TODO: check that the file exists; move to weblib? getSnapshotFile()?
    fp = (cfg.getpath('weblibsnapshot') / filename).open('rb')
    obj = mhtml.LoadedWebArchive.load_fp(fp)

    # do visit?
    # wlib.visit(item)

    response.redirect(wfile, obj.root_uri)
def main(argv):
    if len(argv) < 2:
        print __doc__
        sys.exit(-1)

    index_path = argv[1]
    shutil.rmtree(index_path, True)

    starttime = datetime.datetime.now()
    apath = cfg.getpath('archive')
    idc = docarchive.idCounter
    idc._findIdRange()
    beginId = idc._beginId
    endId = idc._endId

    print 'Reindex %s(#%d-%d) -> %s' % (apath, beginId, endId, index_path)
    reindex(apath, beginId, endId, index_path)
    print 'Reindex finished:', datetime.datetime.now() - starttime
def _findIdRange(self):
    """ Scan the zip files in the $archive directory for the begin and end id. """
    apath = cfg.getpath('archive')
    files = fileutil.listdir(apath, self.arc_pattern)
    if not files:
        self._beginId = 0
        self._endId = 0
        return

    first_arc = min(files)
    last_arc = max(files)
    first = self._findId(apath / first_arc, min)
    last = self._findId(apath / last_arc, max)
    self._beginId = int(first_arc[:6] + first)       # a 9 digit id
    self._endId = int(last_arc[:6] + last) + 1       # one past the last 9 digit id
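# Worked example of the range scan above, assuming the archive holds
# 000001.zip (members '000'..'789') and 000002.zip (members '000'..'123');
# the filenames are illustrative only:
#
#   first_arc = '000001.zip', first = '000'  ->  _beginId = 1000
#   last_arc  = '000002.zip', last  = '123'  ->  _endId   = 2124  (end exclusive)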
def backgroundIndexTask(forceIndex=False):
    """ This is the main task of qmsg_processor. The task has two phases.

        I. Transform phase
          Parse *.qlog
          Filter out unwanted docs
          Transform into *.qtxt
          Add into archive
          Suspend this process when a user accesses the proxy.

        II. Index phase
          Add *.qtxt into index
          Optimize
          During optimize, block out searching.
          (12/03/04 note: Due to the GIL and the PyLucene implementation,
          it will actually block out everything, including the proxy.)

        Returns (transformed, indexed, discarded)
    """
    interval = cfg.getint('indexing.interval', 3)
    logpath = cfg.getpath('logs')
    now = datetime.datetime.now()

    transformed = 0
    discarded_t = 0
    indexed = 0
    discarded_i = 0

    qlogs = _getQueuedLogs(logpath)
    if forceIndex or _shouldTransform(now, interval):
        transformed, discarded_t = TransformProcess().run(logpath, qlogs)

    qtxts = _getQueuedText(logpath)
    if forceIndex or \
       (_shouldTransform(now, interval) and _shouldIndex(now, logpath, qtxts)):
        # the first check is whether there is new activity
        indexed, discarded_i = IndexProcess().run(logpath, qtxts)

    return transformed, indexed, discarded_t + discarded_i
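# Minimal usage sketch: force one transform+index pass and report the counts.
# backgroundIndexTask() is presumably invoked periodically by a scheduler;
# calling it directly like this mirrors the '-b' command line option above.
#
#   transformed, indexed, discarded = backgroundIndexTask(forceIndex=True)
#   print 'transformed=%d indexed=%d discarded=%d' % (
#       transformed, indexed, discarded)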
def __init__(self, wfile, content_type='text/html', encoding='utf-8',
        cache_control='no-cache'):
    self.wfile = wfile
    self.content_type = content_type
    self.encoding = encoding
    self.cache_control = cache_control
    self.cookie = Cookie.SimpleCookie()

    # load template
    tpath = cfg.getpath('docBase') / self.TEMPLATE_FILE
    fp = tpath.open('rb')
    try:
        self.template = HTMLTemplate.Template(self.render, fp.read())
    finally:
        fp.close()
def testSafeConfig(self):
    # make sure we are using the safe test config
    keys = [n for n, v in cfg.cparser.items('path')]

    # take these items outside of the test
    keys.remove('docbase')
    keys.remove('testdoc')

    # check that the code above does what we want
    self.assert_('data' in keys)
    self.assert_('logs' in keys)
    self.assert_('weblibsnapshot' in keys)
    self.assert_('archiveindex' in keys)

    for name in keys:
        self.assert_('test' in cfg.getpath(name))

    # we get a test path even if we import cfg from config
    from minds.config import cfg as config_cfg
    for name in keys:
        self.assert_('test' in config_cfg.getpath(name))
def __init__(self, wfile, content_type='text/html', encoding='utf-8',
        cache_control='no-cache'):
    # load template
    tpath = cfg.getpath('docBase') / self.TEMPLATE_FILE
    fp = tpath.open('rb')
    try:
        self.template = HTMLTemplate.Template(self.render, fp.read())
    finally:
        fp.close()

    # write HTTP header
    wfile.write('Content-type: %s; charset=%s\r\n' % (content_type, encoding))
    if cache_control:
        wfile.write('Cache-control: %s\r\n' % (cache_control,))
    wfile.write('\r\n')

    # build encoded output stream
    self.out = codecs.getwriter(encoding)(wfile, 'replace')
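# For the default arguments (content_type='text/html', encoding='utf-8',
# cache_control='no-cache'), the constructor above emits exactly:
#
#   Content-type: text/html; charset=utf-8\r\n
#   Cache-control: no-cache\r\n
#   \r\n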
def setupLogging():
    # remove any bootstrap log handler installed
    # (iterate over a copy; removeHandler mutates rootlog.handlers)
    rootlog = logging.getLogger()
    map(rootlog.removeHandler, rootlog.handlers[:])

    syslogpath = cfg.getpath('logs') / 'system.log'
    hdlr = logging.handlers.RotatingFileHandler(syslogpath, 'a', 1100000, 4)
    formatter = logging.Formatter('%(asctime)s %(name)-10s - %(message)s')
    hdlr.setFormatter(formatter)

    # work around [python-Bugs-1314519] logging runs into deadlock in some error handling situations
    # https://sourceforge.net/tracker/?func=detail&atid=105470&aid=1314519&group_id=5470
    hdlr.lock = threading.RLock()

    rootlog.addHandler(hdlr)
    rootlog.setLevel(logging.DEBUG)

    # redirect stdout and stderr to the log
    sys.stdout = LogFileObj(logging.getLogger('stdout'))
    sys.stderr = LogFileObj(logging.getLogger('stderr'))
    print 'stdout ready'
    print >>sys.stderr, 'stderr ready'
def _get_snapshot_content(self, item):
    # TODO: refactor
    filename = item.id == -1 and '_.mhtml' or '%s.mhtml' % item.id
    spath = cfg.getpath('weblibsnapshot') / filename
    if not spath.exists():
        return ''

    # TODO: check file exists; move to weblib? getSnapshotFile()?
    fp = spath.open('rb')
    lwa = mhtml.LoadedWebArchive(fp)
    resp = lwa.fetch_uri(lwa.root_uri)
    if not resp:
        return ''

    # TODO: lucene_logic: use of docid is confusing with lucene's internal docid?
    # TODO: mind content-type, encoding, framed objects??
    data = resp.read()
    meta = {}
    contentBuf = StringIO.StringIO()
    result = distillML.distill(resp, contentBuf, meta=meta)
    contentBuf.seek(0)
    # TODO: what's the deal with writeHeader?
    meta, content = distillparse.parseDistillML(contentBuf, writeHeader=None)
    return content
def _open(self):
    from minds import lucene_logic
    indexpath = cfg.getpath('archiveindex')
    self.writer = lucene_logic.Writer(indexpath)
    self.searcher = lucene_logic.Searcher(pathname=indexpath)
def openDomainFp(*args):
    """ Open the domain data file. """
    filename = cfg.getpath('logs') / DOMAINFILE
    return file(filename, *args)
def init_index(self):
    from minds import lucene_logic
    wpath = cfg.getpath('weblibindex')
    self.index_writer = lucene_logic.Writer(wpath)
    self.index_reader = lucene_logic.Reader(wpath)
    self.index_searcher = lucene_logic.Searcher(pathname=wpath)