def write(plist, url, title, config):
    '''
    RQ worker function which adds the given document posting list data to the
    inverted index.

    The document is recorded only while the document list holds fewer
    entries than the configured maximum and does not already contain the
    document's ID (the hex MD5 digest of its URL). Otherwise the call does
    nothing.

    Args:
        plist: Posting list data for the document; handed unchanged to
            InvertedIndex.append().
        url: URL of the document. Text (unicode) values are UTF-8 encoded
            before hashing; byte strings are hashed as given.
        title: Title of the document, stored next to the URL in the
            document list.
        config: Object exposing get(section, option). The options read are
            crawler/max_docs, indexer/term_dict_file and
            indexer/doc_list_file. Presumably a ConfigParser instance --
            confirm against the caller.

    Returns:
        None.
    '''
    max_docs = int(config.get('crawler', 'max_docs'))
    term_dict_file = config.get('indexer', 'term_dict_file')
    doc_list_file = config.get('indexer', 'doc_list_file')

    doc_list = DocList(doc_list_file)
    if len(doc_list) >= max_docs:
        # Document cap reached: nothing more is indexed.
        return

    # Hash bytes explicitly. Relying on the implicit ASCII encoding fails
    # for URLs containing non-ASCII characters; ASCII-only URLs produce the
    # same digest as before.
    url_bytes = url if isinstance(url, bytes) else url.encode('utf-8')
    did = md5(url_bytes).hexdigest()
    if did in doc_list:
        # Already recorded; do not index the same document twice.
        return

    # NOTE(review): the original line breaks were lost; the index update is
    # assumed to run only for newly added documents -- confirm.
    doc_list.append(url, title)
    iidx = InvertedIndex(term_dict_file, doc_list_file)
    iidx.append(plist, did)
    iidx.update()
def write(plist, url, title, config):
    '''
    RQ worker function which adds the given document posting list data to the
    inverted index.

    Nothing is written when the document list has already reached the
    configured maximum size, or when it already contains the document's ID
    (the hex MD5 digest of its URL).

    Args:
        plist: Posting list data for the document; handed unchanged to
            InvertedIndex.append().
        url: URL of the document. Text (unicode) values are UTF-8 encoded
            before hashing; byte strings are hashed as given.
        title: Title of the document, stored next to the URL in the
            document list.
        config: Object exposing get(section, option). The options read are
            crawler/max_docs, indexer/term_dict_file and
            indexer/doc_list_file. Presumably a ConfigParser instance --
            confirm against the caller.

    Returns:
        None.
    '''
    max_docs = int(config.get('crawler', 'max_docs'))
    term_dict_file = config.get('indexer', 'term_dict_file')
    doc_list_file = config.get('indexer', 'doc_list_file')

    doc_list = DocList(doc_list_file)
    if len(doc_list) >= max_docs:
        # Document cap reached: nothing more is indexed.
        return

    # Hash bytes explicitly. Relying on the implicit ASCII encoding fails
    # for URLs containing non-ASCII characters; ASCII-only URLs produce the
    # same digest as before.
    if isinstance(url, bytes):
        url_bytes = url
    else:
        url_bytes = url.encode('utf-8')
    did = md5(url_bytes).hexdigest()
    if did in doc_list:
        # Already recorded; do not index the same document twice.
        return

    # NOTE(review): the original line breaks were lost; the index update is
    # assumed to run only for newly added documents -- confirm.
    doc_list.append(url, title)
    iidx = InvertedIndex(
        term_dict_file,
        doc_list_file
    )
    iidx.append(plist, did)
    iidx.update()
# Driver script: parses a fixed set of local HTML documents and appends each
# document's posting list to an InvertedIndex, keyed by its document ID.
from tokenizer import DocProcessor
from indexer import InvertedIndex

# Document ID -> path of the HTML file to process.
docs = {
    1: '/home/ubuntu/eecs767/var/docs/doc1.html',
    2: '/home/ubuntu/eecs767/var/docs/doc2.html',
    3: '/home/ubuntu/eecs767/var/docs/doc3.html',
    4: '/home/ubuntu/eecs767/var/docs/doc4.html',
    5: '/home/ubuntu/eecs767/var/docs/doc5.html',
}

dproc = DocProcessor()
iidx = InvertedIndex()

# items() and a single-argument parenthesised print behave the same on
# Python 2 and Python 3; sorting makes the ascending-ID processing order
# explicit instead of depending on dict iteration order.
for did, doc in sorted(docs.items()):
    print('-- Processing Doc #%s: %s' % (did, doc))
    dproc.parse(doc)
    plist = dproc.gen_posting_list()
    iidx.append(plist, did)

# NOTE(review): the original line breaks were lost; update() and clear() are
# assumed to run once after every document has been appended -- confirm.
iidx.update()
iidx.clear()