import os
import struct

# project modules used below
import config
import epub
import lexems


def create_index(root):
    index_root = os.path.join(root, config.IndexPath)
    if not os.path.exists(index_root):
        os.makedirs(index_root, config.DirectoryDefaultMode)
    # 'docid' stores the full paths of the indexed books back to back;
    # 'docid.idx' stores the byte offset of every path, so a docid can be
    # mapped back to its file name later.
    docid_file = open(os.path.join(index_root, 'docid'), 'wb')
    docid_idx = open(os.path.join(index_root, 'docid.idx'), 'wb')
    dictionary = {}   # lexeme -> lexeme id
    index = []        # lexeme id -> posting list (sorted docids)
    lexid = 0
    docid = 0
    root = os.path.abspath(root)
    for parent, dirs, files in os.walk(root):
        for name in files:
            if not name.lower().endswith('.epub'):
                continue
            full_name = os.path.join(parent, name)
            try:
                info = epub.Info(full_name)
            except Exception as ex:
                print('error while parsing file:', full_name, ex)
                continue
            # Remember where this document's path starts, then append the path.
            docid_idx.write(struct.pack('i', docid_file.tell()))
            docid_file.write(full_name.encode(config.FileCodePage))
            # Index the lexemes of the author and title fields.
            lex = lexems.get(' '.join(info.authors()))
            lex += lexems.get(' '.join(info.titles()))
            for w in lex:
                if w not in dictionary:
                    dictionary[w] = lexid
                    lexid += 1
                    index.append([])
                posting = index[dictionary[w]]
                # Documents are visited in increasing docid order, so skipping a
                # repeated docid keeps every posting list sorted and duplicate-free.
                if len(posting) == 0 or posting[-1] != docid:
                    posting.append(docid)
            docid += 1
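For illustration, here is a minimal sketch of how a docid could be mapped back to its file path using the two files written above. The helper name `document_path` and the exact read logic are my assumptions; only the on-disk layout (4-byte offsets in 'docid.idx', encoded paths in 'docid') comes from the code above.

def document_path(index_root, docid):
    # Each entry in 'docid.idx' is a 4-byte offset into 'docid' (assumption:
    # offsets were written with struct.pack('i', ...), as in create_index).
    int_size = struct.calcsize('i')
    with open(os.path.join(index_root, 'docid.idx'), 'rb') as idx, \
         open(os.path.join(index_root, 'docid'), 'rb') as paths:
        idx.seek(docid * int_size)
        start = struct.unpack('i', idx.read(int_size))[0]
        next_raw = idx.read(int_size)
        paths.seek(start)
        if next_raw:
            # The path runs up to the next stored offset.
            end = struct.unpack('i', next_raw)[0]
            data = paths.read(end - start)
        else:
            # Last document: the path runs to the end of the file.
            data = paths.read()
        return data.decode(config.FileCodePage)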
def search(keywords, idx):
    def size_cmp(lhs, rhs):
        return len(lhs) - len(rhs)

    keywords = lexems.get(keywords)
    docs = []
    for keyword in keywords:
        docs.append(idx.docids(keyword.lower()))
    if len(docs) == 0:
        return docs
    # Intersect the shortest posting lists first, so the intermediate
    # result stays as small as possible.
    docs.sort(size_cmp)
    docids = docs[0]
    for i in xrange(1, len(docs)):
        docids = intersect(docids, docs[i])
    return docids
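The intersect helper called by search is not shown in this section. Since create_index keeps every posting list sorted and duplicate-free, a linear two-pointer merge is enough; the sketch below is one possible implementation under that assumption, not necessarily the original one.

def intersect(lhs, rhs):
    # Two-pointer merge of two sorted, duplicate-free docid lists.
    result = []
    i = j = 0
    while i < len(lhs) and j < len(rhs):
        if lhs[i] == rhs[j]:
            result.append(lhs[i])
            i += 1
            j += 1
        elif lhs[i] < rhs[j]:
            i += 1
        else:
            j += 1
    return result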