def index_zim_file(zim_filename, output_dir=".", links_dir=None, index_contents=True,
                   mime_types=DEFAULT_MIME_TYPES, memory_limit=DEFAULT_MEMORY_LIMIT,
                   processors=1, commit_period=DEFAULT_COMMIT_PERIOD,
                   commit_limit=DEFAULT_COMMIT_LIMIT, use_progress_bar=False, **kwargs):
    """Add the articles of a ZIM file to a search index under ``output_dir``.

    An index that already exists in the target directory is reopened rather
    than overwritten, and articles whose URL is already present in it are
    skipped, so an interrupted run can be resumed by calling this again.

    Arguments:
        zim_filename: path of the ZIM file to index.
        output_dir: base directory; the actual index directory is derived
            from it by ``index_directory_path``.
        links_dir: directory passed to ``load_links_file``; when ``None`` no
            forward/reverse link information is attached to the documents.
        index_contents: when false, documents are written with
            ``content=None`` and the article text is never extracted.
        mime_types: regular expressions matched (``re.search``) against the
            mime type names of the ZIM file; only matching articles are indexed.
        memory_limit: forwarded to the index writer as ``limitmb``.
        processors: forwarded to the index writer as ``procs``.
        commit_period: seconds after which pending documents are committed.
        commit_limit: number of visited articles after which pending
            documents are committed.
        use_progress_bar: show a progress bar instead of logging periodic
            timestamped progress lines.
        **kwargs: accepted and ignored.
    """
    zim_obj = ZimFile(zim_filename, cache_size=ZIM_CACHE_SIZE)
    ix = None
    searcher = None
    writer = None

    try:
        logger.info("Indexing: %s", zim_filename)

        if not index_contents:
            logger.info("Not indexing article contents")

        if links_dir is not None:
            logger.debug("Loading links file")
            links_info = load_links_file(zim_filename, links_dir)
            if not links_info:
                logger.error("No links loaded from links directory: %s", links_dir)
        else:
            links_info = {}
            logger.warning("No links directory specified.")

        # Figure out which mime type indexes from this file we will use
        logger.debug("All mime type names: %s", zim_obj.mimeTypeList)
        logger.info("Using mime types:")
        mime_type_indexes = []
        for mt_re in mime_types:
            for mt_idx, mt_name in enumerate(zim_obj.mimeTypeList):
                if re.search(mt_re, mt_name):
                    mime_type_indexes.append(mt_idx)
                    logger.info(mt_name)

        index_dir = index_directory_path(output_dir, zim_filename)
        if not os.path.exists(index_dir):
            logger.debug("Creating index directory: %s", index_dir)
            # makedirs so that a not-yet-existing output_dir does not abort the run
            os.makedirs(index_dir)

        # Don't overwrite an existing index
        if index.exists_in(index_dir):
            logger.debug("Loading existing index")
            ix = index.open_dir(index_dir)
            searcher = ix.searcher()
        else:
            logger.debug("Creating new index")
            ix = index.create_in(index_dir, get_schema())

        writer = ix.writer(limitmb=memory_limit, procs=processors)

        num_articles = zim_obj.header['articleCount']
        if use_progress_bar:
            pbar = ProgressBar(widgets=[Percentage(), Bar(), ETA()], maxval=num_articles).start()
        else:
            logger.info("Not using progress bar, will display timestamped occasional updates.")

        # Counter for when to commit and output occasional updates
        update_count = 0
        last_update = datetime.now()
        needs_commit = False
        commit_interval = timedelta(seconds=commit_period)

        for idx, article_info in enumerate(article_info_as_unicode(zim_obj.articles())):
            if use_progress_bar:
                pbar.update(idx)

            # The periodic commit is independent of how progress is displayed;
            # only the textual progress line is tied to the logging mode.
            now = datetime.now()
            if update_count >= commit_limit or now > last_update + commit_interval:
                if not use_progress_bar:
                    percent = (idx / float(num_articles)) * 100.0 if num_articles else 0.0
                    logger.info("%s - %d/%d - %.2f%%", now.isoformat(), idx, num_articles, percent)
                update_count = 0
                last_update = now

                if needs_commit:
                    writer.commit()
                    # A committed writer is finished; forget it before
                    # requesting a new one so cleanup never touches it again.
                    writer = None
                    writer = ix.writer(limitmb=memory_limit, procs=processors)
                    needs_commit = False
            else:
                update_count += 1

            # Skip articles of undesired mime types
            if article_info['mimetype'] not in mime_type_indexes:
                continue

            # Protect read of existing documents as sometimes there are
            # incomplete writes
            existing = None
            if searcher is not None:
                try:
                    existing = searcher.document(url=article_info['url'])
                except Exception:
                    logger.exception("Unexpected exception when looking for existing indexed article for index: %d" % idx)

            # Skip articles that have already been indexed
            if existing is not None:
                continue

            content = None
            if index_contents:
                content = content_as_text(zim_obj, article_info, idx)
                # Whoosh seems to take issue with empty content
                # and complains about it not being unicode ?!
                if content is not None and len(content.strip()) == 0:
                    content = None

            # Look for forward and backwards links
            if links_info:
                article_links = links_info.get(article_info['index'], None)
                if article_links is not None:
                    article_info['reverse_links'] = article_links[0]
                    article_info['forward_links'] = article_links[1]
                else:
                    logger.debug("No links info found for index: %d" % idx)

            writer.add_document(content=content, **article_info)
            needs_commit = True

        if use_progress_bar:
            pbar.finish()

        logger.info("Making final commit")

        writer.commit()
        writer = None

        logger.info("Finished")
    finally:
        # A writer that is still set here was never committed successfully:
        # cancel it so the index is not left locked by this process.
        if writer is not None:
            try:
                writer.cancel()
            except Exception:
                logger.exception("Could not cancel uncommitted index writer")
        if searcher is not None:
            searcher.close()
        if ix is not None:
            ix.close()
        zim_obj.close()
def _count_indexable_articles(zim_obj):
    """Return how many articles of ``zim_obj`` match ``DEFAULT_MIME_TYPES``."""
    mime_type_indexes = []
    for mt_re in DEFAULT_MIME_TYPES:
        for mt_idx, mt_name in enumerate(zim_obj.mimeTypeList):
            if re.search(mt_re, mt_name):
                mime_type_indexes.append(mt_idx)

    article_count = zim_obj.header['articleCount']
    logger.debug("Checking indexable against %d articles", article_count)

    indexed_count = 0
    for idx in xrange(article_count):
        article_info = zim_obj.read_directory_entry_by_index(idx)
        if article_info['mimetype'] in mime_type_indexes:
            indexed_count += 1
    return indexed_count


def verify_indexes(zim_files, index_dir_base, indexed_count_cache=None, verbose=False):
    """Log which ZIM files have a missing, empty, incomplete or complete index.

    Arguments:
        zim_files: iterable of ZIM file paths to check.
        index_dir_base: base directory; each index directory is derived from
            it by ``index_directory_path``.
        indexed_count_cache: path of a pickle file caching the number of
            indexable articles per ZIM file. It is created/updated as counts
            are computed. When ``None`` the complete/incomplete
            classification is not computed at all.
        verbose: accepted but not used by this function.

    Nothing is returned; the findings are written to the module logger.
    """
    missing_indexes = []
    empty_indexes = []
    complete_indexes = []
    incomplete_indexes = []

    # Load a dictionary from the index cache file
    if indexed_count_cache is not None:
        if os.path.exists(indexed_count_cache):
            logger.debug("Loading existing indexable count cache: %s", indexed_count_cache)
            # NOTE(security): unpickling executes arbitrary code, so the cache
            # path must only ever point at a file written by this function.
            with open(indexed_count_cache, "rb") as cache_file:
                zim_indexable = pickle.load(cache_file)
        else:
            logger.debug("Opening new indexable count cache: %s", indexed_count_cache)
            zim_indexable = {}
    else:
        zim_indexable = None

    for zim_fn in zim_files:
        index_dir = index_directory_path(index_dir_base, zim_fn)

        logger.debug("ZIM File: %s", zim_fn)
        logger.debug("Index Dir: %s", index_dir)

        if not os.path.exists(index_dir):
            logger.debug("\tIndex is missing\n")
            missing_indexes.append((zim_fn, index_dir))
            continue

        # Separate with-statements (rather than contextlib.nested) so the ZIM
        # file is still closed when opening the index directory fails.
        with closing(ZimFile(zim_fn)) as zim_obj:
            with closing(open_dir(index_dir)) as ix:

                if ix.is_empty():
                    logger.debug("\tIndex exists but is empty\n")
                    empty_indexes.append((zim_fn, index_dir))
                    continue

                if zim_indexable is not None:
                    # Try to find indexable count from cache since it takes
                    # awhile to compute these and they never change
                    indexed_count = zim_indexable.get(zim_fn, None)
                    if indexed_count is None:
                        indexed_count = _count_indexable_articles(zim_obj)
                        zim_indexable[zim_fn] = indexed_count

                        # Store cache of indexable items in zim files
                        with open(indexed_count_cache, "wb") as cache_file:
                            pickle.dump(zim_indexable, cache_file)
                else:
                    indexed_count = None

                ix_count = ix.doc_count()
                zim_count = zim_obj.header['articleCount']

                logger.debug("\t%d total in ZIM file", zim_count)
                logger.debug("\t%d in index", ix_count)
                if indexed_count is not None:
                    logger.debug("\t%d indexable in ZIM file", indexed_count)

                    if ix_count < indexed_count:
                        incomplete_indexes.append((zim_fn, index_dir))
                        logger.debug("\tincomplete index")
                    else:
                        complete_indexes.append((zim_fn, index_dir))
                        logger.debug("\tcomplete index")

        logger.debug("")

    # Now report summary information.
    # Completeness is only evaluated when a count cache was supplied, so the
    # "Not Computed" notices belong to the case where there is none.
    if complete_indexes:
        logger.info("----------------------")
        logger.info("Complete Index Files")
        logger.info("----------------------")
    elif zim_indexable is None:
        logger.info("--------------------------------")
        logger.info("Completed Indexes Not Computed")
        logger.info("--------------------------------")
    for zim_fn, _ in complete_indexes:
        logger.info(zim_fn)

    if incomplete_indexes:
        logger.info("----------------------")
        logger.info("Incomplete Index Files")
        logger.info("----------------------")
    elif zim_indexable is None:
        logger.info("--------------------------------")
        logger.info("Incomplete Indexes Not Computed")
        logger.info("--------------------------------")
    for zim_fn, _ in incomplete_indexes:
        logger.info(zim_fn)

    if missing_indexes:
        logger.info("-------------------")
        logger.info("Missing Index Files")
        logger.info("-------------------")
    for zim_fn, _ in missing_indexes:
        logger.info(zim_fn)

    if empty_indexes:
        logger.info("--------------")
        logger.info("Index is Empty")
        logger.info("--------------")
    for zim_fn, _ in empty_indexes:
        logger.info(zim_fn)
Beispiel #3
0
def _count_indexable_articles(zim_obj):
    """Return how many articles of ``zim_obj`` match ``DEFAULT_MIME_TYPES``."""
    mime_type_indexes = []
    for mt_re in DEFAULT_MIME_TYPES:
        for mt_idx, mt_name in enumerate(zim_obj.mimeTypeList):
            if re.search(mt_re, mt_name):
                mime_type_indexes.append(mt_idx)

    article_count = zim_obj.header['articleCount']
    logger.debug("Checking indexable against %d articles", article_count)

    indexed_count = 0
    for idx in xrange(article_count):
        article_info = zim_obj.read_directory_entry_by_index(idx)
        if article_info['mimetype'] in mime_type_indexes:
            indexed_count += 1
    return indexed_count


def verify_indexes(zim_files, index_dir_base, indexed_count_cache=None, verbose=False):
    """Log which ZIM files have a missing, empty, incomplete or complete index.

    Arguments:
        zim_files: iterable of ZIM file paths to check.
        index_dir_base: base directory; each index directory is derived from
            it by ``index_directory_path``.
        indexed_count_cache: path of a pickle file caching the number of
            indexable articles per ZIM file. It is created/updated as counts
            are computed. When ``None`` the complete/incomplete
            classification is not computed at all.
        verbose: accepted but not used by this function.

    Nothing is returned; the findings are written to the module logger.
    """
    missing_indexes = []
    empty_indexes = []
    complete_indexes = []
    incomplete_indexes = []

    # Load a dictionary from the index cache file
    if indexed_count_cache is not None:
        if os.path.exists(indexed_count_cache):
            logger.debug("Loading existing indexable count cache: %s", indexed_count_cache)
            # NOTE(security): unpickling executes arbitrary code, so the cache
            # path must only ever point at a file written by this function.
            with open(indexed_count_cache, "rb") as cache_file:
                zim_indexable = pickle.load(cache_file)
        else:
            logger.debug("Opening new indexable count cache: %s", indexed_count_cache)
            zim_indexable = {}
    else:
        zim_indexable = None

    for zim_fn in zim_files:
        index_dir = index_directory_path(index_dir_base, zim_fn)

        logger.debug("ZIM File: %s", zim_fn)
        logger.debug("Index Dir: %s", index_dir)

        if not os.path.exists(index_dir):
            logger.debug("\tIndex is missing\n")
            missing_indexes.append((zim_fn, index_dir))
            continue

        # Separate with-statements (rather than contextlib.nested) so the ZIM
        # file is still closed when opening the index directory fails.
        with closing(ZimFile(zim_fn)) as zim_obj:
            with closing(open_dir(index_dir)) as ix:

                if ix.is_empty():
                    logger.debug("\tIndex exists but is empty\n")
                    empty_indexes.append((zim_fn, index_dir))
                    continue

                if zim_indexable is not None:
                    # Try to find indexable count from cache since it takes
                    # awhile to compute these and they never change
                    indexed_count = zim_indexable.get(zim_fn, None)
                    if indexed_count is None:
                        indexed_count = _count_indexable_articles(zim_obj)
                        zim_indexable[zim_fn] = indexed_count

                        # Store cache of indexable items in zim files
                        with open(indexed_count_cache, "wb") as cache_file:
                            pickle.dump(zim_indexable, cache_file)
                else:
                    indexed_count = None

                ix_count = ix.doc_count()
                zim_count = zim_obj.header['articleCount']

                logger.debug("\t%d total in ZIM file", zim_count)
                logger.debug("\t%d in index", ix_count)
                if indexed_count is not None:
                    logger.debug("\t%d indexable in ZIM file", indexed_count)

                    if ix_count < indexed_count:
                        incomplete_indexes.append((zim_fn, index_dir))
                        logger.debug("\tincomplete index")
                    else:
                        complete_indexes.append((zim_fn, index_dir))
                        logger.debug("\tcomplete index")

        logger.debug("")

    # Now report summary information.
    # Completeness is only evaluated when a count cache was supplied, so the
    # "Not Computed" notices belong to the case where there is none.
    if complete_indexes:
        logger.info("----------------------")
        logger.info("Complete Index Files")
        logger.info("----------------------")
    elif zim_indexable is None:
        logger.info("--------------------------------")
        logger.info("Completed Indexes Not Computed")
        logger.info("--------------------------------")
    for zim_fn, _ in complete_indexes:
        logger.info(zim_fn)

    if incomplete_indexes:
        logger.info("----------------------")
        logger.info("Incomplete Index Files")
        logger.info("----------------------")
    elif zim_indexable is None:
        logger.info("--------------------------------")
        logger.info("Incomplete Indexes Not Computed")
        logger.info("--------------------------------")
    for zim_fn, _ in incomplete_indexes:
        logger.info(zim_fn)

    if missing_indexes:
        logger.info("-------------------")
        logger.info("Missing Index Files")
        logger.info("-------------------")
    for zim_fn, _ in missing_indexes:
        logger.info(zim_fn)

    if empty_indexes:
        logger.info("--------------")
        logger.info("Index is Empty")
        logger.info("--------------")
    for zim_fn, _ in empty_indexes:
        logger.info(zim_fn)