Example #1
def run():
    """Extract the images from htmls, and also do extra work on those pages."""
    preprocessed = preprocess.pages_selector.top_pages
    pi = ImageParser()
    total = len(preprocessed)
    logger.info("Image parser inited")

    logger.info("Normal pages: %d pages to process", total)
    done = 0
    tl = utiles.TimingLogger(30, logger.debug)

    for dir3, fname, _ in preprocessed:
        try:
            pi.parse(dir3, fname)
        except Exception:
            logger.exception("Parsing crashed in dir3=%r fname=%r", dir3,
                             fname)
            raise

        done += 1
        tl.log("Parsing found %d images so far (%d of %d pages)", pi.quant,
               done, total)

    pi.dump()
    return pi.imgs_ok, pi.quant
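
All of these examples pace their progress output through utiles.TimingLogger, constructed with a period in seconds and a logging callable, and fed printf-style arguments through .log(). Its implementation is not part of these excerpts; the following is only a minimal sketch consistent with those call sites, and the internals are an assumption:

import time

class TimingLogger:
    """Emit a log message at most once every `period` seconds (hypothetical sketch)."""

    def __init__(self, period, log_func):
        self.period = period  # minimum seconds between emitted messages
        self.log_func = log_func  # e.g. logger.debug
        self._last_emit = 0.0  # timestamp of the last emitted message

    def log(self, template, *args):
        # drop the message unless enough time has passed since the last one
        now = time.time()
        if now - self._last_emit >= self.period:
            self._last_emit = now
            self.log_func(template, *args)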
Example #2
def run():
    """Extract the images from htmls, and also do extra work on those pages."""
    preprocesados = preprocesar.pages_selector.top_pages
    pi = ImageParser()
    total = len(preprocesados)
    logger.info("Image parser inited")

    logger.info("Extract images from special resources.")
    pi.process_dynamics(
        'portals', os.path.join(config.DIR_ASSETS, 'dynamic', 'portals.html'))

    logger.info("Normal pages: %d pages to process", total)
    done = 0
    tl = utiles.TimingLogger(30, logger.debug)
    for dir3, fname, _ in preprocesados:
        try:
            pi.parse(dir3, fname)
        except Exception:
            logger.exception("Parsing crashed in dir3=%r fname=%r", dir3,
                             fname)
            raise

        done += 1
        tl.log("Parsing found %d images so far (%d of %d pages)", pi.cant,
               done, total)

    pi.dump()
    return pi.imgs_ok, pi.cant
Example #3
def retrieve():
    """Download the images from the net."""
    lista_descargar = []

    # check which images had problems in previous runs
    log_errores = os.path.join(config.DIR_TEMP, "imagenes_neterror.txt")
    if os.path.exists(log_errores):
        with codecs.open(log_errores, "r", "utf8") as fh:
            imgs_problemas = set(x.strip() for x in fh)
    else:
        imgs_problemas = set()

    # read the reduction log and collect the images still to be downloaded
    with codecs.open(config.LOG_REDUCCION, "r", "utf8") as fh:
        for linea in fh:
            linea = linea.strip()
            if not linea:
                continue

            _, arch, url = linea.split(config.SEPARADOR_COLUMNAS)
            fullpath = os.path.join(config.DIR_TEMP, "images", arch)

            if url not in imgs_problemas and not os.path.exists(fullpath):
                lista_descargar.append((url, fullpath))

    tot = len(lista_descargar)
    p = repartidor.Pool(descargar, 5)
    tl = utiles.TimingLogger(30, logger.debug)
    errores = collections.Counter()
    c_ok = 0
    c_err = 0
    for i, result in enumerate(p.procesa(lista_descargar), 1):
        (url, fullpath), stt = result
        if stt is None:
            c_ok += 1
        else:
            errores[stt] += 1
            c_err += 1
            with codecs.open(log_errores, "a", "utf8") as fh:
                fh.write(url + "\n")

        tl.log("Downloaded image %d/%d (ok=%d, err=%d)", i, tot, c_ok, c_err)

    for code, cant in errores.most_common():
        logger.warning("Had errors: code=%r quant=%d", code, cant)
Example #4
    def procesar(self):
        resultados = self.resultados
        puntaje_extra = {}
        de_antes = 0

        # count the total number of directories to process
        total_dirs = sum(1 for _ in os.walk(self.origen))
        logger.info("Quantity of directories to process: %d", total_dirs)

        count = 0
        tl = utiles.TimingLogger(30, logger.debug)
        for cwd, directorios, archivos in os.walk(self.origen):
            partes_dir = cwd.split(os.path.sep)
            ult3dirs = join(*partes_dir[-3:])
            count += 1

            if len(ult3dirs) != 5:  # e.g. u"M/a/n"
                # we're not in a leaf, we shouldn't have any files
                if archivos:
                    logger.warning(
                        "We have content in a non-leaf "
                        "directory: %s", archivos)
                continue
            tl.log("Processing %s (%d/%d)", ult3dirs, count, total_dirs)

            for pag in archivos:
                if " " in pag:
                    logger.warning("Have names with spaces! %s %s", ult3dirs,
                                   pag)
                # check whether we already had it from a previous run
                if pag in resultados:
                    de_antes += 1
                    continue
                if pag in self.descartados_antes:
                    continue

                wikiarchivo = WikiArchivo(cwd, ult3dirs, pag)
                resultados[pag] = {}
                resultados[pag]["dir3"] = ult3dirs

                for procesador in self.preprocesadores:
                    (puntaje, otras_pags) = procesador(wikiarchivo)

                    # add the extra score
                    for extra_pag, extra_ptje in otras_pags:
                        if extra_pag in resultados:
                            prev = resultados[extra_pag].get(procesador, 0)
                            resultados[extra_pag][
                                procesador] = prev + extra_ptje
                        else:
                            ant = puntaje_extra.setdefault(extra_pag, {})
                            ant[procesador] = ant.get(procesador,
                                                      0) + extra_ptje

                    # None means the processor flagged this page to be skipped
                    if puntaje is None:
                        del resultados[pag]
                        self.descartados_file.write("%s\n" % pag)
                        break

                    # store the score
                    if puntaje != 0:
                        resultados[pag][procesador] = puntaje

                else:
                    # save it only if it was not discarded
                    wikiarchivo.guardar()

        for procesador in self.preprocesadores:
            procesador.close()
            logger.debug("Preprocessor %17s usage stats: %s",
                         procesador.nombre, procesador.stats)

        # load the redirects to take them into account
        redirects = {}
        sepcol = config.SEPARADOR_COLUMNAS
        with codecs.open(config.LOG_REDIRECTS, "r", "utf-8") as fh:
            for linea in fh:
                r_from, r_to = linea.strip().split(sepcol)
                redirects[r_from] = r_to

        # add the extra score only if we already have the target pages
        logger.debug("Distributing extra score: %d", len(puntaje_extra))
        perdidos = []
        for (pag, puntajes) in puntaje_extra.items():
            # dereference the redirect, emptying the dict as we go
            # to avoid loops
            while pag in redirects:
                pag = redirects.pop(pag)

            # assign the scores for the pages that are present
            if pag in resultados:
                for (proc, ptje) in puntajes.items():
                    resultados[pag][proc] = resultados[pag].get(proc, 0) + ptje
            else:
                perdidos.append((pag, puntajes))
        if perdidos:
            logger.warning("Lost %d scores!", len(perdidos))
            fname = join(config.DIR_TEMP, 'perdidos.txt')
            with codecs.open(fname, 'w', 'utf8') as fh:
                for pag in perdidos:
                    fh.write(u"%s\n" % (pag, ))

        return len(resultados) - de_antes, de_antes
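
Both this example and the next hinge on Python's for/else construct: the else block after the processors loop runs only when the loop completed without break, i.e. only when no processor returned None to discard the page. A standalone demonstration of that control flow, with hypothetical stand-in processors:

processors = [lambda page: 1, lambda page: None]  # hypothetical stand-ins

for proc in processors:
    score = proc("some_page")
    if score is None:
        print("page discarded")  # a processor vetoed the page
        break
else:
    # reached only if the loop was never broken out of
    print("page kept")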
Example #5
    def process(self):
        """Process all pages under a root directory."""
        # check what was processed before, and open the log file to keep appending
        if os.path.exists(config.LOG_PREPROCESADO):
            with codecs.open(config.LOG_PREPROCESADO, "rt", "utf8") as fh:
                processed_before_set = set(x.strip() for x in fh)
        else:
            processed_before_set = set()
        processed_before_log = codecs.open(config.LOG_PREPROCESADO, "at", "utf8")

        # get the total number of pages to process
        logger.info("Getting how many pages under root dir")
        total_pages = sum(len(filenames) for _, _, filenames in os.walk(self.origen))
        logger.info("Quantity of pages to process: %d", total_pages)

        # open the scores file to keep appending
        scores_log = codecs.open(LOG_SCORES_ACCUM, "at", "utf8")

        count_processed = count_new_ok = count_new_discarded = count_old_before = 0
        tl = utiles.TimingLogger(30, logger.debug)
        for cwd, _, filenames in os.walk(self.origen):
            parts_dir = cwd.split(os.path.sep)
            last3dirs = join(*parts_dir[-3:])

            if len(last3dirs) != 5:  # e.g. u"M/a/n"
                # we're not in a leaf, we shouldn't have any files
                if filenames:
                    logger.warning("We have content in a non-leaf directory: %s %s",
                                   last3dirs, filenames)
                continue

            for page_path in filenames:
                count_processed += 1
                tl.log("Processing %s (%d/%d)", last3dirs, count_processed, total_pages)

                if " " in page_path:
                    logger.warning("Have names with spaces! %s %s", last3dirs, page_path)

                # check if the page was processed or discarded before
                if page_path in processed_before_set:
                    count_old_before += 1
                    continue

                wikipage = WikiArchivo(cwd, last3dirs, page_path)

                this_total_score = 0
                other_pages_scores = []
                for procesador in self.preprocesadores:
                    tini = time.time()
                    try:
                        (this_score, other_scores) = procesador(wikipage)
                    except Exception:
                        logger.error("Processor %s crashed on page %r", procesador, page_path)
                        raise
                    self.prof_times[procesador] += time.time() - tini
                    self.prof_quant[procesador] += 1

                    # keep the score for other pages (check before to avoid a bogus function call)
                    if other_scores:
                        other_pages_scores.extend(other_scores)

                    if this_score is None:
                        # the processor indicated to discard this page
                        count_new_discarded += 1
                        break

                    # keep the score for this page
                    this_total_score += this_score

                else:
                    # all processors done, page not discarded
                    count_new_ok += 1
                    wikipage.guardar()

                    # save the real page score
                    scores_log.write("{}|R|{:d}\n".format(
                        to3dirs.to_pagina(page_path), this_total_score))

                    # save the extra pages score (that may exist or not in the dump)
                    for extra_page, extra_score in other_pages_scores:
                        scores_log.write("{}|E|{:d}\n".format(extra_page, extra_score))

                # with score or discarded, log it as processed
                processed_before_log.write(page_path + "\n")

        # all processing done for all the pages
        logger.info("Processed pages: %d new ok, %d discarded, %d already processed before",
                    count_new_ok, count_new_discarded, count_old_before)
        scores_log.close()
        processed_before_log.close()
        for procesador in self.preprocesadores:
            procesador.close()
            logger.debug("Preprocessor %17s usage stats: %s", procesador.nombre, procesador.stats)