Example #1
    def generar_bloques(self, lang, verbose):
        self._prep_archive_dir(lang)

        # import it here because it's not needed in production
        from src.preproceso import preprocesar

        # get all the articles, and arrange them in a dict keyed by
        # block number, according to the hash
        fileNames = preprocesar.pages_selector.top_pages
        if verbose:
            print "Procesando", len(fileNames), "articulos"

        numBloques = len(fileNames) // self.items_per_block + 1
        self.guardarNumBloques(numBloques)
        bloques = {}
        all_filenames = set()
        for dir3, fileName, _ in fileNames:
            all_filenames.add(fileName)
            bloqNum = utiles.coherent_hash(
                fileName.encode('utf8')) % numBloques
            bloques.setdefault(bloqNum, []).append((dir3, fileName))
            if verbose:
                print "  archs:", bloqNum, repr(dir3), repr(fileName)

        # build the redirects dict, also split by block so we
        # know where to look them up
        redirects = {}
        for linea in codecs.open(config.LOG_REDIRECTS, "r", "utf-8"):
            orig, dest = linea.strip().split(config.SEPARADOR_COLUMNAS)

            # only keep this redirect if it really points to a useful
            # article (discarding the 'fragment', if any)
            only_name = dest.split("#")[0]
            if only_name not in all_filenames:
                continue

            # put it in a block
            bloqNum = utiles.coherent_hash(orig.encode('utf8')) % numBloques
            redirects.setdefault(bloqNum, []).append((orig, dest))
            if verbose:
                print "  redirs:", bloqNum, repr(orig), repr(dest)

        # build each of the compressed blocks
        tot_archs = 0
        tot_redirs = 0
        for bloqNum, fileNames in bloques.items():
            tot_archs += len(fileNames)
            redirs_thisblock = redirects.get(bloqNum, [])
            tot_redirs += len(redirs_thisblock)
            Comprimido.crear(redirs_thisblock, bloqNum, fileNames, verbose)

        return (len(bloques), tot_archs, tot_redirs)
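Every example on this page buckets items with utiles.coherent_hash, but the helper itself is not shown here. The sketch below is only a guess at its contract: unlike the built-in hash(), whose value for strings can vary between runs and platforms, it must return the same integer for the same input everywhere, because the block numbers derived from it are persisted to disk. The md5-based implementation is an assumption.

    # Hypothetical sketch, NOT the real utiles.coherent_hash implementation.
    import hashlib

    def coherent_hash(data):
        # md5 is an assumption; any digest that is stable across processes
        # and platforms works for persistent bucketing
        if not isinstance(data, bytes):
            data = str(data).encode('utf8')
        return int.from_bytes(hashlib.md5(data).digest(), 'big')

    numBloques = 97
    bloqNum = coherent_hash('Argentina'.encode('utf8')) % numBloques
    # bloqNum is identical on every run, unlike hash('Argentina') % numBloques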
Example #2
    def generar_bloques(self, lang, verbose):
        self._prep_archive_dir(lang)

        # import it here because it's not needed in production
        from src.preproceso import preprocesar

        # get all the articles, and arrange them in a dict keyed by
        # block number, according to the hash
        top_pages = preprocesar.pages_selector.top_pages
        if verbose:
            print "Procesando", len(top_pages), "articulos"

        numBloques = len(top_pages) // self.items_per_block + 1
        self.guardarNumBloques(numBloques)
        bloques = {}
        all_filenames = set()
        for dir3, filename, _, _ in top_pages:
            all_filenames.add(filename)
            bloqNum = utiles.coherent_hash(filename.encode('utf8')) % numBloques
            bloques.setdefault(bloqNum, []).append((dir3, filename))
            if verbose:
                print "  archs:", bloqNum, repr(dir3), repr(filename)

        # build the redirects dict, also split by block so we
        # know where to look them up
        redirects = {}
        for linea in codecs.open(config.LOG_REDIRECTS, "r", "utf-8"):
            orig, dest = linea.strip().split(config.SEPARADOR_COLUMNAS)

            # only keep this redirect if it really points to a useful
            # article (discarding the 'fragment', if any)
            only_name = dest.split("#")[0]
            if only_name not in all_filenames:
                continue

            # put it in a block
            bloqNum = utiles.coherent_hash(orig.encode('utf8')) % numBloques
            redirects.setdefault(bloqNum, []).append((orig, dest))
            if verbose:
                print "  redirs:", bloqNum, repr(orig), repr(dest)

        # build each of the compressed blocks
        tot_archs = 0
        tot_redirs = 0
        for bloqNum, fileNames in bloques.items():
            tot_archs += len(fileNames)
            redirs_thisblock = redirects.get(bloqNum, [])
            tot_redirs += len(redirs_thisblock)
            Comprimido.crear(redirs_thisblock, bloqNum, fileNames, verbose)

        return (len(bloques), tot_archs, tot_redirs)
Example #3
    def generar_bloques(cls, lang, verbose):
        cls._prep_archive_dir(lang)

        # import this here as it's not needed in production
        from src.preprocessing import preprocess

        # get all the articles, and store them in a dict keyed by block number,
        # calculated with a hash of the name
        top_pages = preprocess.pages_selector.top_pages
        logger.debug("Processing %d articles", len(top_pages))

        numBloques = len(top_pages) // cls.items_per_block + 1
        cls.guardarNumBloques(numBloques)
        bloques = {}
        all_filenames = set()
        for dir3, filename, _ in top_pages:
            # unquote special filesystem chars
            filename_orig = urllib.parse.unquote(filename)
            all_filenames.add(filename_orig)
            bloqNum = utiles.coherent_hash(filename.encode('utf8')) % numBloques
            bloques.setdefault(bloqNum, []).append((dir3, filename))
            logger.debug("  files: %s %r %r", bloqNum, dir3, filename)

        # build the redirect dict, also separated by blocks to know where to find them
        redirects = {}
        for line in open(config.LOG_REDIRECTS, "rt", encoding="utf-8"):
            orig, dest = line.strip().split(config.SEPARADOR_COLUMNAS)

            # only keep this redirect if it really points to a useful article
            # (discarding any possible 'fragment')
            only_name = dest.split("#")[0]
            if only_name not in all_filenames:
                continue

            # put it in a block
            bloqNum = utiles.coherent_hash(orig.encode('utf8')) % numBloques
            # target must be disk filename
            dest_filename = to3dirs.to_filename(dest)
            redirects.setdefault(bloqNum, []).append((orig, dest_filename))
            logger.debug("  redirs: %s %r %r", bloqNum, orig, dest_filename)

        # build each of the compressed blocks
        tot_archs = 0
        tot_redirs = 0
        for bloqNum, fileNames in bloques.items():
            tot_archs += len(fileNames)
            redirs_thisblock = redirects.get(bloqNum, [])
            tot_redirs += len(redirs_thisblock)
            Comprimido.crear(redirs_thisblock, bloqNum, fileNames, verbose)

        return (len(bloques), tot_archs, tot_redirs)
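A tiny worked example of the fragment handling above (the title is made up):

    dest = "Argentina#Historia"
    only_name = dest.split("#")[0]
    # only_name == "Argentina"; the "#Historia" fragment is discarded before
    # checking membership in all_filenames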
Example #4
    def generar_bloques(self, verbose):
        self._prep_archive_dir()

        # get all the images, and arrange them in a dict keyed by
        # block number, according to the hash
        fileNames = []
        for dirname, subdirs, files in os.walk(config.DIR_IMGSLISTAS):
            for f in files:
                name = os.path.join(dirname,
                                    f)[len(config.DIR_IMGSLISTAS) + 1:]
                fileNames.append(name)
        if verbose:
            print "Procesando", len(fileNames), "imágenes"

        numBloques = len(fileNames) // self.items_per_block + 1
        self.guardarNumBloques(numBloques)
        bloques = {}
        for fileName in fileNames:
            bloqNum = utiles.coherent_hash(
                fileName.encode('utf8')) % numBloques
            bloques.setdefault(bloqNum, []).append(fileName)
            if verbose:
                print "  archs:", bloqNum, repr(fileName)

        tot = 0
        for bloqNum, fileNames in bloques.items():
            tot += len(fileNames)
            BloqueImagenes.crear(bloqNum, fileNames, verbose)

        return (len(bloques), tot)
Example #5
    def generar_bloques(cls, verbose):
        cls._prep_archive_dir()

        # get all the images, and store them in a dict keyed by block number,
        # calculated with a hash of the name
        fileNames = []
        for dirname, subdirs, files in os.walk(config.DIR_IMGSLISTAS):
            for f in files:
                name = os.path.join(dirname, f)[len(config.DIR_IMGSLISTAS) + 1:]
                fileNames.append(name)
        logger.debug("Processing %d images", len(fileNames))

        numBloques = len(fileNames) // cls.items_per_block + 1
        cls.guardarNumBloques(numBloques)
        bloques = {}
        for fileName in fileNames:
            bloqNum = utiles.coherent_hash(fileName.encode('utf8')) % numBloques
            bloques.setdefault(bloqNum, []).append(fileName)
            logger.debug("  files: %s %r", bloqNum, fileName)

        tot = 0
        for bloqNum, fileNames in bloques.items():
            tot += len(fileNames)
            BloqueImagenes.crear(bloqNum, fileNames, verbose)

        return (len(bloques), tot)
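The prefix slicing inside the os.walk loop can also be written with os.path.relpath; a sketch of the equivalent loop, assuming config.DIR_IMGSLISTAS has no trailing separator:

    import os

    fileNames = []
    for dirname, subdirs, files in os.walk(config.DIR_IMGSLISTAS):
        for f in files:
            # path of each image relative to the images directory
            fileNames.append(os.path.relpath(os.path.join(dirname, f), config.DIR_IMGSLISTAS))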
Example #6
    def create(cls, directory, source):
        '''Creates the index in the directory.

        The "source" generates pairs (key, value) to store in the index.  The
        key must be a string, the value can be any hashable Python object.

        It must return the quantity of pairs indexed.
        '''
        ids_shelf = {}
        key_shelf = {}
        ids_cnter = 0
        tmp_reverse_id = {}
        indexed_counter = 0

        # fill them
        for key, value in source:
            indexed_counter += 1

            # process key
            if not isinstance(key, basestring):
                raise TypeError("The key must be string or unicode")

            # docid -> final info
            if value in tmp_reverse_id:
                docid = tmp_reverse_id[value]
            else:
                docid = ids_cnter
                tmp_reverse_id[value] = docid
                ids_cnter += 1
            ids_shelf[docid] = value

            # keys -> docid
            key_shelf.setdefault(key, set()).add(docid)

        # save key
        keyfilename = os.path.join(directory, "easyindex.key.bz2")
        fh = CompressedFile(keyfilename, "wb")
        cPickle.dump(key_shelf, fh, 2)
        fh.close()

        # split ids_shelf into N dicts of roughly 5k entries each
        N = int(round(len(ids_shelf) / 5000.0))
        if not N:
            N = 1
        all_idshelves = [{} for i in range(N)]
        for k, v in ids_shelf.iteritems():
            cual = utiles.coherent_hash(k) % N
            all_idshelves[cual][k] = v

        # save each dict where it corresponds
        for cual, shelf in enumerate(all_idshelves):
            fname = "easyindex-%03d.ids.bz2" % cual
            idsfilename = os.path.join(directory, fname)
            fh = CompressedFile(idsfilename, "wb")
            cPickle.dump(shelf, fh, 2)
            fh.close()

        return indexed_counter
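A hypothetical usage sketch for create; the class name Index and the target directory are assumptions, only the method itself appears above:

    # any iterable of (key, value) pairs works as the source
    pairs = [("hello", "doc-A"), ("hola", "doc-A"), ("world", "doc-B")]
    count = Index.create("/tmp/easyindex", iter(pairs))
    # count == 3: three pairs were indexed, but only two distinct values
    # exist, so only two docids were created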
Example #7
    def get_item(self, fileName):
        """Get the item from inside of a block."""
        bloqNum = utiles.coherent_hash(fileName.encode('utf8')) % self.num_bloques
        bloqName = "%08x%s" % (bloqNum, self.archive_extension)
        logger.debug("block: %s", bloqName)
        comp = self.getBloque(bloqName)
        item = comp.get_item(fileName)
        logger.debug("len item: %s", None if item is None else len(item))
        return item
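A worked example of the block-name computation above (the hash value and extension are made up):

    numBloques = 256
    bloqNum = 42                              # stand-in for coherent_hash(...) % numBloques
    bloqName = "%08x%s" % (bloqNum, ".cdp")   # -> "0000002a.cdp"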
Example #8
    def get_item(self, fileName):
        bloqNum = utiles.coherent_hash(fileName.encode('utf8')) % self.num_bloques
        bloqName = "%08x%s" % (bloqNum, self.archive_extension)
        if self.verbose:
            print "block:", bloqName
        comp = self.getBloque(bloqName)
        item = comp.get_item(fileName)
        if self.verbose and item is not None:
            print "len item:", len(item)
        return item
Example #9
    def _get_info_id(self, allids):
        '''Returns the values for the given ids.

        As it groups the ids by file, it is much faster than retrieving
        them one by one.
        '''
        # group the id per file
        cuales = {}
        for i in allids:
            cual = utiles.coherent_hash(i) % self.idfiles_count
            cuales.setdefault(cual, []).append(i)

        # get the info for each file
        for cual, ids in cuales.items():
            idx = self._get_ids_shelve(cual)
            for i in ids:
                yield idx[i]
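A sketch of the grouping step above, with a trivial stand-in for the hash (counts and ids are made up); the point is that each ids file is opened and unpickled once, no matter how many ids it must resolve:

    idfiles_count = 4
    allids = [0, 5, 8, 13]
    cuales = {}
    for i in allids:
        cuales.setdefault(i % idfiles_count, []).append(i)
    # cuales == {0: [0, 8], 1: [5, 13]} -> two file loads instead of four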