Example 1
    def fixlinks(tag, chosen_pages):
        """Multiple postprocesses to the links"""

        # If there is an image inside the <a> tag, we remove the link but leave the child image
        child_img_tag = tag.find("img")
        if child_img_tag:
            tag.replace_with(child_img_tag)
            return

        link = tag.attrs.get('href')

        # Remove the <a> tag if there is no href
        if not link:
            tag.unwrap()
            return

        # this is a classic article link; get the filename from the link
        # in the same format used in 'chosen_pages'
        if link.startswith(SEPLINK):
            fname = link[len(SEPLINK):]
            # remove fragment part if any
            fname = fname.split("#")[0]
            fname = urllib.parse.unquote(fname)
            fname = to3dirs.to_filename(fname)
            # if it was chosen, leave it as is
            if fname not in chosen_pages:
                # mark an unchosen page with the 'nopo' class
                tag['class'] = tag.get('class', []) + ['nopo']
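For contrast with the real test shown later in Example 5 (which covers the chosen case), here is a minimal sketch of the opposite case, assuming the same ImageParser class and that SEPLINK is '/wiki/': a link whose target is not in chosen_pages comes back marked with the 'nopo' class.

    import urllib.parse
    import bs4

    def test_fixlinks_nopo_sketch(name):
        """Sketch: a wiki link to an unchosen page gets the 'nopo' class."""
        chosen_pages = set()  # deliberately empty: the target page was not chosen
        html = '<a href="/wiki/{}"></a>'.format(urllib.parse.quote(name))
        soup = bs4.BeautifulSoup(html, 'lxml')
        a_tag = soup.find('a')
        ImageParser.fixlinks(a_tag, chosen_pages)
        assert 'nopo' in a_tag.attrs.get('class')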
Example 2
    def next(self):
        while True:
            if self.test_limit is not None:
                self.test_limit -= 1
                if self.test_limit <= 0:
                    raise StopIteration
            line = self.fh.readline().strip()
            if line == "":
                raise StopIteration
            if line == "page_title":
                continue
            basename = line  # already stripped; text mode needs no decoding
            path = os.path.join(self.dest_dir, to3dirs.to_path(basename))
            disk_name = os.path.join(path, to3dirs.to_filename(basename))
            if not os.path.exists(disk_name):
                if not os.path.exists(path):
                    os.makedirs(path)

                quoted_url = urllib.parse.quote(basename)
                # skip Wikipedia's automatic redirect
                wiki = WIKI % dict(lang=self.language)
                url = wiki + "w/index.php?title=%s&redirect=no" % (quoted_url,)
                data = DataURLs(url=url, temp_dir=self.temp_dir,
                                disk_name=disk_name, basename=basename)
                return data
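The redirect=no query argument makes Wikipedia serve the redirect page itself instead of following it. A minimal sketch of the URL construction above, assuming WIKI is a pattern like 'https://%(lang)s.wikipedia.org/' (an assumption; the real constant lives in the project's configuration):

    import urllib.parse

    WIKI = "https://%(lang)s.wikipedia.org/"  # assumed pattern
    quoted_url = urllib.parse.quote("Foo bar")  # -> 'Foo%20bar'
    url = WIKI % dict(lang="es") + "w/index.php?title=%s&redirect=no" % (quoted_url,)
    assert url == "https://es.wikipedia.org/w/index.php?title=Foo%20bar&redirect=no"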
Example 3
    def test_to_filename(self):
        test_paths = (
            ("*" + BARRA, "*/"),
            ("Anexo:*" + BARRA, "Anexo:*/"),
            (BARRA + BARRA + ":Tr3s.Jeans", "//:Tr3s.Jeans"),
        )

        for expected, orig in test_paths:
            self.assertEqual(expected, to_filename(orig))
Example 4
    def test_to_pagina(self):
        test_paths = (
            "*/",
            "Anexo:*/",
            "//:Tr3s.Jeans",
        )

        for s in test_paths:
            self.assertEqual(to_pagina(to_filename(s)), s)
Example 5
def test_fixlinks_no_nopo(name):
    """Test that wiki links to included pages are not marked with the 'nopo' class."""
    fname = to3dirs.to_filename(name)
    chosen_pages = {fname}
    url = urllib.parse.quote(name)
    html = '<a href="/wiki/{}"></a>'.format(url)
    soup = bs4.BeautifulSoup(html, 'lxml')
    a_tag = soup.find('a')
    ImageParser.fixlinks(a_tag, chosen_pages)
    assert a_tag.attrs.get('class') is None
Example 6
    def __init__(self, test=False):
        self.test = test
        self.to_download = {}
        self.process_now = {}
        self.dynamics = {}

        # get which files we processed last time for images and 'nopo' marks
        # (only if the articles are the same, otherwise we need to reprocess
        # everything because of the nopo marks)
        same_before = preprocess.pages_selector.same_info_through_runs
        self.processed_before = {}
        if not test and same_before and os.path.exists(config.LOG_IMAGPROC):
            with open(config.LOG_IMAGPROC, "rt", encoding="utf-8") as fh:
                for line in fh:
                    parts = line.strip().split(config.SEPARADOR_COLUMNAS)
                    dir3 = parts[0]
                    fname = parts[1]
                    dskurls = parts[2:]
                    self.processed_before[dir3, fname] = dskurls
        logger.debug("Records of images processed before: %d",
                     len(self.processed_before))

        # load information of planned downloads
        self.downloads_planned = {}
        if not test and os.path.exists(config.LOG_IMAGENES):
            with open(config.LOG_IMAGENES, "rt", encoding="utf-8") as fh:
                for line in fh:
                    dsk, web = line.strip().split(config.SEPARADOR_COLUMNAS)
                    self.downloads_planned[dsk] = web
        logger.debug("Records of images already planned to download: %d",
                     len(self.downloads_planned))

        self.imgs_ok = 0

        # load the chosen pages and their redirects
        sep = config.SEPARADOR_COLUMNAS
        self.chosen_pages = set()
        if not test:
            with open(config.PAG_ELEGIDAS, "rt", encoding="utf-8") as fh:
                self.chosen_pages = set(x.strip().split(sep)[1] for x in fh)
        logger.debug("Quantity of chosen pages, raw: %d",
                     len(self.chosen_pages))

        chpages = self.chosen_pages
        if not test:
            with open(config.LOG_REDIRECTS, "rt", encoding="utf-8") as fh:
                for line in fh:
                    orig, dest = line.strip().split(sep)
                    fname = to3dirs.to_filename(dest)
                    if fname in chpages:
                        chpages.add(orig)
        logger.debug("Quantity of chosen pages, including redirects: %d",
                     len(self.chosen_pages))
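From the parsing above, LOG_IMAGPROC holds one record per line: the dir3, the filename, and zero or more disk URLs. A minimal sketch of one such line, assuming SEPARADOR_COLUMNAS is '|' (an assumption; the real separator comes from config):

    # hypothetical LOG_IMAGPROC record, assuming the column separator is '|'
    line = "f/o/o|Foo%2Ebar|images/a.png|images/b.png"
    dir3, fname, *dskurls = line.strip().split("|")
    assert (dir3, fname) == ("f/o/o", "Foo%2Ebar")
    assert dskurls == ["images/a.png", "images/b.png"]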
Example 7
    def generar_bloques(cls, lang, verbose):
        cls._prep_archive_dir(lang)

        # import this here as it's not needed in production
        from src.preprocessing import preprocess

        # get all the articles and store them in a dict keyed by block number,
        # calculated with a hash of the name
        top_pages = preprocess.pages_selector.top_pages
        logger.debug("Processing %d articles", len(top_pages))

        numBloques = len(top_pages) // cls.items_per_block + 1
        cls.guardarNumBloques(numBloques)
        bloques = {}
        all_filenames = set()
        for dir3, filename, _ in top_pages:
            # unquote special filesystem chars
            filename_orig = urllib.parse.unquote(filename)
            all_filenames.add(filename_orig)
            bloqNum = utiles.coherent_hash(filename.encode('utf8')) % numBloques
            bloques.setdefault(bloqNum, []).append((dir3, filename))
            logger.debug("  files: %s %r %r", bloqNum, dir3, filename)

        # build the redirect dict, also separated by blocks to know where to find them
        redirects = {}
        with open(config.LOG_REDIRECTS, "rt", encoding="utf-8") as fh:
            for line in fh:
                orig, dest = line.strip().split(config.SEPARADOR_COLUMNAS)

                # only keep this redirect if it really points to a useful article
                # (discarding any possible 'fragment')
                only_name = dest.split("#")[0]
                if only_name not in all_filenames:
                    continue

                # put it in a block
                bloqNum = utiles.coherent_hash(orig.encode('utf8')) % numBloques
                # the target must be the disk filename
                dest_filename = to3dirs.to_filename(dest)
                redirects.setdefault(bloqNum, []).append((orig, dest_filename))
                logger.debug("  redirs: %s %r %r", bloqNum, orig, dest_filename)

        # build each of the compressed blocks
        tot_archs = 0
        tot_redirs = 0
        for bloqNum, fileNames in bloques.items():
            tot_archs += len(fileNames)
            redirs_thisblock = redirects.get(bloqNum, [])
            tot_redirs += len(redirs_thisblock)
            Comprimido.crear(redirs_thisblock, bloqNum, fileNames, verbose)

        return (len(bloques), tot_archs, tot_redirs)
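The block assignment depends on utiles.coherent_hash being stable across runs and platforms (Python's built-in hash() is randomized per process, so it would not do). A minimal stand-in sketch using CRC32, which has that stability; the project's actual hash function may differ:

    import zlib

    def coherent_hash_sketch(data):
        """Stand-in for utiles.coherent_hash: same result on every run/platform."""
        return zlib.crc32(data)

    numBloques = 100
    bloqNum = coherent_hash_sketch("Foo%2Ebar".encode("utf8")) % numBloques
    assert 0 <= bloqNum < numBloques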
Example 8
def test_redirects_with_special_chars(index, data, mocker, title):
    """Check redirects to pages containing encoded special filesystem chars."""
    # only target chars should be quoted: '/', '.' and '%'
    filename = to3dirs.to_filename(title)
    with open(config.LOG_TITLES, 'at', encoding='utf-8') as fh:
        fh.write('{}|{}|\n'.format(filename, title))
    top_pages = [('f/o/o', filename, 10)]
    mocker.patch('src.preprocessing.preprocess.pages_selector',
                 mocker.Mock(top_pages=top_pages))
    with open(config.LOG_REDIRECTS, 'wt', encoding='utf-8') as fh:
        fh.write('redirect|{}\n'.format(title))

    cdpindex.generate_from_html(None, None)
    assert index.create.call_count == 1
    entries = list(index.create.call_args[0][1])
    assert len(entries) == 1
Example 9
    def on_article(self, request, name):
        orig_link = utils.get_orig_link(name)
        # compressed article name contains special filesystem chars quoted
        filename = to3dirs.to_filename(name)
        try:
            data = self.art_mngr.get_item(filename)
        except Exception as err:
            raise InternalServerError("Internal error while looking up content: %s" % err)

        if data is None:
            raise ArticleNotFound(name, orig_link)

        return self.render_template('article.html',
                                    article_name=name,
                                    orig_link=orig_link,
                                    article=data,
                                    )
Example 10
def load_test_infra_data():
    """Load data from TEST_INFRA_FILENAME."""
    _path = os.path.join(config.DIR_ASSETS, 'dynamic', TEST_INFRA_FILENAME)
    items = parse_test_infra_file(_path)
    total = len(items)
    data = []
    for number, (name, check) in enumerate(items, start=1):
        orig_link = get_orig_link(name)
        article_name = to3dirs.to_filename(name)
        item = {
            'article_name_unquoted': name,
            'article_name': article_name,
            'orig_link': orig_link,
            'number': number,
            'check': check,
            'total': total,
        }
        data.append(item)
    return data
Example 11
    def test_roundtrip_simple(self):
        for word in ("moño", "foo/bar", "foo.bar"):
            r = to3dirs.to_pagina(to3dirs.to_filename(word))
            self.assertEqual(r, word)
Example 12
    def test_encoding_slash(self):
        r = to3dirs.to_filename("foo/bar")
        self.assertEqual(r, "foo%2Fbar")
Example 13
def _to_complete_path(pagina):
    """Join the 3-dirs path and the quoted filename of a page."""
    return '/'.join((to_path(pagina), to_filename(pagina)))
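A minimal usage sketch: assuming to_path produces the three-level directory split that gives to3dirs its name (one directory level per leading character of the page name, a hypothetical reconstruction; the real rules for short names and edge cases live in to3dirs), the complete path for a page would look like:

    def to_path_sketch(pagina):
        # hypothetical: one directory level for each of the first three chars
        return '/'.join(pagina[:3])

    # a page "Python" would then land at "P/y/t/Python"
    assert '/'.join((to_path_sketch("Python"), "Python")) == "P/y/t/Python"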
Example 14
    def test_encoding_dot(self):
        r = to3dirs.to_filename("foo.bar")
        self.assertEqual(r, "foo%2Ebar")
Example 15
    def test_encoding_percent(self):
        r = to3dirs.to_filename("foo%bar")
        self.assertEqual(r, "foo%25bar")
Example 16
    def test_roundtrip_crazy(self):
        word = "foo . bar / baz % more"
        r = to3dirs.to_pagina(to3dirs.to_filename(word))
        self.assertEqual(r, word)
Example 17
    def test_encoding_nothing(self):
        r = to3dirs.to_filename("moño")
        self.assertEqual(r, "moño")
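Taken together, these tests pin down the observable contract of to_filename: only the filesystem-special characters '/', '.' and '%' are percent-quoted, everything else (including non-ASCII such as "moño") passes through, and to_pagina inverts the mapping. A minimal sketch consistent with the tests, not the project's actual implementation:

    import urllib.parse

    def to_filename_sketch(pagina):
        # '%' must be quoted first, or the other replacements would get double-quoted
        return pagina.replace('%', '%25').replace('/', '%2F').replace('.', '%2E')

    def to_pagina_sketch(filename):
        return urllib.parse.unquote(filename)

    assert to_filename_sketch("foo/bar") == "foo%2Fbar"
    assert to_filename_sketch("moño") == "moño"
    word = "foo . bar / baz % more"
    assert to_pagina_sketch(to_filename_sketch(word)) == word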