def fixlinks(tag, choosen_pages):
    """Multiple postprocesses to the links."""
    # an anchor wrapping an image: drop the link, keep the child image
    img = tag.find("img")
    if img:
        tag.replace_with(img)
        return

    href = tag.attrs.get('href')
    if not href:
        # no target at all: remove the <a> tag but keep its contents
        tag.unwrap()
        return

    if not href.startswith(SEPLINK):
        return

    # this is a classic article link: get the filename from the link in the
    # same format as found in 'choosen_pages' (no fragment, unquoted,
    # filesystem-encoded)
    page = href[len(SEPLINK):].split("#")[0]
    page = to3dirs.to_filename(urllib.parse.unquote(page))
    if page not in choosen_pages:
        # mark a page that was not chosen with the 'nopo' class
        tag['class'] = tag.get('class', []) + ['nopo']
def next(self):
    """Return a DataURLs item for the next article missing from disk.

    Old-style (Python 2) iterator protocol: raises StopIteration when the
    titles file is exhausted or the test limit is reached.  Articles that
    already exist on disk are silently skipped.
    """
    while True:
        # honor the test limit if set
        # NOTE(review): decrement-then-check means test_limit=1 raises
        # before yielding any item -- confirm the intended semantics
        if self.test_limit is not None:
            self.test_limit -= 1
            if self.test_limit <= 0:
                raise StopIteration
        line = self.fh.readline().strip()
        if line == "":
            # end of the titles file
            raise StopIteration
        if line == "page_title":
            # skip the dump's header line
            continue
        basename = line.decode("utf-8").strip()
        # where the article would live on disk (3-dirs layout)
        path = os.path.join(self.dest_dir, to3dirs.to_path(basename))
        disk_name = os.path.join(path, to3dirs.to_filename(basename))
        # only schedule a download if the article is not already stored
        if not os.path.exists(disk_name.encode('utf-8')):
            if not os.path.exists(path.encode('utf-8')):
                os.makedirs(path.encode('utf-8'))
            quoted_url = urllib.quote(basename.encode('utf-8'))
            # Skip wikipedia automatic redirect
            wiki = WIKI % dict(lang=self.language)
            url = wiki + "w/index.php?title=%s&redirect=no" % (quoted_url,)
            data = DataURLs(url=url, temp_dir=self.temp_dir,
                            disk_name=disk_name, basename=basename)
            return data
def test_to_filename(self):
    """Special chars in page names are translated via BARRA."""
    cases = [
        (u"*/", u"*" + BARRA),
        (u"Anexo:*/", u"Anexo:*" + BARRA),
        (u'//:Tr3s.Jeans', BARRA + BARRA + u':Tr3s.Jeans'),
    ]
    for page, expected in cases:
        self.assertEqual(to_filename(page), expected)
def test_to_pagina(self):
    """to_pagina is the inverse of to_filename for these names."""
    for name in (u"*/", u"Anexo:*/", u'//:Tr3s.Jeans'):
        self.assertEqual(to_pagina(to_filename(name)), name)
def test_fixlinks_no_nopo(name):
    """Test that wiki links to included pages are not marked with the 'nopo' class."""
    chosen = {to3dirs.to_filename(name)}
    html = '<a href="/wiki/{}"></a>'.format(urllib.parse.quote(name))
    anchor = bs4.BeautifulSoup(html, 'lxml').find('a')
    ImageParser.fixlinks(anchor, chosen)
    # an included page must not get any extra class
    assert anchor.attrs.get('class') is None
def __init__(self, test=False):
    """Load the state needed to process images.

    Reads records from previous runs (processed files, planned downloads)
    and builds the set of chosen pages, including redirects to them.
    In test mode none of the on-disk logs are read.
    """
    self.test = test
    self.to_download = {}
    self.process_now = {}
    self.dynamics = {}

    # get which files we processed last time for images and 'nopo' marks
    # (only if the articles are the same, otherwise we need to reprocess
    # everything because of the nopo marks)
    same_before = preprocess.pages_selector.same_info_through_runs
    self.processed_before = {}
    if not test and same_before and os.path.exists(config.LOG_IMAGPROC):
        with open(config.LOG_IMAGPROC, "rt", encoding="utf-8") as fh:
            for line in fh:
                # each record: dir3 | filename | zero or more disk urls
                parts = line.strip().split(config.SEPARADOR_COLUMNAS)
                dir3 = parts[0]
                fname = parts[1]
                dskurls = parts[2:]
                self.processed_before[dir3, fname] = dskurls
    logger.debug("Records of images processed before: %d",
                 len(self.processed_before))

    # load information of planned downloads (disk name -> web url)
    self.downloads_planned = {}
    if not test and os.path.exists(config.LOG_IMAGENES):
        with open(config.LOG_IMAGENES, "rt", encoding="utf-8") as fh:
            for line in fh:
                dsk, web = line.strip().split(config.SEPARADOR_COLUMNAS)
                self.downloads_planned[dsk] = web
    logger.debug("Records of images already planned to download: %d",
                 len(self.downloads_planned))

    self.imgs_ok = 0

    # load included files and its redirections
    sep = config.SEPARADOR_COLUMNAS
    self.chosen_pages = set()
    if not test:
        # second column of each record is the page filename
        with open(config.PAG_ELEGIDAS, "rt", encoding="utf-8") as fh:
            self.chosen_pages = set(x.strip().split(sep)[1] for x in fh)
    logger.debug("Quantity of chosen pages, raw: %d", len(self.chosen_pages))
    chpages = self.chosen_pages
    if not test:
        # a redirect whose destination was chosen is also "chosen"
        # NOTE(review): 'orig' is added as-is, not passed through
        # to_filename like 'dest' -- confirm both formats match consumers
        with open(config.LOG_REDIRECTS, "rt", encoding="utf-8") as fh:
            for line in fh:
                orig, dest = line.strip().split(sep)
                fname = to3dirs.to_filename(dest)
                if fname in chpages:
                    chpages.add(orig)
    logger.debug("Quantity of chosen pages, including redirects: %d",
                 len(self.chosen_pages))
def generar_bloques(cls, lang, verbose):
    """Build all the compressed blocks for the given language.

    Articles are distributed into blocks using a coherent hash of the
    filename; each redirect is stored in the block corresponding to its
    *origin* name so it can be found there later.

    Returns a tuple (quantity of blocks, total articles, total redirects).
    """
    cls._prep_archive_dir(lang)

    # import this here as it's not needed in production
    from src.preprocessing import preprocess

    # get all the articles, and store them in a dict using its block number,
    # calculated with a hash of the name
    top_pages = preprocess.pages_selector.top_pages
    logger.debug("Processing %d articles", len(top_pages))

    numBloques = len(top_pages) // cls.items_per_block + 1
    cls.guardarNumBloques(numBloques)
    bloques = {}
    all_filenames = set()
    for dir3, filename, _ in top_pages:
        # unquote special filesystem chars
        filename_orig = urllib.parse.unquote(filename)
        all_filenames.add(filename_orig)
        bloqNum = utiles.coherent_hash(filename.encode('utf8')) % numBloques
        bloques.setdefault(bloqNum, []).append((dir3, filename))
        logger.debug(" files: %s %r %r", bloqNum, dir3, filename)

    # build the redirect dict, also separated by blocks to know where to find them
    # (fix: use a context manager so the file handle is always closed)
    redirects = {}
    with open(config.LOG_REDIRECTS, "rt", encoding="utf-8") as fh:
        for line in fh:
            orig, dest = line.strip().split(config.SEPARADOR_COLUMNAS)

            # only keep this redirect if really points to an useful article
            # (discarding any possible 'fragment')
            only_name = dest.split("#")[0]
            if only_name not in all_filenames:
                continue

            # put it in a block
            bloqNum = utiles.coherent_hash(orig.encode('utf8')) % numBloques
            # target must be disk filename
            dest_filename = to3dirs.to_filename(dest)
            redirects.setdefault(bloqNum, []).append((orig, dest_filename))
            logger.debug(" redirs: %s %r %r", bloqNum, orig, dest_filename)

    # build each of the compressed blocks
    tot_archs = 0
    tot_redirs = 0
    for bloqNum, fileNames in bloques.items():
        tot_archs += len(fileNames)
        redirs_thisblock = redirects.get(bloqNum, [])
        tot_redirs += len(redirs_thisblock)
        Comprimido.crear(redirs_thisblock, bloqNum, fileNames, verbose)

    return (len(bloques), tot_archs, tot_redirs)
def test_redirects_with_special_chars(index, data, mocker, title):
    """Check redirects to pages containing encoded special filesystem chars."""
    # only target chars should be quoted: '/', '.' and '%'
    filename = to3dirs.to_filename(title)

    with open(config.LOG_TITLES, 'at', encoding='utf-8') as fh:
        fh.write('{}|{}|\n'.format(filename, title))

    mocker.patch(
        'src.preprocessing.preprocess.pages_selector',
        mocker.Mock(top_pages=[('f/o/o', filename, 10)]))

    with open(config.LOG_REDIRECTS, 'wt', encoding='utf-8') as fh:
        fh.write('redirect|{}\n'.format(title))

    cdpindex.generate_from_html(None, None)

    # exactly one index entry must have been created
    assert index.create.call_count == 1
    entries = list(index.create.call_args[0][1])
    assert len(entries) == 1
def on_article(self, request, name):
    """Serve the article page for the given name.

    Raises ArticleNotFound when the article is not stored, and
    InternalServerError (chained to the original exception) on any
    failure while retrieving the content.
    """
    orig_link = utils.get_orig_link(name)

    # compressed article name contains special filesystem chars quoted
    filename = to3dirs.to_filename(name)
    try:
        data = self.art_mngr.get_item(filename)
    except Exception as err:
        # fix: chain the original exception so tracebacks keep the cause
        raise InternalServerError(
            "Error interno al buscar contenido: %s" % err) from err

    if data is None:
        raise ArticleNotFound(name, orig_link)

    return self.render_template(
        'article.html',
        article_name=name,
        orig_link=orig_link,
        article=data,
    )
def load_test_infra_data():
    """Load data from TEST_INFRA_FILENAME."""
    filepath = os.path.join(config.DIR_ASSETS, 'dynamic', TEST_INFRA_FILENAME)
    items = parse_test_infra_file(filepath)
    total = len(items)
    # one dict per item, numbered from 1
    return [
        {
            'article_name_unquoted': name,
            'article_name': to3dirs.to_filename(name),
            'orig_link': get_orig_link(name),
            'number': number,
            'check': check,
            'total': total,
        }
        for number, (name, check) in enumerate(items, start=1)
    ]
def test_roundtrip_simple(self):
    """Encoding then decoding returns the original word."""
    for original in ("moño", "foo/bar", "foo.bar"):
        result = to3dirs.to_pagina(to3dirs.to_filename(original))
        self.assertEqual(result, original)
def test_encoding_slash(self):
    """The slash is percent-quoted."""
    self.assertEqual(to3dirs.to_filename("foo/bar"), "foo%2Fbar")
def _to_complete_path(pagina):
    """Build the full relative path (3-dirs layout plus filename) for a page."""
    return "{}/{}".format(to_path(pagina), to_filename(pagina))
def test_encoding_dot(self):
    """The dot is percent-quoted."""
    self.assertEqual(to3dirs.to_filename("foo.bar"), "foo%2Ebar")
def test_encoding_percent(self):
    """The percent sign itself is percent-quoted."""
    self.assertEqual(to3dirs.to_filename("foo%bar"), "foo%25bar")
def test_roundtrip_crazy(self):
    """A mix of all the special chars survives the roundtrip."""
    original = "foo . bar / baz % more"
    result = to3dirs.to_pagina(to3dirs.to_filename(original))
    self.assertEqual(result, original)
def test_encoding_nothing(self):
    """Names without special chars are left untouched."""
    self.assertEqual(to3dirs.to_filename("moño"), "moño")