def __next__(self):
    """Return the next DataURLs to download, skipping pages already on disk."""
    while True:
        if self.test_limit is not None:
            self.test_limit -= 1
            if self.test_limit <= 0:
                raise StopIteration

        line = self.fh.readline().strip()
        if line == "":
            raise StopIteration
        if line == "page_title":
            continue

        basename = line.strip()
        three_dirs, filename = to3dirs.get_path_file(basename)
        path = os.path.join(self.dest_dir, three_dirs)
        disk_name = os.path.join(path, filename)

        if not os.path.exists(disk_name.encode('utf-8')):
            if not os.path.exists(path.encode('utf-8')):
                os.makedirs(path.encode('utf-8'))
            quoted_url = urllib.parse.quote(basename)
            # Skip wikipedia automatic redirect
            wiki = WIKI % dict(lang=self.language)
            url = wiki + "w/index.php?title=%s&redirect=no" % (quoted_url,)
            data = DataURLs(url=url, temp_dir=self.temp_dir, disk_name=disk_name,
                            basename=basename)
            return data

def scrap_portal(language, lang_config):
    """Get the portal index and scrap it."""
    # get the portal url, get out if we don't have it
    portal_index_title = lang_config.get('portal_index')
    if portal_index_title is None:
        logger.info("Not scraping portals, url not configured.")
        return

    logger.info("Scraping portal main page %s", portal_index_title)
    with NamedTemporaryFile('wt', encoding='utf8', dir='/tmp/', prefix='cdpedia-') as tf:
        tf.write(portal_index_title + '\n')
        tf.flush()
        _call_scraper(language, tf.name)

    dir3, quoted_page = to3dirs.get_path_file(portal_index_title)
    portal_filepath = os.path.join(location.articles, dir3, quoted_page)

    logger.info("Parsing portal page")
    with open(portal_filepath, 'rt', encoding='utf8') as fh:
        soup = bs4.BeautifulSoup(fh, features="html.parser")

    cnt = 0
    _path = os.path.join(location.langdir, PORTAL_PAGES)
    with open(_path, 'wt', encoding='utf8') as fh:
        for page in preprocessors.extract_pages(soup):
            cnt += 1
            fh.write(page + '\n')

    logger.info("Scraping portal sub pages (total=%d)", cnt)
    _call_scraper(language, _path)
    logger.info("Portal scraping done")

def to_filename(title):
    """Compute the filename from the title."""
    tt = title.replace(" ", "_")
    if len(tt) >= 2:
        tt = tt[0].upper() + tt[1:]
    elif len(tt) == 1:
        tt = tt[0].upper()
    else:
        raise ValueError("Title must have at least one character")
    dir3, arch = to3dirs.get_path_file(tt)
    expected = os.path.join(dir3, arch)
    return expected

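# Illustrative only: a quick check of to_filename(), relying on the behavior of
# to3dirs.get_path_file() shown in the tests at the end of this section (the
# directory is built from the first characters of the normalized title). The input
# and expected result below are made-up examples, not taken from the project.
assert to_filename("python") == os.path.join("P/y/t", "Python")
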
def top_pages(self):
    """Return all pages with their scores, ordered by score (descending)."""
    all_pages = []
    colsep = config.SEPARADOR_COLUMNAS
    with PATH_TEMP.joinpath("page_scores_final.txt").open("rt", encoding='utf8') as fh:
        for line in fh:
            page, score = line.strip().split(colsep)
            dir3, fname = to3dirs.get_path_file(page)
            all_pages.append((dir3, fname, int(score)))

    # order by score, descending
    all_pages.sort(key=operator.itemgetter(2), reverse=True)
    return all_pages

def calculate(self):
    """Calculate the HTMLs with more score and store both lists."""
    self._calculated = True

    # read the preprocessed file
    all_pages = []
    colsep = config.SEPARADOR_COLUMNAS
    with codecs.open(LOG_SCORES_FINAL, 'rt', encoding='utf8') as fh:
        for line in fh:
            page, score = line.strip().split(colsep)
            dir3, fname = to3dirs.get_path_file(page)
            all_pages.append((dir3, fname, page, int(score)))

    # order by score, and get top N
    all_pages.sort(key=operator.itemgetter(3), reverse=True)
    page_limit = config.imageconf['page_limit']
    self._top_pages = all_pages[:page_limit]

    # get all items after N that still have the same score as the last one
    last_score = self._top_pages[-1][3]
    for more_info in all_pages[page_limit:]:
        if more_info[3] == last_score:
            self._top_pages.append(more_info)

    separator = config.SEPARADOR_COLUMNAS
    if os.path.exists(config.PAG_ELEGIDAS):
        # previous run for this info! same content?
        with codecs.open(config.PAG_ELEGIDAS, "rt", "utf8") as fh:
            old_stuff = []
            for linea in fh:
                dir3, fname, page, score = linea.strip().split(separator)
                old_stuff.append((dir3, fname, page, int(score)))
            if sorted(old_stuff) == sorted(self._top_pages):
                self._same_info_through_runs = True

    if not self._same_info_through_runs:
        # previous info not there, or different: write to disk
        with codecs.open(config.PAG_ELEGIDAS, "wt", "utf8") as fh:
            for dir3, fname, page, score in self._top_pages:
                info = (dir3, fname, page, str(score))
                fh.write(separator.join(info) + "\n")

def get_data_urls(listado_nombres, dest_dir, language, test_limit=None):
    """Get a list of DataURLs to download (verifying which are already downloaded)."""
    logger.info('Generating DataURLs')
    wiki_base = WIKI.format(language=language)

    temp_dir = dest_dir + ".tmp"
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)

    data_urls_list = []
    previous_count = 0
    with open(listado_nombres, 'rt', encoding='utf-8') as fh:
        for line in fh:
            if test_limit is not None:
                test_limit -= 1
                if test_limit <= 0:
                    break

            line = line.strip()
            if line == "page_title":
                continue

            basename = line.strip()
            three_dirs, filename = to3dirs.get_path_file(basename)
            path = os.path.join(dest_dir, three_dirs)
            disk_name = os.path.join(path, filename)

            if os.path.exists(disk_name):
                previous_count += 1
                continue

            if not os.path.exists(path):
                os.makedirs(path)
            quoted_url = urllib.parse.quote(basename)
            # Skip wikipedia automatic redirect
            url = "{}/w/index.php?title={}&redirect=no".format(wiki_base, quoted_url)
            data = DataURLs(url=url, temp_dir=temp_dir, disk_name=disk_name,
                            basename=basename)
            data_urls_list.append(data)

    return (previous_count, data_urls_list)

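# Hypothetical driver for get_data_urls(), just to show the shape of the returned
# tuple: a count of pages already downloaded plus the list of pending DataURLs.
# The file name 'page_titles.txt', the 'articles' directory and the 'es' language
# code are made-up values for illustration.
already_there, pending = get_data_urls(
    'page_titles.txt', dest_dir='articles', language='es', test_limit=100)
print("skipped (already on disk):", already_there)
print("to download:", len(pending))
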
def test_encoding(self):
    r = to3dirs.get_path_file("2.3")
    self.assertEqual(r, ("2/%/2", "2%2E3"))

def test_very_short(self):
    r = to3dirs.get_path_file("m")
    self.assertEqual(r, ("m/_/_", "m"))

def test_short(self):
    r = to3dirs.get_path_file("mo")
    self.assertEqual(r, ("m/o/_", "mo"))

def test_simple(self):
    r = to3dirs.get_path_file("moño")
    self.assertEqual(r, ("m/o/ñ", "moño"))

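# A minimal sketch of a get_path_file()-style helper, inferred only from the
# expectations in the tests above: punctuation is escaped as %XX, non-ASCII letters
# such as "ñ" are kept verbatim, the directory is built from the first three
# characters of the escaped name, and short names are padded with "_". This is NOT
# the real to3dirs implementation; the exact escaping rules (e.g. for "_") are an
# assumption made for illustration.
import string


def get_path_file_sketch(page_name):
    """Return (three-level directory, escaped filename) for a page name."""
    quoted = "".join(
        "%%%02X" % ord(c) if c in string.punctuation else c
        for c in page_name
    )
    padded = (quoted + "___")[:3]   # pad short names with "_"
    dir3 = "/".join(padded)
    return dir3, quoted


# Consistent with the tests above:
assert get_path_file_sketch("2.3") == ("2/%/2", "2%2E3")
assert get_path_file_sketch("moño") == ("m/o/ñ", "moño")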