Example #1
    def __next__(self):
        while True:
            if self.test_limit is not None:
                self.test_limit -= 1
                if self.test_limit <= 0:
                    raise StopIteration
            line = self.fh.readline().strip()
            if line == "":
                raise StopIteration
            if line == "page_title":
                continue
            basename = line.strip()
            three_dirs, filename = to3dirs.get_path_file(basename)
            path = os.path.join(self.dest_dir, three_dirs)
            disk_name = os.path.join(path, filename)
            if not os.path.exists(disk_name.encode('utf-8')):
                if not os.path.exists(path.encode('utf-8')):
                    os.makedirs(path.encode('utf-8'))

                quoted_url = urllib.parse.quote(basename)
                # redirect=no avoids following Wikipedia's automatic redirect
                wiki = WIKI % dict(lang=self.language)
                url = wiki + "w/index.php?title=%s&redirect=no" % (
                    quoted_url, )
                data = DataURLs(url=url,
                                temp_dir=self.temp_dir,
                                disk_name=disk_name,
                                basename=basename)
                return data
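
For context, here is a minimal, self-contained sketch of the iterator protocol this __next__ relies on (the class, data and names below are illustrative, not part of the original code):

import io

class LimitedLines:
    """Yield stripped lines from a file-like object, honoring an optional test_limit."""

    def __init__(self, fh, test_limit=None):
        self.fh = fh
        self.test_limit = test_limit

    def __iter__(self):
        return self

    def __next__(self):
        while True:
            # same limit / header-skipping structure as the example above
            if self.test_limit is not None:
                self.test_limit -= 1
                if self.test_limit <= 0:
                    raise StopIteration
            line = self.fh.readline().strip()
            if line == "":
                raise StopIteration
            if line == "page_title":
                continue
            return line

titles = LimitedLines(io.StringIO("page_title\nPortada\nArgentina\n"))
print(list(titles))  # ['Portada', 'Argentina']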
Example #2
def scrap_portal(language, lang_config):
    """Get the portal index and scrap it."""
    # get the portal url, get out if don't have it
    portal_index_title = lang_config.get('portal_index')
    if portal_index_title is None:
        logger.info("Not scraping portals, url not configured.")
        return

    logger.info("Scraping portal main page %s", portal_index_title)
    with NamedTemporaryFile('wt',
                            encoding='utf8',
                            dir='/tmp/',
                            prefix='cdpedia-') as tf:
        tf.write(portal_index_title + '\n')
        tf.flush()
        _call_scraper(language, tf.name)

    dir3, quoted_page = to3dirs.get_path_file(portal_index_title)
    portal_filepath = os.path.join(location.articles, dir3, quoted_page)

    logger.info("Parsing portal page")
    with open(portal_filepath, 'rt', encoding='utf8') as fh:
        soup = bs4.BeautifulSoup(fh, features="html.parser")

    cnt = 0
    _path = os.path.join(location.langdir, PORTAL_PAGES)
    with open(_path, 'wt', encoding='utf8') as fh:
        for page in preprocessors.extract_pages(soup):
            cnt += 1
            fh.write(page + '\n')

    logger.info("Scraping portal sub pages (total=%d)", cnt)
    _call_scraper(language, _path)

    logger.info("Portal scraping done")
Example #3
def to_filename(title):
    """Compute the filename from the title."""
    tt = title.replace(" ", "_")
    if len(tt) >= 2:
        tt = tt[0].upper() + tt[1:]
    elif len(tt) == 1:
        tt = tt[0].upper()
    else:
        raise ValueError("Title must have at least one character")

    dir3, arch = to3dirs.get_path_file(tt)
    expected = os.path.join(dir3, arch)
    return expected
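
Judging from the tests at the end of this page, a hypothetical call would look like the following (illustrative only; it needs the real to3dirs module to run, and the exact sharding depends on get_path_file):

# 'moño viejo' -> 'Moño_viejo' -> directories taken from its first three characters
print(to_filename("moño viejo"))  # expected something like 'M/o/ñ/Moño_viejo'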
Example #4
    def top_pages(self):
        """Calculate the HTMLs with more score and store both lists."""
        all_pages = []
        colsep = config.SEPARADOR_COLUMNAS
        with PATH_TEMP.joinpath("page_scores_final.txt").open(
                "rt", encoding='utf8') as fh:
            for line in fh:
                page, score = line.strip().split(colsep)
                dir3, fname = to3dirs.get_path_file(page)
                all_pages.append((dir3, fname, int(score)))

        # order by score, descending
        all_pages.sort(key=operator.itemgetter(2), reverse=True)
        return all_pages
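
A standalone illustration of sorting with operator.itemgetter(2), i.e. by the score stored as the third tuple element (the data is made up):

import operator

pages = [("a/b/c", "abc", 10), ("d/e/f", "def", 30), ("g/h/i", "ghi", 20)]
pages.sort(key=operator.itemgetter(2), reverse=True)
print(pages)  # [('d/e/f', 'def', 30), ('g/h/i', 'ghi', 20), ('a/b/c', 'abc', 10)]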
Example #5
    def calculate(self):
        """Calculate the HTMLs with more score and store both lists."""
        self._calculated = True

        # read the preprocessed file
        all_pages = []
        colsep = config.SEPARADOR_COLUMNAS
        with codecs.open(LOG_SCORES_FINAL, 'rt', encoding='utf8') as fh:
            for line in fh:
                page, score = line.strip().split(colsep)
                dir3, fname = to3dirs.get_path_file(page)
                all_pages.append((dir3, fname, page, int(score)))

        # order by score, and get top N
        all_pages.sort(key=operator.itemgetter(3), reverse=True)
        page_limit = config.imageconf['page_limit']
        self._top_pages = all_pages[:page_limit]

        # get all items after N that still have the same score as the last one
        last_score = self._top_pages[-1][3]
        for more_info in all_pages[page_limit:]:
            if more_info[3] == last_score:
                self._top_pages.append(more_info)

        separator = config.SEPARADOR_COLUMNAS
        if os.path.exists(config.PAG_ELEGIDAS):
            # previous run for this info! same content?
            with codecs.open(config.PAG_ELEGIDAS, "rt", "utf8") as fh:
                old_stuff = []
                for linea in fh:
                    dir3, arch, score = linea.strip().split(separator)
                    old_stuff.append((dir3, arch, int(score)))
                # compare against the same (dir3, page, score) shape that is written below
                current = [(dir3, page, score) for dir3, _fname, page, score in self._top_pages]
                if sorted(old_stuff) == sorted(current):
                    self._same_info_through_runs = True

        if not self._same_info_through_runs:
            # previous info not there, or different: write to disk
            with codecs.open(config.PAG_ELEGIDAS, "wt", "utf8") as fh:
                for dir3, fname, page, score in self._top_pages:
                    info = (dir3, page, str(score))
                    fh.write(separator.join(info) + "\n")
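
The block that extends _top_pages keeps every page whose score ties with the last one inside the limit; a small self-contained illustration of that step (simplified two-element tuples):

all_pages = [("a", 9), ("b", 7), ("c", 5), ("d", 5), ("e", 3)]
page_limit = 3
top_pages = all_pages[:page_limit]
last_score = top_pages[-1][1]
for more_info in all_pages[page_limit:]:
    if more_info[1] == last_score:
        top_pages.append(more_info)
print(top_pages)  # [('a', 9), ('b', 7), ('c', 5), ('d', 5)]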
Example #6
def get_data_urls(listado_nombres, dest_dir, language, test_limit=None):
    """Get a list of DataURLs to download (verifying which are already downloaded)."""
    logger.info('Generating DataURLs')
    wiki_base = WIKI.format(language=language)

    temp_dir = dest_dir + ".tmp"
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)

    data_urls_list = []
    previous_count = 0

    with open(listado_nombres, 'rt', encoding='utf-8') as fh:
        for line in fh:
            if test_limit is not None:
                test_limit -= 1
                if test_limit <= 0:
                    break
            line = line.strip()
            if line == "page_title":
                continue
            basename = line.strip()
            three_dirs, filename = to3dirs.get_path_file(basename)
            path = os.path.join(dest_dir, three_dirs)
            disk_name = os.path.join(path, filename)
            if os.path.exists(disk_name):
                previous_count += 1
                continue

            if not os.path.exists(path):
                os.makedirs(path)
            quoted_url = urllib.parse.quote(basename)
            # redirect=no avoids following Wikipedia's automatic redirect
            url = "{}/w/index.php?title={}&redirect=no".format(wiki_base, quoted_url)
            data = DataURLs(url=url, temp_dir=temp_dir, disk_name=disk_name, basename=basename)
            data_urls_list.append(data)

        return (previous_count, data_urls_list)
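
A hypothetical invocation (the file name, destination directory and limit are made up for illustration):

# 'titles.txt' is assumed to hold one page title per line, preceded by a 'page_title' header.
previous, pending = get_data_urls('titles.txt', 'articles', 'es', test_limit=100)
print('already downloaded:', previous, '- still to fetch:', len(pending))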
Example #7
    def test_encoding(self):
        r = to3dirs.get_path_file("2.3")
        self.assertEqual(r, ("2/%/2", "2%2E3"))
Example #8
    def test_very_short(self):
        r = to3dirs.get_path_file("m")
        self.assertEqual(r, ("m/_/_", "m"))
Example #9
    def test_short(self):
        r = to3dirs.get_path_file("mo")
        self.assertEqual(r, ("m/o/_", "mo"))
Example #10
    def test_simple(self):
        r = to3dirs.get_path_file("moño")
        self.assertEqual(r, ("m/o/ñ", "moño"))
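
Taken together, these tests suggest that get_path_file percent-quotes troublesome characters (note "2.3" becoming "2%2E3") and builds three directory levels from the first characters of the quoted name, padding with "_" when it is too short. A rough approximation that satisfies the four cases above (an assumption for illustration, not the real implementation):

def get_path_file_sketch(title):
    # Assumed quoting: only '.' is escaped here; the real to3dirs surely handles more.
    quoted = title.replace('.', '%2E')
    dirs = (list(quoted[:3]) + ['_', '_', '_'])[:3]
    return "/".join(dirs), quoted

assert get_path_file_sketch("moño") == ("m/o/ñ", "moño")
assert get_path_file_sketch("mo") == ("m/o/_", "mo")
assert get_path_file_sketch("m") == ("m/_/_", "m")
assert get_path_file_sketch("2.3") == ("2/%/2", "2%2E3")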