Exemple #1
0
def dump_page(source: str,
              target_folder: Union[Path, str] = "pages",
              wiki_obj: Wikipedia = None,
              lang: str = 'fr'):
    """Fetch a wiki page and pickle it into ``target_folder``.

    The output file is named after the page title, with spaces replaced by
    ``_`` and ``/`` replaced by ``__SLASH__``, plus a ``.pkl`` suffix.

    Args:
        source: Title of the page to request from ``wiki_obj``.
        target_folder: Directory receiving the pickle; created (including
            missing parents) when it does not exist yet.
        wiki_obj: Client used to fetch the page. When ``None``, a new
            ``Wikipedia(lang)`` is created.
        lang: Language code used only when ``wiki_obj`` is not supplied.

    Returns:
        None. When the page does not exist, a message is printed and
        nothing is written.
    """
    if wiki_obj is None:
        wiki_obj = Wikipedia(lang)

    target_folder = Path(target_folder)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    target_folder.mkdir(parents=True, exist_ok=True)

    wikipage = wiki_obj.page(source)
    if not wikipage.exists():
        print(f"page {source} does not exist")
        return

    # The info lookup may report a different title than the one requested
    # (presumably a redirect or normalisation -- confirm against the client);
    # in that case re-fetch the page under the reported title.
    page_info = wiki_obj.info(wikipage)
    if page_info.title != wikipage.title:
        wikipage = wiki_obj.page(page_info.title)

    wiki_title = wikipage.title.replace(' ', '_')
    # "/" would be interpreted as a path separator, so it is escaped.
    target_file = target_folder / (wiki_title.replace("/", "__SLASH__") +
                                   ".pkl")
    # Context manager guarantees the file is flushed and closed.
    with target_file.open('wb') as out_file:
        pkl.dump(wikipage, out_file)
Exemple #2
0
def get_filtered_complete_dic(pkl_with_stats_fn,
                              min_paragraphs=5,
                              min_len_paragraphs=500,
                              max_len_paragraphs=1000,
                              draft=False,
                              homonym=False,
                              years=False,
                              wiki_path=None,
                              clean_duplicates=False):
    """Load pickled per-file statistics and return the filtered subset.

    Steps, in order: drop entries whose value is ``'SectionError'``, apply
    ``filter_dic`` and ``filter_min_paras``, optionally re-key entries by
    the title reported by the wiki (``clean_duplicates``), and, unless
    ``years`` is true, keep only entries accepted by
    ``filter_years_articles``.

    Args:
        pkl_with_stats_fn: Path of the pickle holding the statistics
            mapping (keys are used as file names below).
        min_paragraphs: Forwarded to ``filter_min_paras``.
        min_len_paragraphs: Forwarded to ``filter_dic``.
        max_len_paragraphs: Forwarded to ``filter_dic``.
        draft: Forwarded to ``filter_dic``.
        homonym: Forwarded to ``filter_dic``.
        years: When false, entries are additionally filtered through
            ``filter_years_articles``.
        wiki_path: Directory containing the pickled pages. Required when
            ``clean_duplicates`` is true; when given, it is also joined to
            each file name before calling ``filter_years_articles``.
        clean_duplicates: When true, each page pickle is loaded and the
            entry is re-keyed by the title returned by
            ``Wikipedia('fr').info``; entries mapping to the same title
            collapse into one (the last one wins).

    Returns:
        The filtered mapping, or ``None`` when ``clean_duplicates`` is
        requested without ``wiki_path``.
    """
    # Local import so the block does not depend on the file's import list.
    from os.path import join as join_path

    # NOTE(security): pickle.load can execute arbitrary code; only use this
    # on pickles produced by trusted tooling.
    with open(pkl_with_stats_fn, 'rb') as f:
        stats_uncleaned = pkl.load(f)

    # We filter out the sections errors
    stats = {
        key: value
        for key, value in stats_uncleaned.items()
        if value != 'SectionError'
    }

    filtered_stats = filter_dic(stats,
                                min_len_paragraphs=min_len_paragraphs,
                                draft=draft,
                                homonym=homonym,
                                max_len_paragraphs=max_len_paragraphs)
    filtered_stats = filter_min_paras(filtered_stats, min_paragraphs)

    if clean_duplicates:
        if wiki_path is None:
            print("Error : give a wikipath for duplicates cleaning")
            return None
        deduplicated = {}
        wiki_obj = Wikipedia('fr')
        for filename, page_stats in filtered_stats.items():
            try:
                with open(join_path(wiki_path, filename), 'rb') as f:
                    page = pkl.load(f)
            except FileNotFoundError:
                print("Not found :" + filename)
                continue
            canonical_title = wiki_obj.info(page).title
            # Same title -> file name mapping as dump_page (spaces and
            # slashes escaped) so the new keys still match files on disk.
            canonical_name = (canonical_title.replace(' ', '_').replace(
                '/', '__SLASH__') + '.pkl')
            deduplicated[canonical_name] = page_stats
        filtered_stats = deduplicated

    # We filter the years
    if not years:
        print("Length before year fitering :", len(filtered_stats))
        # join_path works whether or not wiki_path ends with a separator,
        # unlike plain string concatenation.
        filtered_stats = {
            filename: file_stats
            for filename, file_stats in filtered_stats.items()
            if filter_years_articles(
                filename if wiki_path is None else join_path(
                    wiki_path, filename))
        }
    print("Final length : ", len(filtered_stats))
    return filtered_stats