import os
from collections import OrderedDict

# Project-local modules: config (settings), lang (language filters),
# misc (file helpers), pb (progress bar).
import config
import lang
import misc
import pb


def parse(pages):
    """Parse every page, caching the extracted spelings on disk."""
    spelings = []
    speling_dirpath = "data/speling/%s/%s/" % (config.wiki_lang, config.start_cat)
    counter = 0
    for page in pages:
        counter += 1
        pb.update(counter, len(pages))
        filepath = speling_dirpath + page + ".txt"
        # Reuse the cached result if this page was parsed on an earlier run.
        if os.path.exists(filepath):
            with open(filepath, 'r') as f:
                speling_list = f.read().strip("\n").split("\n")
            spelings.extend(speling_list)
            continue
        speling_list = parse_page(page)
        with open(filepath, 'w') as f:
            for speling in speling_list:
                f.write(speling + "\n")
        spelings.extend(speling_list)
    # Drop empty strings left over from empty cache files.
    return [speling for speling in spelings if speling != ""]
def crawl_pages(subcats):
    """Collect the page titles of every subcategory, caching per subcategory."""
    dirpath = "data/site/%s/%s/" % (config.wiki_lang, config.start_cat)
    pages = []
    counter = 0
    for subcat in subcats:
        counter += 1
        pb.update(counter, len(subcats))
        subcat_dirpath = dirpath + subcat + "/"
        misc.mkdir_p(subcat_dirpath)
        filepath = subcat_dirpath + "pages.txt"
        # Reuse the cached page list if this subcategory was crawled before.
        if os.path.exists(filepath):
            subcat_pages = misc.read_file(filepath)
        else:
            subcat_pages = get_subcat_pages(subcat)
            misc.write_file(filepath, subcat_pages)
        pages.extend(subcat_pages)
    # Keep only pages that pass the blacklist and language filters.
    pages = [
        page for page in pages
        if not config.page_bl(page) and lang.can(page)
    ]
    pages = list(OrderedDict.fromkeys(pages))  # unique, order-preserving
    return pages
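# The helpers below are NOT part of this module; they are a minimal sketch of
# the project-local misc and pb utilities that crawl_pages() relies on, with
# signatures inferred from their call sites above. The real implementations
# live in misc.py and pb.py elsewhere in the repo and may differ.
#
#   # misc.py (sketch; needs: import os)
#   def mkdir_p(path):
#       # Create a directory tree; no error if it already exists.
#       os.makedirs(path, exist_ok=True)
#
#   def read_file(filepath):
#       # Return the file's non-empty lines as a list of strings.
#       with open(filepath, 'r') as f:
#           return [line for line in f.read().split("\n") if line != ""]
#
#   def write_file(filepath, lines):
#       # Write one string per line.
#       with open(filepath, 'w') as f:
#           for line in lines:
#               f.write(line + "\n")
#
#   # pb.py (sketch; needs: import sys)
#   def update(counter, total):
#       # Redraw a one-line "counter/total" progress indicator in place.
#       sys.stdout.write("\r%d/%d" % (counter, total))
#       sys.stdout.flush()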
def crawl_all_pages(pages):
    """Download and cache the HTML document of every page."""
    dirpath = "data/pages/%s/%s/" % (config.wiki_lang, config.start_cat)
    misc.mkdir_p(dirpath)
    counter = 0
    for page in pages:
        counter += 1
        pb.update(counter, len(pages))
        filepath = dirpath + page + ".html"
        # Skip pages whose HTML is already cached from an earlier run.
        if not os.path.exists(filepath):
            htmldoc = get_page(page)
            with open(filepath, 'w') as f:
                f.write(htmldoc)
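# Example driver (a sketch, not from the original repo): one plausible way the
# three steps chain together. get_subcats() is hypothetical; the real entry
# point that produces the subcategory list may be named differently.
#
#   if __name__ == "__main__":
#       subcats = get_subcats(config.start_cat)   # hypothetical helper
#       pages = crawl_pages(subcats)              # titles, cached per subcat
#       crawl_all_pages(pages)                    # HTML cached on disk
#       spelings = parse(pages)                   # extracted + cached spelings
#       print("%d spelings" % len(spelings))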