Code example #1
File: parser.py  Project: wongalvis/WiktionaryCrawler
def parse(pages):
	spelings = []

	speling_dirpath = "data/speling/%s/%s/" % (config.wiki_lang, config.start_cat)
	pages_dirpath = "data/pages/%s/%s/" % (config.wiki_lang, config.start_cat)
	counter = 0

	for page in pages:
		counter += 1
		pb.update(counter, len(pages))

		# Cache hit: reuse the spellings parsed on an earlier run.
		if os.path.exists(speling_dirpath + page + ".txt"):
			with open(speling_dirpath + page + ".txt", 'r') as f:
				speling_list = f.read().strip("\n").split("\n")

			spelings.extend(speling_list)
			continue

		# Cache miss: parse the page now and persist the result for next time.
		speling_list = parse_page(page)
		with open(speling_dirpath + page + ".txt", 'w') as f:
			for speling in speling_list:
				f.write(speling + "\n")

		spelings.extend(speling_list)

	# Drop empty entries left over from blank lines in the cached files.
	spelings = [speling for speling in spelings if speling != ""]
	return spelings
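
Both branches follow a read-through cache pattern: look for a per-page text file first, and only call parse_page when it is missing. A minimal sketch of the same pattern factored into a reusable helper (cached_lines is a hypothetical name, not part of the project):

import os

def cached_lines(path, compute):
    # Read-through cache: return the lines stored at `path` if present;
    # otherwise compute them, persist one per line, and return them.
    if os.path.exists(path):
        with open(path, 'r') as f:
            return f.read().strip("\n").split("\n")
    lines = compute()
    with open(path, 'w') as f:
        for line in lines:
            f.write(line + "\n")
    return lines

# e.g. speling_list = cached_lines(speling_dirpath + page + ".txt",
#                                  lambda: parse_page(page))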
Code example #2
def parse(pages):
	spelings = []

	speling_dirpath = "data/speling/%s/" % config.start_cat
	pages_dirpath = "data/pages/%s/" % config.start_cat
	counter = 0

	for page in pages:
		counter += 1
		pb.update(counter, len(pages))

		# Cache hit: reuse the spellings parsed on an earlier run.
		if os.path.exists(speling_dirpath + page + ".txt"):
			with open(speling_dirpath + page + ".txt", 'r') as f:
				speling_list = f.read().strip("\n").split("\n")

			spelings.extend(speling_list)
			continue

		# Cache miss: parse the page and persist the result.
		speling_list = parse_page(page)
		with open(speling_dirpath + page + ".txt", 'w') as f:
			for speling in speling_list:
				f.write(speling + "\n")

		spelings.extend(speling_list)

	# Drop empty entries left over from blank lines in the cached files.
	spelings = [speling for speling in spelings if speling != ""]
	return spelings
Code example #3
def crawl_pages(subcats):
    dirpath = "data/site/%s/%s/" % (config.wiki_lang, config.start_cat)
    pages = []

    counter = 0
    for subcat in subcats:
        counter += 1
        pb.update(counter, len(subcats))

        subcat_dirpath = dirpath + subcat + "/"
        misc.mkdir_p(subcat_dirpath)

        # Fetch each subcategory's page list only once; later runs read the cache.
        filepath = subcat_dirpath + "pages.txt"
        if os.path.exists(filepath):
            subcat_pages = misc.read_file(filepath)
        else:
            subcat_pages = get_subcat_pages(subcat)
            misc.write_file(filepath, subcat_pages)

        pages.extend(subcat_pages)

    # Drop blacklisted titles and titles the language module rejects.
    pages = [
        page for page in pages if not config.page_bl(page) and lang.can(page)
    ]
    pages = OrderedDict.fromkeys(pages).keys()  # deduplicate, preserving order
    return pages
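
OrderedDict.fromkeys deduplicates while keeping each title's first-seen position, which a plain set would not. On Python 3.7+ a regular dict preserves insertion order too, so the same idiom works without the import; in either case .keys() can be wrapped in list() when a real list is needed:

from collections import OrderedDict

pages = ["cat", "dog", "cat", "bird", "dog"]

# Order-preserving deduplication: both keep the first occurrence of each item.
print(list(OrderedDict.fromkeys(pages)))  # ['cat', 'dog', 'bird']
print(list(dict.fromkeys(pages)))         # same result on Python 3.7+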
Code example #4
File: crawler.py  Project: Leeyp/WiktionaryCrawler
def crawl_all_pages(pages):
	dirpath = "data/pages/%s/" % config.start_cat

	counter = 0
	for page in pages:
		counter += 1
		pb.update(counter, len(pages))

		# Download each page's HTML only if it is not already on disk.
		filepath = dirpath + page + ".html"
		if not os.path.exists(filepath):
			htmldoc = get_page(page)

			with open(filepath, 'w') as f:
				f.write(htmldoc)
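
get_page is defined elsewhere in crawler.py and is not shown here. A minimal stand-in using only the standard library might look like the sketch below; the Wiktionary URL scheme is an illustrative assumption, not taken from the project:

import urllib.parse
import urllib.request

def get_page(page):
    # Hypothetical stand-in: fetch an article's rendered HTML.
    # The real crawler builds its URL from its own config.
    url = "https://en.wiktionary.org/wiki/" + urllib.parse.quote(page)
    with urllib.request.urlopen(url) as resp:
        return resp.read().decode("utf-8")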
Code example #5
File: crawler.py  Project: Leeyp/WiktionaryCrawler
def crawl_pages(subcats):
	dirpath = "data/site/%s/" % config.start_cat
	pages = []

	counter = 0
	for subcat in subcats:
		counter += 1
		pb.update(counter, len(subcats))

		subcat_dirpath = dirpath + subcat + "/"
		misc.mkdir_p(subcat_dirpath)

		# Fetch each subcategory's page list only once; later runs read the cache.
		filepath = subcat_dirpath + "pages.txt"
		if os.path.exists(filepath):
			subcat_pages = misc.read_file(filepath)
		else:
			subcat_pages = get_subcat_pages(subcat)
			misc.write_file(filepath, subcat_pages)

		pages.extend(subcat_pages)

	# Keep only titles the language module accepts.
	pages = [page for page in pages if lang.can_page(page)]
	pages = OrderedDict.fromkeys(pages).keys()  # deduplicate, preserving order
	return pages
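
Taken together, the file headers suggest a three-stage pipeline: collect page titles per subcategory, download any HTML not yet on disk, then parse spellings out of the cached pages. A hedged driver sketch, assuming the functions above live in crawler.py and parser.py as the headers indicate and that the subcategory list comes from an earlier crawl step:

import crawler
import parser

subcats = ["English_nouns"]            # illustrative placeholder
pages = crawler.crawl_pages(subcats)   # collect and deduplicate page titles
crawler.crawl_all_pages(pages)         # download HTML for any missing pages
spelings = parser.parse(pages)         # extract spellings via the file cache
print("%d spellings collected" % len(spelings))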