def main():
    parser = NlParser()
    # specify a list of URLs to pull data from
    with open("data/html_nl.txt", "r") as text_file:
        url_list = text_file.read().splitlines()
    for url in url_list:
        # get all translations from all specified URLs
        soup = get_html_tree_from_url(url)
        for tup in parser.generate_translation_tuples(soup):
            print(','.join(tup))
def main():
    parser = FrParser()
    # specify a list of URLs to pull data from
    with open("data/html_fr.txt", "r") as text_file:
        url_list = text_file.read().splitlines()
    for url in url_list:
        # get all translations from all specified URLs
        soup = get_html_tree_from_url(url)
        for tup in parser.generate_translation_tuples(soup):
            print(','.join(tup))
def test_html(filename, edition=None):
    with open(filename) as file:
        url_list = file.read().splitlines()
    if edition is None:
        edition = infer_edition_from_url(url_list[0])
    parser = get_parser(edition)
    print(','.join(headers))
    for url in url_list:
        soup = get_html_tree_from_url(url)
        parser.parse_page(soup)
def test_html(filename, edition=None):
    with open(filename) as file:
        url_list = file.read().splitlines()
    if edition is None:
        edition = infer_edition_from_url(url_list[0])
    parser = get_parser(edition)
    print(','.join(headers))
    for url in url_list:
        soup = get_html_tree_from_url(url)
        for tup in parser.generate_translation_tuples(soup):
            print(','.join(tup))
def parse_online_html_provided_url_list(filename, edition=None):
    """Parse online HTML for each URL in the provided URL list file."""
    with open(filename) as file:
        url_list = file.read().splitlines()
    if not edition:
        # print("Edition not provided. Trying to figure out the edition...")
        edition = infer_edition_from_url(url_list[0])
        # print("Edition: {}".format(edition))
    parser = get_parser(edition)
    print(','.join(headers))
    for url in url_list:
        soup = get_html_tree_from_url(url)
        for tup in parser.generate_translation_tuples(soup):
            print(','.join(tup))
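# infer_edition_from_url is referenced above but defined elsewhere in the
# project. A minimal sketch, assuming the URLs follow the usual
# "https://<edition>.wiktionary.org/wiki/<title>" pattern; the project's
# actual helper may differ:
import re

def infer_edition_from_url(url):
    """Extract the edition code (e.g. "fr") from a Wiktionary URL."""
    match = re.match(r'https?://([a-z\-]+)\.wiktionary\.org/', url)
    if match is None:
        raise ValueError("Cannot infer edition from url: {}".format(url))
    return match.group(1)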
def parse_online_html_provided_zim(filename, edition=None):
    """Parse online HTML for the URLs extracted from a ZIM file."""
    file = ZimFile(filename=filename)
    if not edition:
        print("Edition not provided. Trying to figure out the edition...")
        import parser.lang_code_conversion as languages
        edition_lang_code = file.metadata()['language'].decode('utf-8')
        edition = languages.get_wikt_code_from_iso639_3(edition_lang_code)
        print("Edition: {}".format(edition))
    parser = get_parser(edition)
    print("Starting to uncompress the zim file to get the url list...")
    from zim.extract import yield_url
    # Article URLs inside the ZIM end in ".html"; strip that suffix to get the page title.
    url_list = ["https://{}.wiktionary.org/wiki/{}".format(edition, url[:-5])
                for url in yield_url(file=file)]
    print("Got {} urls from the zim file".format(len(url_list)))
    print(','.join(headers))
    for url in url_list:
        soup = get_html_tree_from_url(url)
        for tup in parser.generate_translation_tuples(soup):
            print(','.join(tup))
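# A possible command-line entry point tying the two parse_online_html_*
# functions together; this wiring is illustrative, not the project's actual CLI:
import sys

if __name__ == '__main__':
    source = sys.argv[1]
    # Dispatch on the input type: a .zim archive or a plain URL list file.
    if source.endswith('.zim'):
        parse_online_html_provided_zim(source)
    else:
        parse_online_html_provided_url_list(source)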
def main():
    parser = DeParser()
    for url in parser.tested_url:
        soup = get_html_tree_from_url(url)
        for tup in parser.generate_translation_tuples(soup):
            print(','.join(tup))
def main():
    parser = ViParser()
    for url in parser.tested_url:
        soup = get_html_tree_from_url(url)
        for tup in parser.generate_translation_tuples(soup):
            print(','.join(tup))
def test_html():
    for url in tested_url:
        edition = get_edition_from_url(url)
        soup = get_html_tree_from_url(url)
        for tup in parsers[edition](soup):
            print(",".join(tup))
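# get_html_tree_from_url is used by every function above but defined
# elsewhere in the project. A minimal sketch, assuming requests and
# BeautifulSoup4 are available; the real helper may add retries, caching,
# or a custom user agent:
import requests
from bs4 import BeautifulSoup

def get_html_tree_from_url(url):
    """Fetch a page over HTTP and return its parsed HTML tree."""
    response = requests.get(url)
    response.raise_for_status()  # fail loudly on 4xx/5xx responses
    return BeautifulSoup(response.content, 'html.parser')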