def parse_online_html_provided_zim(filename, edition=None):
    """Parse online HTML using the URLs extracted from a ZIM file."""
    file = ZimFile(filename=filename)
    if not edition:
        print("Edition not provided. Trying to figure out the edition...")
        import parser.lang_code_conversion as languages
        edition_lang_code = file.metadata()['language'].decode('utf-8')
        edition = languages.get_wikt_code_from_iso639_3(edition_lang_code)

    print("Edition: {}".format(edition))
    parser = get_parser(edition)

    print("Start to uncompress zim file to get the url list...")
    from zim.extract import yield_url
    url_list = [
        "https://{}.wiktionary.org/wiki/{}".format(edition, url[:-5])
        for url in yield_url(file=file)
    ]
    print("Got {} urls from the zim file".format(len(url_list)))

    print(','.join(headers))
    for url in url_list:
        soup = get_html_tree_from_url(url)
        for tup in parser.generate_translation_tuples(soup):
            print(','.join(tup))
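All of these snippets call a get_html_tree_from_url helper defined elsewhere. A minimal sketch of what it likely does, assuming requests and BeautifulSoup (the variable name soup suggests bs4; both libraries are assumptions here, not confirmed by the source):

import requests
from bs4 import BeautifulSoup

def get_html_tree_from_url(url):
    """Fetch a page over HTTP and return its parsed HTML tree (assumed behavior)."""
    response = requests.get(url)
    response.raise_for_status()  # surface HTTP errors instead of parsing an error page
    return BeautifulSoup(response.text, "html.parser")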
def main():
    parser = NlParser()

    # Read the list of URLs to pull data from; "with" closes the file,
    # and splitlines() avoids the empty trailing entry that split('\n') produces.
    with open("data/html_nl.txt", "r") as text_file:
        url_list = text_file.read().splitlines()
    for url in url_list:    # get all translations from all specified URLs
        soup = get_html_tree_from_url(url)
        for tup in parser.generate_translation_tuples(soup):
            print(','.join(tup))
def main():
    parser = FrParser()

    # Read the list of URLs to pull data from; "with" closes the file,
    # and splitlines() avoids the empty trailing entry that split('\n') produces.
    with open("data/html_fr.txt", "r") as text_file:
        url_list = text_file.read().splitlines()
    for url in url_list:  # get all translations from all specified URLs
        soup = get_html_tree_from_url(url)
        for tup in parser.generate_translation_tuples(soup):
            print(','.join(tup))
def test_html(filename, edition=None):
    with open(filename) as file:
        url_list = file.read().splitlines()

    if edition is None:
        edition = infer_edition_from_url(url_list[0])
    parser = get_parser(edition)

    print(','.join(headers))
    for url in url_list:
        soup = get_html_tree_from_url(url)
        parser.parse_page(soup)
def test_html(filename, edition=None):
    with open(filename) as file:
        url_list = file.read().splitlines()

    if edition is None:
        edition = infer_edition_from_url(url_list[0])
    parser = get_parser(edition)

    print(','.join(headers))
    for url in url_list:
        soup = get_html_tree_from_url(url)
        for tup in parser.generate_translation_tuples(soup):
            print(','.join(tup))
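Both test_html variants above lean on infer_edition_from_url to recover the edition code from the first URL. Given the https://{edition}.wiktionary.org/wiki/{page} scheme used elsewhere in this file, a plausible sketch (hypothetical; the real helper is not shown here):

from urllib.parse import urlparse

def infer_edition_from_url(url):
    """Return the edition code from a Wiktionary URL, e.g.
    "https://fr.wiktionary.org/wiki/chat" -> "fr" (assumed behavior)."""
    hostname = urlparse(url).hostname  # e.g. "fr.wiktionary.org"
    return hostname.split('.')[0]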
def parse_online_html_provided_url_list(filename, edition=None):
    """Parse online HTML using the provided URL list."""
    with open(filename) as file:
        url_list = file.read().splitlines()

    if not edition:
        # print("Edition not provided. Trying to figure out the edition...")
        edition = infer_edition_from_url(url_list[0])

    # print("Edition: {}".format(edition))
    parser = get_parser(edition)

    print(','.join(headers))
    for url in url_list:
        soup = get_html_tree_from_url(url)
        for tup in parser.generate_translation_tuples(soup):
            print(','.join(tup))
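Called with one of the URL-list files used by the main functions above, this prints a CSV header followed by one row per translation tuple. A usage sketch (the data file path is taken from the earlier examples; the edition is inferred from the first URL when omitted):

if __name__ == '__main__':
    parse_online_html_provided_url_list("data/html_fr.txt")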
def main():
    parser = DeParser()
    for url in parser.tested_url:
        soup = get_html_tree_from_url(url)
        for tup in parser.generate_translation_tuples(soup):
            print(','.join(tup))
def main():
    parser = ViParser()
    for url in parser.tested_url:
        soup = get_html_tree_from_url(url)
        for tup in parser.generate_translation_tuples(soup):
            print(','.join(tup))
def test_html():
    for url in tested_url:
        edition = get_edition_from_url(url)
        soup = get_html_tree_from_url(url)
        for tup in parsers[edition](soup):
            print(",".join(tup))