Example #1
0
def main():
    """Fetch an Interfax page, save its filtered text and report long lines.

    The page is downloaded with ``urlOpen.get_html``, passed through
    ``tags_filter`` and written to ``parsed3.html``.  Each entry returned by
    ``get_list_of_lines`` is then printed (index plus its first three fields)
    when its first field reaches 40% of ``max_char`` and its third field does
    not start with "Copyright".  A three-line summary with ``max_char``,
    ``min_char`` and the number of entries is printed last.
    """
    import urlOpen

    url = "http://www.interfax.ru/world/502926"
    raw = urlOpen.get_html(url)
    text = tags_filter(raw)

    with open("parsed3.html", "w", encoding="utf-8") as out:
        out.write(text)

    max_char, min_char, line_list = get_list_of_lines(text)

    for index, entry in enumerate(line_list):
        # Cut-off: 40% of max_char.
        threshold = max_char * 0.4
        # Skip entries below the cut-off and "Copyright" lines.
        if not (entry[0] >= threshold) or entry[2].startswith("Copyright"):
            continue
        print(index, entry[0], entry[1], entry[2])

    summary = ("Max chars in line: {}\n"
               "Min chars in line {}\n"
               "Num of lines {}")
    print(summary.format(max_char, min_char, len(line_list)))
Example #2
0
    # NOTE(review): this is a headerless copy of the main() body -- no
    # enclosing `def` and no `import urlOpen` are visible for it here.
    # Presumably a truncated paste; confirm before relying on it.

    # Download the page and run the result through tags_filter().
    text = urlOpen.get_html("http://www.interfax.ru/world/502926")
    text = tags_filter(text)

    # Save the filtered text as UTF-8.
    with open("parsed3.html", mode='w', encoding='utf-8') as file:
        file.write(text)
    max_char, min_char, line_list = get_list_of_lines(text)

    # Print index and first three fields of every entry whose first field
    # reaches the cut-off and whose third field is not a "Copyright" line.
    for i, c in enumerate(line_list):
        level = max_char * 0.4  # cut-off: 40% of max_char
        if c[0] >= level and not c[2].startswith("Copyright"):
            print(i, c[0], c[1], c[2])

    # Summary: max_char, min_char and the number of entries in line_list.
    print("Max chars in line: {}\n"
          "Min chars in line {}\n"
          "Num of lines {}".format(max_char, min_char, len(line_list)))


if __name__ == "__main__":
    # NOTE: the call to main() is disabled; this ad-hoc run fetches a ria.ru
    # page and dumps two processed versions of it to disk instead.
    import urlOpen

    html_code = urlOpen.get_html("http://ria.ru/world/20160406/1403678547.html")
    # Nothing is written when get_html() returns a falsy value.
    if html_code:
        # parsed2.html receives the output of tags_filter_head_and_script().
        with open("parsed2.html", "w", encoding="utf-8") as filtered_out:
            filtered_html = tags_filter_head_and_script(html_code)
            filtered_out.write(filtered_html)

        text = get_text_from_html(html_code)

        # parsed3.html receives the output of get_text_from_html().
        with open("parsed3.html", "w", encoding="utf-8") as text_out:
            text_out.write(text)