Example #1
import re


def get_haha365_urls():
    """Collect the joke-page URLs from the haha365 listing page."""
    list_url = r'http://www.haha365.com/xd_joke/index.htm'
    base_url = r'http://www.haha365.com'
    getter = HtmlSourceGetter()
    # Fetch the listing page decoded to the site's expected encoding, coding[1].
    html_data = getter.get_uniform_html_source(list_url, coding[1])
    # Each entry is linked through a relative href in the listing markup.
    pattern = r'<img src="/Pic/02.gif"><a Class="" target="_blank"  href="(.*?)" >'
    uris = re.findall(pattern, html_data['data'])
    # Turn the relative hrefs into absolute URLs.
    result_urls = [base_url + uri for uri in uris]
    return (result_urls, coding[1])
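
HtmlSourceGetter, coding, and the other module-level names are defined elsewhere in the project and are not shown in these snippets. Judging only from how the examples use it, get_uniform_html_source(url, coding) fetches a page and returns a dict with at least a 'request_url' and a 'data' key, where 'data' holds the page source decoded to one known encoding. A minimal stand-in under those assumptions (and assuming the requests library) could look like this sketch; it is illustrative, not the project's actual implementation:

# Hypothetical stand-in for the project's HtmlSourceGetter, inferred from how the examples use it.
import requests


class HtmlSourceGetter(object):
    def get_uniform_html_source(self, url, coding):
        """Fetch url and return {'request_url': ..., 'data': ...} with data decoded via coding."""
        try:
            response = requests.get(url, timeout=10)
            data = response.content.decode(coding, errors='replace')
        except requests.RequestException:
            data = ''  # Callers treat an empty 'data' value as a failed fetch.
        return {'request_url': url, 'data': data}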
Example #2
def get_raw_html():
    """Step 1: fetch each page with a uniform encoding and save it to the local output directory."""
    getter = HtmlSourceGetter()
    (urls, coding) = get_url_list()
    # Fetch every page, keyed by its URL.
    html_data = {}
    for url in urls:
        html_data[url] = getter.get_uniform_html_source(url, coding)
    # Write the raw pages to disk, one numbered .ymg_html file per page.
    create_dir(out_source_dir_)
    i = 0
    for url, raw_data in html_data.items():
        if not raw_data['data']:
            # Skip pages that failed to download.
            continue
        raw_path = out_source_dir_ + create_file_name(today, site_name, i + 1, '.ymg_html')
        with open(raw_path, 'w') as raw_file:
            # Each file stores the request URL, a blank line, then the page source with '\r\n' removed.
            raw_file.write(raw_data['request_url'])
            raw_file.write(u'\n\n')
            raw_file.write(raw_data['data'].replace('\r\n', ''))
        i += 1
    return i
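
Each .ymg_html file written by get_raw_html() holds the request URL, a blank line, and then the cleaned page source. A small helper that splits a saved file back into those two parts might look like the sketch below; the function name and the idea of reading the file back are illustrative and not part of the original code:

def read_raw_html(path):
    """Split a saved .ymg_html file into (request_url, page_source).

    get_raw_html() writes the request URL, a blank line, then the page body,
    so partitioning on the first blank line recovers both parts.
    """
    with open(path) as raw_file:
        content = raw_file.read()
    request_url, _, page_source = content.partition('\n\n')
    return request_url, page_source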