Example #1
def gen_whole_item_id():
    proxy_list = read_proxy_file()
    proxy_count = len(proxy_list)
    page_url_proxy_count = 0
    if not proxy_list:
        xici_proxy.gen_proxy()
        proxy_list = read_proxy_file()
    ip_port = proxy_list.pop()
    whole_page_url_filename = os.path.join(PATH, 'sys', 'whole_page_url')
    timeout_page_url_filename = os.path.join(PATH, 'log', 'timeout_page_url')
    item_id_filename = os.path.join(PATH, 'sys', 'book_item_ids_2')
    page_url_crawled_filename = os.path.join(PATH, 'log', 'crawled_page_url')
    with codecs.open(whole_page_url_filename, encoding='utf-8') as whole_page_url_f,\
    codecs.open(item_id_filename, mode='wb', encoding='utf-8') as item_id_wf,\
    codecs.open(timeout_page_url_filename, mode='wb', encoding='utf-8') as timeout_url_wf,\
    codecs.open(page_url_crawled_filename, mode='wb', encoding='utf-8') as crawled_url_wf:
        for page_url in [
                item.strip() for item in whole_page_url_f.readlines()
        ]:
            page_url_proxy_count += 1
            try:
                if page_url_proxy_count > 2000:
                    if not proxy_list:
                        re_read_proxy_list = read_proxy_file()
                        proxy_list = xici_proxy.get_valid_proxy(
                            re_read_proxy_list)
                        if not proxy_list:
                            xici_proxy.gen_proxy()
                            timeout_url_wf.write(
                                'fetched new proxies from xici network!\n')
                            proxy_list = read_proxy_file()
                        ip_port = proxy_list.pop()
                        page_url_proxy_count = 0
                http_handler = urllib2.ProxyHandler(
                    {'http': 'http://%s' % ip_port})
                opener = urllib2.build_opener(http_handler)
                html = opener.open(page_url, timeout=15)
            except urllib2.HTTPError, e:
                if e.getcode() == 403:
                    timeout_url_wf.write('403 error: request forbidden!\n')
                    if not proxy_list:
                        re_read_proxy_list = read_proxy_file()
                        proxy_list = xici_proxy.get_valid_proxy(
                            re_read_proxy_list)
                        if not proxy_list:
                            xici_proxy.gen_proxy()
                            timeout_url_wf.write(
                                'fetched new proxies from xici network!\n')
                            proxy_list = read_proxy_file()
                        ip_port = proxy_list.pop()
                    http_handler = urllib2.ProxyHandler(
                        {'http': 'http://%s' % ip_port})
                    opener = urllib2.build_opener(http_handler)
                    html = opener.open(page_url,
                                       timeout=15).read().decode('gbk')
                else:
                    continue
            except:
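
The proxy-refresh logic above (re-validate the proxies already on disk, crawl the xici site for fresh ones only as a last resort, then pop a new ip:port) is repeated inline several times in this example and in the ones below. A minimal sketch of that logic as a single helper, assuming the same read_proxy_file() and xici_proxy helpers the examples rely on; the name next_proxy is not from the original code:

def next_proxy(proxy_list):
    """Return an 'ip:port' string, refilling proxy_list in place when it runs dry."""
    if not proxy_list:
        # First re-validate whatever proxies are already on disk.
        proxy_list[:] = xici_proxy.get_valid_proxy(read_proxy_file())
        if not proxy_list:
            # Only crawl the xici site for brand-new proxies as a last resort.
            xici_proxy.gen_proxy()
            proxy_list[:] = read_proxy_file()
    return proxy_list.pop()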
Example #2
def gen_whole_item_id():
    timeout_timestamp = time.strftime('%m%d_timeout_page_url')
    crawled_timestamp = time.strftime('%m%d_crawled_page_url')
    proxy_list = read_proxy_file()
    page_url_proxy_count = 0
    if not proxy_list:
        xici_proxy.gen_proxy()
        proxy_list = read_proxy_file()
    ip_port = proxy_list.pop()
    handle_no_div_pattern = re.compile('no_(item|plist)_div:')
    whole_page_url_filename = os.path.join(PATH, 'log', '0518_timeout_page_url')
    timeout_page_url_filename = os.path.join(PATH, 'log', timeout_timestamp)
    item_id_filename = os.path.join(PATH, 'sys', 'book_item_ids')
    page_url_crawled_filename = os.path.join(PATH, 'log', crawled_timestamp)
    with codecs.open(whole_page_url_filename, encoding='utf-8') as whole_page_url_f,\
    codecs.open(item_id_filename, mode='a', encoding='utf-8') as item_id_wf,\
    codecs.open(timeout_page_url_filename, mode='wb', encoding='utf-8') as timeout_url_wf,\
    codecs.open(page_url_crawled_filename, mode='wb', encoding='utf-8') as crawled_url_wf:
        for page_url in [
                handle_no_div_pattern.sub('', item.strip())
                for item in whole_page_url_f.readlines()
                if item.startswith('no_')
        ]:
            page_url_proxy_count += 1
            try:
                if page_url_proxy_count > 2000:
                    if not proxy_list:
                        re_read_proxy_list = read_proxy_file()
                        proxy_list = xici_proxy.get_valid_proxy(re_read_proxy_list)
                        if not proxy_list:
                            xici_proxy.gen_proxy()
                            timeout_url_wf.write('fetched new proxies from xici network!\n')
                            proxy_list = read_proxy_file()
                        ip_port = proxy_list.pop()
                        page_url_proxy_count = 0
                http_handler = urllib2.ProxyHandler({'http': 'http://%s' % ip_port})
                opener = urllib2.build_opener(http_handler)
                html = opener.open(page_url, timeout=15)
            except urllib2.HTTPError, e:
                if e.getcode() == 403:
                    timeout_url_wf.write('403 error: request forbidden!\n')
                    if not proxy_list:
                        re_read_proxy_list = read_proxy_file()
                        proxy_list = xici_proxy.get_valid_proxy(re_read_proxy_list)
                        if not proxy_list:
                            xici_proxy.gen_proxy()
                            timeout_url_wf.write('fetched new proxies from xici network!\n')
                            proxy_list = read_proxy_file()
                        ip_port = proxy_list.pop()
                    http_handler = urllib2.ProxyHandler({'http': 'http://%s' % ip_port})
                    opener = urllib2.build_opener(http_handler)
                    html = opener.open(page_url, timeout=15).read().decode('gbk')
                else:
                    continue
            except:
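
Example #2 is a retry pass: it reads an earlier timeout log, keeps only the lines flagged no_item_div: or no_plist_div:, and strips that prefix to recover the raw page URL before requesting it again. A small, self-contained illustration of that filter; the log lines below are made up for the demonstration:

import re

handle_no_div_pattern = re.compile('no_(item|plist)_div:')

# Hypothetical lines in the 0518_timeout_page_url log format assumed above.
lines = [
    'no_item_div:http://example.com/book/list-1.html\n',
    'fetched new proxies from xici network!\n',
    'no_plist_div:http://example.com/book/list-2.html\n',
]
page_urls = [handle_no_div_pattern.sub('', line.strip())
             for line in lines if line.startswith('no_')]
# page_urls == ['http://example.com/book/list-1.html',
#               'http://example.com/book/list-2.html']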
Example #3
def test_proxy_pop():
    proxy_list = read_proxy_file()
    page_url_proxy_count = 0
    if not proxy_list:
        xici_proxy.gen_proxy()
        proxy_list = read_proxy_file()
    ip_port = proxy_list.pop()
    whole_page_url_filename = os.path.join(PATH, 'sys', 'whole_page_url')
    with codecs.open(whole_page_url_filename, encoding='utf-8') as f:
        for page_url in f.readlines():
            page_url_proxy_count += 1
            try:
                if page_url_proxy_count > 2:
                    if not proxy_list:
                        re_read_proxy_list = read_proxy_file()
                        proxy_list = xici_proxy.get_valid_proxy(
                            re_read_proxy_list)
                        if not proxy_list:
                            xici_proxy.gen_proxy()
                            proxy_list = read_proxy_file()
                    ip_port = proxy_list.pop()
                    page_url_proxy_count = 0
                print page_url_proxy_count
                print ip_port
                time.sleep(0.5)
                # http_handler = urllib2.ProxyHandler({'http': 'http://%s' % ip_port})
                # opener = urllib2.build_opener(http_handler)
                # html = opener.open(page_url, timeout=15)
            except urllib2.HTTPError, e:
                if e.getcode() == 403:
                    if not proxy_list:
                        re_read_proxy_list = read_proxy_file()
                        proxy_list = xici_proxy.get_valid_proxy(
                            re_read_proxy_list)
                        if not proxy_list:
                            xici_proxy.gen_proxy()
                            proxy_list = read_proxy_file()
                        ip_port = proxy_list.pop()
                    http_handler = urllib2.ProxyHandler(
                        {'http': 'http://%s' % ip_port})
                    opener = urllib2.build_opener(http_handler)
                    html = opener.open(page_url,
                                       timeout=15).read().decode('gbk')
                else:
                    continue
            except:
Example #4
def test_proxy_pop():
    proxy_list = read_proxy_file()
    page_url_proxy_count = 0
    if not proxy_list:
        xici_proxy.gen_proxy()
        proxy_list = read_proxy_file()
    ip_port = proxy_list.pop()
    whole_page_url_filename = os.path.join(PATH, 'sys', 'whole_page_url')
    with codecs.open(whole_page_url_filename, encoding='utf-8') as f:
        for page_url in f.readlines():
            page_url_proxy_count += 1
            try:
                if page_url_proxy_count > 2:
                    if not proxy_list:
                        re_read_proxy_list = read_proxy_file()
                        proxy_list = xici_proxy.get_valid_proxy(re_read_proxy_list)
                        if not proxy_list:
                            xici_proxy.gen_proxy()
                            proxy_list = read_proxy_file()
                    ip_port = proxy_list.pop()
                    page_url_proxy_count = 0
                print page_url_proxy_count
                print ip_port
                time.sleep(0.5)
                # http_handler = urllib2.ProxyHandler({'http': 'http://%s' % ip_port})
                # opener = urllib2.build_opener(http_handler)
                # html = opener.open(page_url, timeout=15)
            except urllib2.HTTPError, e:
                if e.getcode() == 403:
                    if not proxy_list:
                        re_read_proxy_list = read_proxy_file()
                        proxy_list = xici_proxy.get_valid_proxy(re_read_proxy_list)
                        if not proxy_list:
                            xici_proxy.gen_proxy()
                            proxy_list = read_proxy_file()
                        ip_port = proxy_list.pop()
                    http_handler = urllib2.ProxyHandler({'http': 'http://%s' % ip_port})
                    opener = urllib2.build_opener(http_handler)
                    html = opener.open(page_url, timeout=15).read().decode('gbk')
                else:
                    continue
            except:
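
All four snippets are Python 2 code (urllib2, print statements, the "except urllib2.HTTPError, e" syntax). For orientation only, here is a rough Python 3 sketch of the same proxied fetch with the 403 check, built on the standard urllib.request module; fetch_via_proxy is a name chosen here, and the gbk decoding and 15-second timeout are carried over from the examples:

import urllib.error
import urllib.request

def fetch_via_proxy(page_url, ip_port, timeout=15):
    """Fetch page_url through an HTTP proxy, returning decoded HTML or None."""
    handler = urllib.request.ProxyHandler({'http': 'http://%s' % ip_port})
    opener = urllib.request.build_opener(handler)
    try:
        return opener.open(page_url, timeout=timeout).read().decode('gbk')
    except urllib.error.HTTPError as e:
        if e.code == 403:
            # Caller should rotate to a new proxy and retry, as the examples
            # above do on a 403 response.
            raise
        # Other HTTP errors are skipped, mirroring the examples' bare continue.
        return None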