def gen_whole_item_id():
    proxy_list = read_proxy_file()
    proxy_count = len(proxy_list)
    page_url_proxy_count = 0
    if not proxy_list:
        xici_proxy.gen_proxy()
        proxy_list = read_proxy_file()
    ip_port = proxy_list.pop()
    whole_page_url_filename = os.path.join(PATH, 'sys', 'whole_page_url')
    timeout_page_url_filename = os.path.join(PATH, 'log', 'timeout_page_url')
    item_id_filename = os.path.join(PATH, 'sys', 'book_item_ids_2')
    page_url_crawled_filename = os.path.join(PATH, 'log', 'crawled_page_url')
    with codecs.open(whole_page_url_filename, encoding='utf-8') as whole_page_url_f, \
            codecs.open(item_id_filename, mode='wb', encoding='utf-8') as item_id_wf, \
            codecs.open(timeout_page_url_filename, mode='wb', encoding='utf-8') as timeout_url_wf, \
            codecs.open(page_url_crawled_filename, mode='wb', encoding='utf-8') as crawled_url_wf:
        for page_url in [item.strip() for item in whole_page_url_f.readlines()]:
            page_url_proxy_count += 1
            try:
                # Rotate to a fresh proxy after every 2000 requests,
                # refilling the pool from xici when it runs dry.
                if page_url_proxy_count > 2000:
                    if not proxy_list:
                        re_read_proxy_list = read_proxy_file()
                        proxy_list = xici_proxy.get_valid_proxy(re_read_proxy_list)
                        if not proxy_list:
                            xici_proxy.gen_proxy()
                            timeout_url_wf.write('get new proxy in xici network!\n')
                            proxy_list = read_proxy_file()
                    ip_port = proxy_list.pop()
                    page_url_proxy_count = 0
                http_handler = urllib2.ProxyHandler({'http': 'http://%s' % ip_port})
                opener = urllib2.build_opener(http_handler)
                # Decode the GBK-encoded page, matching the retry path below.
                html = opener.open(page_url, timeout=15).read().decode('gbk')
            except urllib2.HTTPError as e:
                if e.getcode() == 403:
                    timeout_url_wf.write('403 error: request forbidden!!!\n')
                    # Switch to a new proxy and retry the same URL once.
                    if not proxy_list:
                        re_read_proxy_list = read_proxy_file()
                        proxy_list = xici_proxy.get_valid_proxy(re_read_proxy_list)
                        if not proxy_list:
                            xici_proxy.gen_proxy()
                            timeout_url_wf.write('get new proxy in xici network!\n')
                            proxy_list = read_proxy_file()
                    ip_port = proxy_list.pop()
                    http_handler = urllib2.ProxyHandler({'http': 'http://%s' % ip_port})
                    opener = urllib2.build_opener(http_handler)
                    html = opener.open(page_url, timeout=15).read().decode('gbk')
                else:
                    continue
            except Exception:
                # Assumed handling: record the URL (timeouts and other errors)
                # so it can be re-crawled later, then move on.
                timeout_url_wf.write('%s\n' % page_url)
                continue
def gen_whole_item_id():
    timeout_timestamp = time.strftime('%m%d_timeout_page_url')
    crawled_timestamp = time.strftime('%m%d_crawled_page_url')
    proxy_list = read_proxy_file()
    page_url_proxy_count = 0
    if not proxy_list:
        xici_proxy.gen_proxy()
        proxy_list = read_proxy_file()
    ip_port = proxy_list.pop()
    handle_no_div_pattern = re.compile('no_(item|plist)_div:')
    whole_page_url_filename = os.path.join(PATH, 'log', '0518_timeout_page_url')
    timeout_page_url_filename = os.path.join(PATH, 'log', timeout_timestamp)
    item_id_filename = os.path.join(PATH, 'sys', 'book_item_ids')
    page_url_crawled_filename = os.path.join(PATH, 'log', crawled_timestamp)
    with codecs.open(whole_page_url_filename, encoding='utf-8') as whole_page_url_f, \
            codecs.open(item_id_filename, mode='a', encoding='utf-8') as item_id_wf, \
            codecs.open(timeout_page_url_filename, mode='wb', encoding='utf-8') as timeout_url_wf, \
            codecs.open(page_url_crawled_filename, mode='wb', encoding='utf-8') as crawled_url_wf:
        # Re-crawl only the URLs that previously failed with a missing
        # item/plist div, stripping the 'no_..._div:' prefix from each line.
        for page_url in [handle_no_div_pattern.sub('', item.strip())
                         for item in whole_page_url_f.readlines()
                         if item.startswith('no_')]:
            page_url_proxy_count += 1
            try:
                # Rotate to a fresh proxy after every 2000 requests.
                if page_url_proxy_count > 2000:
                    if not proxy_list:
                        re_read_proxy_list = read_proxy_file()
                        proxy_list = xici_proxy.get_valid_proxy(re_read_proxy_list)
                        if not proxy_list:
                            xici_proxy.gen_proxy()
                            timeout_url_wf.write('get new proxy in xici network!\n')
                            proxy_list = read_proxy_file()
                    ip_port = proxy_list.pop()
                    page_url_proxy_count = 0
                http_handler = urllib2.ProxyHandler({'http': 'http://%s' % ip_port})
                opener = urllib2.build_opener(http_handler)
                # Decode the GBK-encoded page, matching the retry path below.
                html = opener.open(page_url, timeout=15).read().decode('gbk')
            except urllib2.HTTPError as e:
                if e.getcode() == 403:
                    timeout_url_wf.write('403 error: request forbidden!!!\n')
                    # Switch to a new proxy and retry the same URL once.
                    if not proxy_list:
                        re_read_proxy_list = read_proxy_file()
                        proxy_list = xici_proxy.get_valid_proxy(re_read_proxy_list)
                        if not proxy_list:
                            xici_proxy.gen_proxy()
                            timeout_url_wf.write('get new proxy in xici network!\n')
                            proxy_list = read_proxy_file()
                    ip_port = proxy_list.pop()
                    http_handler = urllib2.ProxyHandler({'http': 'http://%s' % ip_port})
                    opener = urllib2.build_opener(http_handler)
                    html = opener.open(page_url, timeout=15).read().decode('gbk')
                else:
                    continue
            except Exception:
                # Assumed handling: record the URL (timeouts and other errors)
                # so it can be re-crawled later, then move on.
                timeout_url_wf.write('%s\n' % page_url)
                continue
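# The proxy bookkeeping above (refresh the pool via xici_proxy when it runs
# dry, then pop the next ip:port) is repeated in every function in this
# module. The helper below is only a refactoring sketch and is not part of
# the original code; it assumes read_proxy_file and xici_proxy behave exactly
# as they are used above.
def next_proxy(proxy_list, log_wf=None):
    """Return (proxy_list, ip_port), refilling the proxy pool if needed."""
    if not proxy_list:
        proxy_list = xici_proxy.get_valid_proxy(read_proxy_file())
    if not proxy_list:
        xici_proxy.gen_proxy()
        if log_wf is not None:
            log_wf.write('get new proxy in xici network!\n')
        proxy_list = read_proxy_file()
    return proxy_list, proxy_list.pop()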
def test_proxy_pop():
    proxy_list = read_proxy_file()
    page_url_proxy_count = 0
    if not proxy_list:
        xici_proxy.gen_proxy()
        proxy_list = read_proxy_file()
    ip_port = proxy_list.pop()
    whole_page_url_filename = os.path.join(PATH, 'sys', 'whole_page_url')
    with codecs.open(whole_page_url_filename, encoding='utf-8') as f:
        for page_url in f.readlines():
            page_url_proxy_count += 1
            try:
                # Rotate the proxy after every 2 URLs (small threshold for testing).
                if page_url_proxy_count > 2:
                    if not proxy_list:
                        re_read_proxy_list = read_proxy_file()
                        proxy_list = xici_proxy.get_valid_proxy(re_read_proxy_list)
                        if not proxy_list:
                            xici_proxy.gen_proxy()
                            proxy_list = read_proxy_file()
                    ip_port = proxy_list.pop()
                    page_url_proxy_count = 0
                print page_url_proxy_count
                print ip_port
                time.sleep(0.5)
                # http_handler = urllib2.ProxyHandler({'http': 'http://%s' % ip_port})
                # opener = urllib2.build_opener(http_handler)
                # html = opener.open(page_url, timeout=15)
            except urllib2.HTTPError as e:
                if e.getcode() == 403:
                    # Switch to a new proxy and retry the same URL once.
                    if not proxy_list:
                        re_read_proxy_list = read_proxy_file()
                        proxy_list = xici_proxy.get_valid_proxy(re_read_proxy_list)
                        if not proxy_list:
                            xici_proxy.gen_proxy()
                            proxy_list = read_proxy_file()
                    ip_port = proxy_list.pop()
                    http_handler = urllib2.ProxyHandler({'http': 'http://%s' % ip_port})
                    opener = urllib2.build_opener(http_handler)
                    html = opener.open(page_url, timeout=15).read().decode('gbk')
                else:
                    continue
            except Exception:
                # Any other error (e.g. timeout): skip this URL.
                continue
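# A minimal, hypothetical entry point; the original module does not show how
# these functions are invoked, so this is only a sketch: run test_proxy_pop
# first to sanity-check proxy rotation, then gen_whole_item_id for the crawl.
if __name__ == '__main__':
    test_proxy_pop()
    gen_whole_item_id()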