        temp=[title,price,price_original,color_count,product_href]
        queue_for_result.put(temp)
        print(temp)
        # Random polite delay between requests (mean 4 s, sd 1 s)
        time.sleep(abs(random.gauss(4,1)))

    def run(self):
        print('begin')
        self.get_info()

if __name__=='__main__':
    queue_for_result=Queue(0)
    # Generate the URLs to crawl
    url_list=gen_url()
    # Configure the number of threads
    thread_count=10
    url_list=list_split.list_split(url_list,thread_count)
    Get_productInfo_thread=[]
    for item in url_list:
        Get_productInfo_thread.append(Get_productInfo(url_list=item))
    for item in Get_productInfo_thread:
        item.start()
    for item in Get_productInfo_thread:
        item.join()
    # Persist the results
    # Extract the results and build the header row
    result=[]
    for i in range(queue_for_result.qsize()):
        result.append(queue_for_result.get())
    title=['product_title','price','price_original','color_count','product_href']
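# Note: draining with range(qsize()) followed by blocking get() can hang if the
# two calls disagree -- qsize() is only approximate once other threads touch the
# queue. A minimal non-blocking drain, sketched against the same queue_for_result
# as above (drain() is a name introduced here for illustration):
try:
    from queue import Empty      # Python 3
except ImportError:
    from Queue import Empty      # Python 2

def drain(q):
    # Pop everything currently queued without ever blocking.
    items = []
    while True:
        try:
            items.append(q.get_nowait())
        except Empty:
            return items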
        except Exception:
            print('*'*20+u'Run error, entry skipped: '+'*'*20+url[1])

    def run(self):
        self.get_info()

if __name__=='__main__':
    file_name='d:/spider/aliexpress/aliexpress_contact_href_2015-08-06.csv'
    href_temp=[]
    queue_for_result=Queue(0)
    with open(file_name,'r') as csv_file:
        reader=csv.reader(csv_file)
        for row in reader:
            href_temp.append(row)
    # Drop the header row
    href_temp=href_temp[1:]
    href_temp_2=list_split.list_split(href_temp,2)
    Get_contact_detail_thread=[]
    for item in href_temp_2:
        Get_contact_detail_thread.append(Get_contact_detail(item))
    for item in Get_contact_detail_thread:
        item.start()
    for item in Get_contact_detail_thread:
        item.join()
    data=[]
    for i in range(queue_for_result.qsize()):
        data.append(queue_for_result.get())
    title=['shop_name','contact_page_href','contact_name','department','tel_phone','mobile_phone','fax_num','address','province','city']
    writer=My_Csv.Write_Csv(path='D:/spider/aliexpress',name='aliexpress_contact_detail',title=title,result=data)
    writer.add_title_data()
    print('*'*20+u'Run finished, please check the data'+'*'*20)
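# My_Csv.Write_Csv is a project helper whose source is not shown here. A minimal
# sketch of what add_title_data() appears to do, assuming the constructor
# arguments used at the call sites (path, name, title, result) and a
# <path>/<name>.csv naming convention -- both are guesses, not the real module:
import csv, os

class Write_Csv(object):
    def __init__(self, path, name, title=None, result=None):
        self.path, self.name = path, name
        self.title, self.result = title, result or []

    def add_title_data(self, title=None):
        # Write one header row, then every result row, to <path>/<name>.csv.
        # (On Python 3, pass newline='' to open() to avoid blank rows.)
        out = os.path.join(self.path, self.name + '.csv')
        with open(out, 'w') as f:
            writer = csv.writer(f)
            writer.writerow(title or self.title or [])
            writer.writerows(self.result)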
    def run(self):
        self.get_desc()

if __name__=='__main__':
    queue_href=Queue(0)
    result_brand=[]
    # Build the brand-list URLs: one page per letter a-z, plus a 0-9 page
    temp=''.join(map(chr,range(97,123)))
    url_list=['http://brand.vip.com/list-'+item+'/' for item in temp]
    url_list.append('http://brand.vip.com/list-0-9/')
    # Fetch the brand-name data
    print('*'*20+u'Fetching the brand list'+'*'*20)
    thread_count=5
    Get_BrandHref_thread=[]
    url_list=list_split.list_split(url_list,thread_count)
    for url_s in url_list:
        Get_BrandHref_thread.append(Get_BrandHref(url_s))
    for item in Get_BrandHref_thread:
        item.start()
    for item in Get_BrandHref_thread:
        item.join()
    for i in range(queue_href.qsize()):
        result_brand.append(queue_href.get())
    writer=My_Csv.Write_Csv(path='D:/spider/vip',name='vip_allbrand_list',result=result_brand)
    writer.add_title_data(title=['en_brand','ch_brand','href'])
    # Fetch the brand descriptions and favourite counts
    print('*'*20+u'Fetching brand descriptions'+'*'*20)
    queue_brandtext=Queue(0)
    result_brandText=[]
    thread_count_2=30
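# list_split.list_split is another project helper not shown here. From its call
# sites (split a URL or file list into thread_count chunks, one chunk per
# thread), it plausibly looks like this minimal sketch; the strided slicing is
# an assumption -- any scheme yielding n near-equal chunks would serve:
def list_split(items, n):
    # Chunk k gets items[k], items[k+n], items[k+2n], ...
    return [items[k::n] for k in range(n)]

# e.g. list_split(list(range(7)), 3) -> [[0, 3, 6], [1, 4], [2, 5]]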
# url_list=url_for_spider()
# had_get_url_list=os.listdir('d:/spider/aliexpress/src')
# had_get_url_list=['http://www.aliexpress.com/wholesale?'+item.split('.txt')[0] for item in had_get_url_list]
# url_list=[item for item in url_list if item not in had_get_url_list]
#
# Get_src_thread=[]
# url_list=list_split.list_split(url_list,25)
# for item in url_list:
#     Get_src_thread.append(Get_src(item))
# for item in Get_src_thread:
#     item.start()
# for item in Get_src_thread:
#     item.join()

# Parse the saved page sources batch by batch, 30 threads per batch
original_file_list = os.listdir("d:/spider/aliexpress/src")
file_list_1 = list_split.list_split(original_file_list, 600)
i = 0
for temp in file_list_1:
    i += 1
    print("*" * 15 + "ROUND " + str(i) + "*" * 15)
    file_list = list_split.list_split(temp, 30)
    Get_info_thread = []
    for item in file_list:
        Get_info_thread.append(Get_info(item))
    for item in Get_info_thread:
        item.start()
    for item in Get_info_thread:
        item.join()
    save_data()
    print("*" * 15 + u"Batch " + str(i) + u" saved!" + "*" * 15)
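# The split-list / one-thread-per-chunk / start-join pattern above recurs in
# every script in this repo. A minimal sketch of the same fan-out using the
# standard library's concurrent.futures (Python 3 only), shown purely as an
# alternative; parse_file is a placeholder for the per-file work that
# Get_info.get_info actually does:
from concurrent.futures import ThreadPoolExecutor

def parse_file(filename):
    # Placeholder: parse one saved page source.
    return filename

def run_batch(file_names, workers=30):
    # map() distributes items across the pool and preserves input order;
    # the with-block joins all workers on exit, like the start/join loops above.
    with ThreadPoolExecutor(max_workers=workers) as pool:
        return list(pool.map(parse_file, file_names))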
# The day's proxy may not exist yet, so call this once up front; otherwise the
# worker threads would all try to fetch a proxy at the same time
def use_proxy():
    proxy_port=my_proxy.is_proxy_exists()
    return proxy_port

if __name__=='__main__':
    queue_for_result=Queue(0)
    # Collect the contact-page links
    get_contactHref_d=Get_contactHref(r'D:\spider\alibaba_inter').get_contactHref()
    # Split the link list so each thread gets its own share
    # Config: planned number of threads
    thread_count=10
    get_contactHref_split=list_split.list_split(get_contactHref_d,thread_count)
    # Fetch the contact info
    # Config: whether to use a proxy
    is_proxy=1
    if is_proxy:
        use_proxy()
    Get_contactInfo_thread=[]
    for item in get_contactHref_split:
        Get_contactInfo_thread.append(Get_contactInfo(contact_href_list=item,proxy=is_proxy))
    for item in Get_contactInfo_thread:
        item.start()
    for item in Get_contactInfo_thread:
        item.join()
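# If the pre-flight use_proxy() call were ever skipped, threads could still race
# to fetch the proxy. A minimal thread-safe alternative, sketched on the
# assumption that my_proxy.is_proxy_exists() both checks for and fetches the
# day's proxy (its real contract is not shown in this repo):
import threading

_proxy_lock = threading.Lock()
_proxy_port = None

def get_proxy_port():
    # Double-checked init: only the first caller pays for the fetch,
    # every later caller returns the cached port immediately.
    global _proxy_port
    if _proxy_port is None:
        with _proxy_lock:
            if _proxy_port is None:
                _proxy_port = my_proxy.is_proxy_exists()
    return _proxy_port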
                    frame.find_element_by_css_selector(css_source).text]
            # print(frame.find_element_by_css_selector(css_source).text)
        except Exception:
            result=''
        return result

    def run(self):
        self.get_info()

if __name__=='__main__':
    queue_result=Queue(0)
    Get_phoneaddr_thread=[]
    temp_com=xlrd.open_workbook(u'D:/spider/唯品会(VIPShop)商机线索输出_0731.xlsx')
    table=temp_com.sheet_by_index(0)
    # Column 5 holds the company names; drop the header row and '-' placeholders
    arr_com=table.col_values(5)
    arr_com=[item for item in arr_com[1:] if item != u'-']
    temp_arr_com=list_split.list_split(arr_com,2)
    for arr in temp_arr_com:
        print(len(arr))
    for item in temp_arr_com:
        Get_phoneaddr_thread.append(Get_phoneaddr(item))
    for item in Get_phoneaddr_thread:
        item.start()
    for item in Get_phoneaddr_thread:
        item.join()
    result=[]
    for i in range(queue_result.qsize()):
        result.append(queue_result.get())
    # Flatten the per-thread lists into one row list
    result=[item for i in result for item in i]
    writer=My_Csv.Write_Csv(path='D:/spider/vip',name='vip_compang_phoneaddr',result=result)
    writer.add_title_data(title=[])
    print('*'*20+u'Run finished, please check the data'+'*'*20)
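# find_element_by_css_selector is the Selenium 3 API and was removed in
# Selenium 4. A minimal sketch of the equivalent lookup under Selenium 4,
# assuming frame is a WebDriver or WebElement as in the code above
# (text_by_css is a helper name introduced here for illustration):
from selenium.webdriver.common.by import By

def text_by_css(frame, css_source):
    # Mirror the original behaviour: return '' instead of raising
    # when the selector matches nothing.
    try:
        return frame.find_element(By.CSS_SELECTOR, css_source).text
    except Exception:
        return ''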