Example #1
0
                # Assemble one product record and hand it to the shared result
                # queue (drained by the main thread after join()).
                temp=[title,price,price_original,color_count,product_href]
                queue_for_result.put(temp)
                print(temp)
            # Randomized pause (~4s +/- 1s, clamped non-negative) between
            # pages to throttle request rate.
            time.sleep(abs(random.gauss(4,1)))

    def run(self):
        """Thread entry point: announce start and run the scraping loop."""
        print('begin')
        self.get_info()

if __name__=='__main__':
    queue_for_result=Queue(0)
    # Generate the list of URLs to scrape.
    url_list=gen_url()
    # Config: number of worker threads.
    thread_count=10
    # Split the URL list into one chunk per thread.
    url_list=list_split.list_split(url_list,thread_count)

    Get_productInfo_thread=[]
    for item in url_list:
        Get_productInfo_thread.append(Get_productInfo(url_list=item))
    for item in Get_productInfo_thread:
        item.start()
    for item in Get_productInfo_thread:
        item.join()

    # Persist the results.
    # Drain the shared queue into a list; safe here because every worker
    # has been joined, so no more items can arrive.
    result=[]
    for i in range(queue_for_result.qsize()):
        result.append(queue_for_result.get())
    title=['product_title','price','price_original','color_count','product_href']
Example #2
0
            # NOTE(review): bare except swallows every error for this URL
            # (including KeyboardInterrupt) and merely logs a skip message —
            # consider narrowing to the expected exception types.
            except:
                print('*'*20+u'程序运行失误,已跳过'+'*'*20+url[1])

    def run(self):
        """Thread entry point: delegate to the scraping routine."""
        self.get_info()

if __name__=='__main__':
    file_name='d:/spider/aliexpress/aliexpress_contact_href_2015-08-06.csv'
    queue_for_result=Queue(0)

    # Load the previously scraped contact-page hrefs, then drop the header row.
    with open(file_name,'r') as csv_file:
        href_temp=[row for row in csv.reader(csv_file)]
    href_temp=href_temp[1:]

    # Split the hrefs across two worker threads; workers push their rows
    # into the module-level queue_for_result.
    href_chunks=list_split.list_split(href_temp,2)
    workers=[Get_contact_detail(chunk) for chunk in href_chunks]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    # Drain the queue (safe: all workers joined) and persist to CSV.
    data=[]
    while not queue_for_result.empty():
        data.append(queue_for_result.get())
    title=['shop_name','contact_page_href','contact_name','department','tel_phone','mobile_phone','fax_num','address','province','city']
    writer=My_Csv.Write_Csv(path='D:/spider/aliexpress',name='aliexpress_contact_detail',title=title,result=data)
    writer.add_title_data()
    print('*'*20+u'程序运行完毕,请检查数据'+'*'*20)
Example #3
0
    def run(self):
        """Thread entry point: delegate to the description scraper."""
        self.get_desc()

if __name__=='__main__':
    queue_href=Queue(0)
    result_brand=[]
    # Build the brand-list URLs: one page per letter a-z, plus the 0-9 page.
    temp=''.join(map(chr,range(97,123)))
    url_list=['http://brand.vip.com/list-'+item+'/' for item in temp]
    url_list.append('http://brand.vip.com/list-0-9/')
    # Fetch the brand-name data with a small thread pool.
    print('*'*20+u'正在抓取品牌列表'+'*'*20)
    thread_count=5
    Get_BrandHref_thread=[]
    url_list=list_split.list_split(url_list,thread_count)
    for url_s in url_list:
        Get_BrandHref_thread.append(Get_BrandHref(url_s))
    for item in Get_BrandHref_thread:
        item.start()
    for item in Get_BrandHref_thread:
        item.join()
    # Drain the queue (safe: all workers joined) and persist to CSV.
    for i in range(queue_href.qsize()):
        result_brand.append(queue_href.get())
    writer=My_Csv.Write_Csv(path='D:/spider/vip',name='vip_allbrand_list',result=result_brand)
    writer.add_title_data(title=['en_brand','ch_brand','href'])
    # Second phase: fetch brand blurbs and favorite counts with a larger pool.
    print('*'*20+u'正在抓取品牌简介'+'*'*20)
    queue_brandtext=Queue(0)
    result_brandText=[]
    thread_count_2=30
Example #4
0
    # NOTE(review): dead commented-out code from the earlier page-download
    # phase — kept untouched here; consider deleting it outright.
    # url_list=url_for_spider()
    # had_get_url_list=os.listdir('d:/spider/aliexpress/src')
    # had_get_url_list=['http://www.aliexpress.com/wholesale?'+item.split('.txt')[0] for item in had_get_url_list]
    # url_list=[item for item in url_list if item not in had_get_url_list]
    #
    # Get_src_thread=[]
    # url_list=list_split.list_split(url_list,25)
    # for item in url_list:
    #     Get_src_thread.append(Get_src(item))
    # for item in Get_src_thread:
    #     item.start()
    # for item in Get_src_thread:
    #     item.join()

    # Parse the downloaded page files in rounds of up to 600 files each,
    # with 30 parser threads per round, persisting after every round so a
    # crash loses at most one round of work.
    original_file_list = os.listdir("d:/spider/aliexpress/src")
    file_list_1 = list_split.list_split(original_file_list, 600)
    i = 0
    for temp in file_list_1:
        i += 1
        print("*" * 15 + "ROUND " + str(i) + "*" * 15)
        file_list = list_split.list_split(temp, 30)
        Get_info_thread = []
        for item in file_list:
            Get_info_thread.append(Get_info(item))
        for item in Get_info_thread:
            item.start()
        for item in Get_info_thread:
            item.join()
        # Persist this round's results before starting the next one.
        save_data()

        print("+" * 15 + u"文件" + str(i) + u"已保存!" + "*" * 15)
# Today's proxy may not exist yet; call this once up front so the worker
# threads do not all try to fetch a proxy at the same time.
def use_proxy():
    """Ensure a proxy is available and return its port."""
    return my_proxy.is_proxy_exists()

if __name__=='__main__':
    queue_for_result=Queue(0)

    # Collect the contact-page hrefs from the local dump directory.
    get_contactHref_d=Get_contactHref(r'D:\spider\alibaba_inter').get_contactHref()

    # Split the href list into one chunk per planned worker thread.
    # Config: number of threads to start.
    thread_count=10
    get_contactHref_split=list_split.list_split(get_contactHref_d,thread_count)

    # Fetch the contact info.
    # Config: whether to use a proxy (1 = yes).
    is_proxy=1
    if is_proxy:
        # Warm up the proxy once before the threads start (see use_proxy).
        use_proxy()

    Get_contactInfo_thread=[]
    for item in get_contactHref_split:
        Get_contactInfo_thread.append(Get_contactInfo(contact_href_list=item,proxy=is_proxy))
    for item in Get_contactInfo_thread:
        item.start()
    for item in Get_contactInfo_thread:
        item.join()
Example #6
0
                    frame.find_element_by_css_selector(css_source).text]
            # print frame.find_element_by_css_selector(css_source).text
        # NOTE(review): bare except — any lookup failure silently yields ''.
        except:result=''
        return result

    def run(self):
        """Thread entry point: delegate to the scraping routine."""
        self.get_info()

if __name__=='__main__':
    queue_result=Queue(0)

    # Read company names from column 5 of the first worksheet, skipping the
    # header row and the '-' placeholder cells.
    workbook=xlrd.open_workbook(u'D:/spider/唯品会(VIPShop)商机线索输出_0731.xlsx')
    sheet=workbook.sheet_by_index(0)
    companies=[cell for cell in sheet.col_values(5)[1:] if cell != u'-']

    # Split the work across two threads; print each chunk's size for visibility.
    chunks=list_split.list_split(companies,2)
    for chunk in chunks:
        print(len(chunk))

    # Workers push their row lists into the module-level queue_result.
    Get_phoneaddr_thread=[Get_phoneaddr(chunk) for chunk in chunks]
    for worker in Get_phoneaddr_thread:
        worker.start()
    for worker in Get_phoneaddr_thread:
        worker.join()

    # Drain the queue (safe: all workers joined) and flatten the per-thread
    # row lists into one flat list of rows.
    result=[]
    while not queue_result.empty():
        result.append(queue_result.get())
    result=[row for rows in result for row in rows]

    writer=My_Csv.Write_Csv(path='D:/spider/vip',name='vip_compang_phoneaddr',result=result)
    writer.add_title_data(title=[])
    print('*'*20+u'程序运行完毕,请检查数据'+'*'*20)