import random

# header, proxy_lists, url_list, phNum_list and the get_* helpers are defined
# elsewhere in this project.

def get_all_links(channel):
    for num in range(1, 151):
        ip = random.choice(proxy_lists)
        # The mobile-number channel needs its own parser.
        if channel != "http://bj.ganji.com/shoujihaoma/":
            get_item_link(header, ip, channel, num)
        else:
            get_phone_links(header, ip, num)
    print("All item links saved successfully!")

for url in url_list.find("item_link"):
    ip = random.choice(proxy_lists)
    get_item_info(header, ip, url)

for url in phNum_list.find("phone_link"):
    ip = random.choice(proxy_lists)
    get_phone_info(header, ip, url)
def urls_huifu():
    # Resume an interrupted crawl: re-crawl only the URLs whose details
    # have not yet been written to item_info.
    db_urls = [item['url'] for item in url_list.find()]
    index_urls = [item['url'] for item in item_info.find()]
    x = set(db_urls)
    y = set(index_urls)
    rest_of_urls = x - y
    for url in rest_of_urls:
        is_zhuanzhuan = 'http://zhuanzhuan' in url.split('.')[0]
        is_oldxiangqingye = 'http://sz' in url.split('.')[0]  # old-style detail page
        if is_zhuanzhuan:
            get_zhuan_info(url)
        elif is_oldxiangqingye:
            print(url)
            get_item_info(url)
        else:
            pass
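A quick check of the substring tests above: url.split('.')[0] keeps everything before the first dot, i.e. the scheme plus the subdomain, which is what distinguishes zhuanzhuan listings from the old-style detail pages. A minimal sketch, with an illustrative URL rather than one taken from the database:

url = 'http://zhuanzhuan.58.com/detail/123456789z.shtml'  # illustrative sample URL
print(url.split('.')[0])                         # -> 'http://zhuanzhuan'
print('http://zhuanzhuan' in url.split('.')[0])  # -> True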
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: Jan
@software: PyCharm Community Edition
@time: 2016/2/15 21:21
"""
import time

from page_parsing import url_list, item_info

# Query the two collections' record counts every 5 seconds.
while True:
    url_counts = url_list.find().count()
    info_counts = item_info.find().count()
    now_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print(str(url_counts) + ' ' + str(info_counts) + ' ' + now_time)
    time.sleep(5)
import time

from page_parsing import url_list

# Monitor: print the number of stored URLs every 5 seconds.
while True:
    print(url_list.find().count())
    time.sleep(5)
from multiprocessing import Pool

from channel_extracing import channel_list
from page_parsing import get_item_info_from, url_list, item_info, get_links_from

# Resume support (implemented via deduplication): subtract the URLs whose
# details are already stored from the full set of collected URLs.
db_urls = [item['url'] for item in url_list.find()]
index_urls = [item['url'] for item in item_info.find()]
x = set(db_urls)
y = set(index_urls)
rest_of_urls = x - y  # URLs still to crawl (not used by this stage)

def get_all_links_from(channel):
    for i in range(1, 100):
        get_links_from(channel, i)

if __name__ == '__main__':
    pool = Pool(processes=6)
    # pool = Pool()
    pool.map(get_all_links_from, channel_list.split())
    pool.close()
    pool.join()
from multiprocessing import Pool

from channel_extract import channel_list
from page_parsing import get_links_from, get_item_info, url_list

# Crawl the listing links for every category of second-hand goods.
def get_all_link_from(channel):
    for page in range(1, 101):
        get_links_from(channel, page)

if __name__ == '__main__':
    pool = Pool()
    # Stage 1 (run once): collect listing links per channel.
    # pool.map(get_all_link_from, channel_list.split())
    # Stage 2: crawl the detail page of every stored link.
    pool.map(get_item_info, [urls['url'] for urls in url_list.find()])
import time

from page_parsing import url_list, item_info

while True:
    print('url_list:', url_list.find().count())
    time.sleep(5)
    print('item_info:', item_info.find().count())
    time.sleep(5)
from page_parsing import get_item_info_from, url_list, item_info, get_links_from

# ================================== URL deduplication ==================================
# Design:
# 1. Use two collections: the first stores only the crawled URLs (url_list);
#    the second stores the item details for each URL (item_info).
# 2. While crawling, every document written to the second collection gets an
#    extra field (key), 'index_url', holding the detail page's own URL.
# 3. If the crawl is interrupted, the URLs present in the detail collection
#    form a subset of the URL set in the first collection.
# 4. Subtracting the two sets yields the URLs that still need to be crawled.

db_urls = [item['url'] for item in url_list.find()]      # every URL queued for crawling
index_urls = [item['url'] for item in item_info.find()]  # every URL already in the detail collection
x = set(db_urls)       # convert to sets
y = set(index_urls)
rest_of_urls = x - y   # set difference: what remains to crawl
# ========================================================================================
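A minimal self-contained sketch of the same resume pattern, assuming a local MongoDB instance; the database name test58 is hypothetical (the project wires its collections up in page_parsing), and both collections store a 'url' field as described above:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
db = client['test58']        # hypothetical database name
url_list = db['url_list']    # queued listing URLs
item_info = db['item_info']  # scraped detail records

# Set difference: queued URLs minus already-scraped URLs.
db_urls = {item['url'] for item in url_list.find()}
index_urls = {item['url'] for item in item_info.find()}
rest_of_urls = db_urls - index_urls
print('%d of %d URLs still to crawl' % (len(rest_of_urls), len(db_urls)))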
from multiprocessing import Pool

from channel_extract import channel_list
from page_parsing import get_links_from, url_list, get_item_info, item_info

def get_all_links_from(channel):
    for i in range(1, 101):
        info = get_links_from(channel, i)
        if info == 'none':  # this channel has no further pages
            break

if __name__ == '__main__':
    # get_all_links_from('http://bj.58.com/bijiben/')
    all_channels = channel_list.split()
    pool = Pool()
    # pool.map(get_all_links_from, all_channels)
    print('url_list.count is: %s' % url_list.count())  # 88280
    all = set([item['url'] for item in url_list.find()])
    len1 = len(all)
    print('set url_list count is: %s' % len1)
    done = set([item['url'] for item in item_info.find()])  # building the set directly saves a list.append(data) step
    len2 = len(done)
    print('set item_info count is: %s' % len2)
    set_undone = all - done
    len3 = len(set_undone)
    print('still need to insert count is: %s' % len3)
    pool.map(get_item_info, set_undone)
# coding: utf8
import time

from page_parsing import url_list

while True:  # loop forever
    print(url_list.find().count())
    time.sleep(5)
'''
This file just prints counters for me to watch:
every 5 seconds it queries the url_list collection and shows how many
records it holds. url_list stores the scraped item links.
'''
import time

from page_parsing import url_list

while True:
    print('Scraped [58.com] item links:', end=' ')
    print((url_list.find()).count(), end=' ')
    print('rows,' + ' # read from the DB every 5 seconds')
    time.sleep(5)
# _*_ encoding:utf-8 _*_
__author__ = 'lizhe'
__time__ = '2018/04/21 10:32'
import re
import time

from page_parsing import url_list

# Print every stored document whose students.comments field matches the pattern.
for u in url_list.find({'students.comments': re.compile('http://cn.58.com/yishu')}):
    print(u)
# print(url_list.find().count())
# Monitoring script that reports the record count.
import time

from page_parsing import url_list

while True:
    print(url_list.find().count())  # count() returns the number of matching documents
    time.sleep(5)
from multiprocessing import Pool

from chanel_extract import chanel
from page_parsing import get_info, get_links_from, url_list

def get_all_links_from(chanel):
    for page_num in range(1, 201):
        get_links_from(chanel, page_num)

if __name__ == '__main__':
    pool = Pool()
    # Stage 1 (run once): pool.map(get_all_links_from, chanel.split())
    pool.map(get_info, [item['url'] for item in url_list.find()])
from page_parsing import url_list, get_item_info

for item in url_list.find():
    get_item_info(item['url'])
from page_parsing import url_list

def get_all_link():
    # Lazily yield every stored URL, one document at a time.
    for link in url_list.find():
        url = link['url']
        yield url
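One way to consume this generator (a sketch, assuming get_item_info is the detail-page scraper exposed by page_parsing, as in the neighbouring scripts; the chunksize value is arbitrary):

from multiprocessing import Pool

from page_parsing import get_item_info

if __name__ == '__main__':
    pool = Pool()
    # imap_unordered drains the generator lazily instead of first
    # materialising every URL in a list.
    for _ in pool.imap_unordered(get_item_info, get_all_link(), chunksize=20):
        pass
    pool.close()
    pool.join()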
import time

from page_parsing import url_list

# while True:
#     print(url_list.find().count())
#     time.sleep(5)

print(url_list.find())  # note: this prints the Cursor object itself, not the documents
from multiprocessing import Pool

from channel_extract import channel_list
from page_parsing import get_links_from, get_item_info
from page_parsing import url_list

def get_all_links_from(channel):
    for num in range(1, 101):
        get_links_from(channel, num)

if __name__ == '__main__':
    pool = Pool()
    # pool.map(get_all_links_from, channel_list.split())
    # pool.map(get_item_info, a)
    pool.map(get_item_info, [item['url'] for item in url_list.find()])
from multiprocessing import Pool

from channel_extarct import channel_list
from page_parsing import url_list
from page_parsing import get_url_link
from page_parsing import get_item_info
import io
import sys

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')  # switch stdout's default encoding to UTF-8

def get_all_links(channel):
    for num in range(1, 10):
        try:
            get_url_link(channel, num, who_sells=0)
            get_url_link(channel, num, who_sells=1)
        except Exception:
            pass

if __name__ == '__main__':
    pool = Pool()
    # pool.map(get_all_links, channel_list.split())
    try:
        for item_url in url_list.find():
            print(item_url['url'])
            get_item_info(item_url['url'])
    except Exception:
        pass
# _*_ encoding:utf-8 _*_
__author__ = 'lizhe'
__time__ = '2018/04/21 10:27'
from multiprocessing import Pool

import pymongo

from channel_extract import ChannelList
from page_parsing import get_links_from, get_list_info, url_list

client = pymongo.MongoClient("localhost", 27017)
ceshi = client.ceshi
none_url_list = ceshi.none_url_list  # channels whose very first page is already empty

def get_all_link_from(channel):
    for num in range(1, 101):
        if get_links_from(channel, num) == "meiyou":  # "meiyou": no listings on this page
            if num == 1:
                none_url_list.insert_one({"channel": channel})
            break

if __name__ == "__main__":
    pool = Pool()
    pool.map(get_all_link_from, ChannelList.split())
    urllist = []
    for url in url_list.find():
        urllist.append(url["url"])
    pool.map(get_list_info, urllist)
# __author__ = 'xjlin'
# -*- coding: utf-8 -*-
import time

from page_parsing import url_list
from page_parsing import item_info

while True:
    print(url_list.find().count())
    print(item_info.find().count())
    time.sleep(5)
from multiprocessing import Pool

from channel_extract import all_link
from page_parsing import get_link_from, get_item_info_from, url_list

if __name__ == '__main__':
    pool = Pool()
    # pool.map(get_link_from, all_link)  # stage 1: collect the listing links
    pool.map(get_item_info_from, [i['url'] for i in url_list.find()])
    pool.close()
    pool.join()
from multiprocessing import Pool

from channel_extract import channel_list
from page_parsing import get_links_form
from page_parsing import get_item_info
from page_parsing import url_list
from page_parsing import item_info

def get_all_links_from(channel):
    for num in range(1, 101):
        get_links_form(channel, num)

def get_all_item(item_url):
    url = item_url['url']
    crb = item_url['crb']
    get_item_info(url, crb)

if __name__ == '__main__':
    # pool = Pool()
    pool = Pool(processes=40)
    pool.map(get_all_item, url_list.find())
    # for url in item_info.find():
    #     print(url)
import time

from page_parsing import url_list
from page_parsing import item_info

# Report the record counts of url_list and item_info.
while True:
    print('The number of URLs:', url_list.find().count())
    print('The number of item records:', item_info.find().count())
    time.sleep(10)
import time

from page_parsing import url_list, item_info

while True:
    print('message:', item_info.find().count())
    print('URL:', url_list.find().count())
    time.sleep(5)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from multiprocessing import Pool

from page_parsing import get_item_info_from, url_list, item_info, get_links_from
from channel_extracing import channel_list

# Resume support: the set of stored URLs minus the set of already-scraped ones.
db_urls = [item['url'] for item in url_list.find()]
index_urls = [item['url'] for item in item_info.find()]
x = set(db_urls)
y = set(index_urls)
rest_of_urls = x - y

def get_all_links_from(channel):
    for i in range(1, 100):
        get_links_from(channel, i)

if __name__ == '__main__':
    pool = Pool(processes=6)
    pool.map(get_all_links_from, channel_list)
    pool.close()
    pool.join()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from multiprocessing import Pool

from page_parsing import get_item_info_from, url_list, item_info, get_links_from
from channel_extracing import channel_list

db_urls = [item["url"] for item in url_list.find()]
index_urls = [item["url"] for item in item_info.find()]
x = set(db_urls)
y = set(index_urls)
rest_of_urls = x - y

def get_all_links_from(channel):
    for i in range(1, 100):
        get_links_from(channel, i)

if __name__ == "__main__":
    pool = Pool(processes=6)
    # pool.map(get_all_links_from, channel_list)  # stage 1: collect all item links (run once; no resume support here)
    pool.map(get_item_info_from, rest_of_urls)  # stage 2: crawl the item detail pages
    pool.close()
    pool.join()