Example #1
def get_all_links(channel):
    # Crawl list pages 1-150 of the channel through a random proxy.
    for num in range(1, 151):
        ip = random.choice(proxy_lists)
        if channel != "http://bj.ganji.com/shoujihaoma/":
            get_item_link(header, ip, channel, num)
        else:
            get_phone_links(header, ip, num)
    print("All item links have been saved successfully!")
    # find() expects a filter dict, not a field name; iterate the documents
    # and read the stored link field from each one.
    for doc in url_list.find():
        ip = random.choice(proxy_lists)
        get_item_info(header, ip, doc["item_link"])
    for doc in phNum_list.find():
        ip = random.choice(proxy_lists)
        get_phone_info(header, ip, doc["phone_link"])
Example #2
def urls_huifu():
    # Resume after an interruption: subtract the URLs already parsed into
    # item_info from all collected URLs, leaving only the unfinished ones.
    db_urls = [item['url'] for item in url_list.find()]
    index_urls = [item['url'] for item in item_info.find()]
    x = set(db_urls)
    y = set(index_urls)
    rest_of_urls = x - y

    for url in rest_of_urls:
        # Dispatch on the host prefix: zhuanzhuan pages and the old
        # detail-page layout are parsed by different functions.
        is_zhuanzhuan = 'http://zhuanzhuan' in url.split('.')[0]
        is_oldxiangqingye = 'http://sz' in url.split('.')[0]
        if is_zhuanzhuan:
            get_zhuan_info(url)
        elif is_oldxiangqingye:
            print(url)
            get_item_info(url)
        else:
            pass
Example #3
#!/usr/bin/env python
# -*- coding:utf-8 -*-

"""
@author: Jan
@software: PyCharm Community Edition
@time: 2016/2/15 21:21
"""

import time
from page_parsing import url_list, item_info

# Query the record counts of the two collections every 5 seconds
while True:
    url_counts = url_list.find().count()
    info_counts = item_info.find().count()
    now_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print(str(url_counts) + '  ' + str(info_counts) + '  ' + str(now_time))
    time.sleep(5)
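A note on the find().count() calls above: Cursor.count() is deprecated and was removed in PyMongo 4. A minimal sketch of the same 5-second monitor using count_documents({}), assuming PyMongo 3.7 or later:

import time
from page_parsing import url_list, item_info

# Same monitor as above, but counting via count_documents({}), the
# replacement for the deprecated Cursor.count() in newer PyMongo releases.
while True:
    url_counts = url_list.count_documents({})
    info_counts = item_info.count_documents({})
    now_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('{}  {}  {}'.format(url_counts, info_counts, now_time))
    time.sleep(5)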
Example #4
import time
from page_parsing import url_list
# Monitor the url_list collection

while True:
    print(url_list.find().count())
    time.sleep(5)
Example #5
from multiprocessing import Pool
from page_parsing import get_item_info_from, url_list, item_info, get_links_from
from channel_extracing import channel_list

# Resume from breakpoint (implemented via set deduplication)
db_urls = [item['url'] for item in url_list.find()]
index_urls = [item['url'] for item in item_info.find()]
x = set(db_urls)
y = set(index_urls)
rest_of_urls = x - y

# Crawl list pages 1-99 of a channel; this definition must stay active
# because pool.map below calls it.
def get_all_links_from(channel):
    for i in range(1, 100):
        get_links_from(channel, i)

if __name__ == '__main__':
    pool = Pool(processes=6)
    # pool = Pool()
    pool.map(get_all_links_from, channel_list.split())
    pool.close()
    pool.join()
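The rest_of_urls set computed above is never used in this example. A minimal sketch of how it could drive the resumed detail crawl instead, following the pattern of Example #31 below and assuming get_item_info_from takes a single URL:

if __name__ == '__main__':
    pool = Pool(processes=6)
    # Only URLs whose details are not yet in item_info are crawled, so an
    # interrupted run picks up where it stopped.
    pool.map(get_item_info_from, rest_of_urls)
    pool.close()
    pool.join()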
Example #6
from multiprocessing import Pool
from channel_extract import channel_list
from page_parsing import get_links_from, get_item_info, url_list


# Crawl the listing links for every category of second-hand goods
def get_all_link_from(channel):
    for page in range(1, 101):
        get_links_from(channel, page)


if __name__ == '__main__':
    pool = Pool()
    # pool.map(get_all_link_from, channel_list.split())
    pool.map(get_item_info, [urls['url'] for urls in url_list.find()])
Example #7
import time
from page_parsing import url_list, item_info


while True:
    print('url_list:', url_list.find().count())
    time.sleep(5)
    print('item_info:', item_info.find().count())
    time.sleep(5)
Example #8
from page_parsing import get_item_info_from, url_list, item_info, get_links_from


# ================================================= < < URL deduplication > > =====================================================

# Design approach:
# 1. Use two collections: the first stores only the crawled URLs (url_list);
#    the second stores the item details that each URL points to (item_info).
# 2. While writing to the second collection during the crawl, add an extra field (key),
#    'index_url', holding the link that the detail record corresponds to.
# 3. If the crawl is interrupted, the URL values in the detail collection should be
#    a subset of the URL set held in the first collection.
# 4. Subtracting the two sets yields the URLs that still need to be crawled.


db_urls = [item['url'] for item in url_list.find()]      # list comprehension: every link that should be crawled
index_urls = [item['url'] for item in item_info.find()]  # every URL already present in the detail collection
x = set(db_urls)                                         # convert to sets
y = set(index_urls)
rest_of_urls = x - y                                     # set difference

# ======================================================================================================================
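Step 2 of the design notes says each detail record should carry the link it was parsed from. A minimal sketch of such a write, using a hypothetical save_item_detail helper; the field is stored as 'url' so the set difference above can read it back via item['url']:

def save_item_detail(url, data):
    # 'url' (the 'index_url' of the design notes) mirrors the link kept in
    # url_list, so item_info and url_list can be compared by set difference.
    data['url'] = url
    item_info.insert_one(data)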




Example #9
File: main.py Project: qchs/58
from multiprocessing import Pool
from channel_extract import channel_list
from page_parsing import get_links_from, url_list, get_item_info, item_info


def get_all_links_from(channel):
    for i in range(1,101):
        info = get_links_from(channel,i)
        if info == 'none':
            break

if __name__ == '__main__':
    # get_all_links_from( 'http://bj.58.com/bijiben/')
    all_channels = channel_list.split()
    pool = Pool()
    # pool.map(get_all_links_from,all_channels)
    print('url_list.count is :%s' % url_list.count())  # 88280

    all = set([item['url'] for item in url_list.find()])
    len1 = len(all)
    print('set url_list count is:%s' % len1)
    done = set([item['url'] for item in item_info.find()])  # avoids a separate list.append(data) step
    len2 = len(done)
    print('set item_info count is:%s' % len2)

    set_undone = all - done
    len3 = len(set_undone)
    print('still need to insert count is:%s' % len3)
    pool.map(get_item_info, set_undone)
Example #10
#coding:utf8
import time
from page_parsing import url_list

while True:  # infinite loop
    print(url_list.find().count())
    time.sleep(5)
Example #11
'''
This file just shows a running count for me to watch:
every 5 seconds it checks the url_list collection and prints how many records it holds.
The url_list collection stores the item links.
'''

import time
from page_parsing import url_list

while True:
    print('Crawled [58.com] item links:', end=' ')
    print((url_list.find()).count(), end=' ')
    print('records,' + ' # read from the DB every 5 seconds')
    time.sleep(5)
Example #12
# _*_ encoding:utf-8 _*_
__author__ = 'lizhe'
__time__ = '2018/04/21 10:32'
from page_parsing import url_list
import re
import time
for u in url_list.find(
    {'students.comments': re.compile('http://cn.58.com/yishu')}):
    print(u)
# print(url_list.find().count())
Example #13
# Monitoring script used for counting:
import time
from page_parsing import url_list

while True:
    print(url_list.find().count())  # count() gives the total number of stored records
    time.sleep(5)
Example #14
import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool
from chanel_extract import chanel
from page_parsing import get_info
from page_parsing import get_links_from
from page_parsing import url_list


def get_all_links_from(chanel):
    for page_num in range(1, 201):
        get_links_from(chanel, page_num)


if __name__ == '__main__':
    pool = Pool()
    #pool.map(get_all_links_from,chanel.split())
    pool.map(get_info, [item['url'] for item in url_list.find()])
Example #15
from multiprocessing import Pool
from channel_extract import channel_list
from page_parsing import get_links_from, url_list, get_item_info, item_info


def get_all_links_from(channel):
    for i in range(1, 101):
        info = get_links_from(channel, i)
        if info == 'none':
            break


if __name__ == '__main__':
    # get_all_links_from( 'http://bj.58.com/bijiben/')
    all_channels = channel_list.split()
    pool = Pool()
    # pool.map(get_all_links_from,all_channels)
    print('url_list.count is :%s' % url_list.count())  #88280

    all = set([item['url'] for item in url_list.find()])
    len1 = len(all)
    print('set url_list count is:%s' % len1)
    done = set([item['url']
                for item in item_info.find()])  # avoids a separate list.append(data) step
    len2 = len(done)
    print('set item_info count is:%s' % len2)

    set_undone = all - done
    len3 = len(set_undone)
    print('still need to insert count is:%s' % len3)
    pool.map(get_item_info, set_undone)
Example #16
from page_parsing import url_list, get_item_info

for item in url_list.find():
    get_item_info(item['url'])
Example #17
from page_parsing import url_list


def get_all_link():
    # Lazily yield every stored item URL instead of building a list in memory.
    for link in url_list.find():
        url = link['url']

        yield url
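A short usage sketch for the generator above, assuming the get_item_info parser seen in the other examples:

from page_parsing import get_item_info

# The generator yields URLs lazily, so each one is fetched and parsed
# without first building the full list in memory.
for url in get_all_link():
    get_item_info(url)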
Example #18
import time
from page_parsing import url_list

# while True:
#     print(url_list.find().count())
#     time.sleep(5)

print(url_list.find())  # note: this prints the Cursor object itself, not the documents
Example #19
from multiprocessing import Pool
from channel_extract import channel_list
from page_parsing import get_links_from,get_item_info
from page_parsing import url_list

def get_all_links_from(channel):
    for num in range(1,101):
        get_links_from(channel,num)



if __name__ == '__main__':
    pool = Pool()
    #pool.map(get_all_links_from,channel_list.split())
    #pool.map(get_item_info,a)
    pool.map(get_item_info,[item['url'] for item in url_list.find()])
Example #20
from multiprocessing import Pool
from channel_extarct import channel_list
from page_parsing import url_list
from page_parsing import get_url_link
from page_parsing import get_item_info
import io
import sys
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,
                              encoding='utf8')  # change the default encoding of stdout


def get_all_links(channel):
    for num in range(1, 10):
        try:
            get_url_link(channel, num, who_sells=0)
            get_url_link(channel, num, who_sells=1)
        except:
            pass


if __name__ == '__main__':
    pool = Pool()
    # pool.map(get_all_links, channel_list.split())
    try:
        for item_url in url_list.find():
            print(item_url['url'])
            get_item_info(item_url['url'])
    except:
        pass
Example #21
# _*_ encoding:utf-8 _*_
__author__ = 'lizhe'
__time__ = '2018/04/21 10:27'
from multiprocessing import Pool
from channel_extract import ChannelList
from page_parsing import get_links_from, get_list_info, url_list
import pymongo

client = pymongo.MongoClient("localhost", 27017)
ceshi = client.ceshi
none_url_list = ceshi.none_url_list


def get_all_link_from(channel):
    for num in range(1, 101):
        if get_links_from(channel, num) == "meiyou":
            if (num == 1):
                none_url_list.insert_one({"channel": channel})
            break


if __name__ == "__main__":
    pool = Pool()
    pool.map(get_all_link_from, ChannelList.split())
    urllist = []
    for url in url_list.find():
        urllist.append(url["url"])
    pool.map(get_list_info, urllist)
Example #22
# __author__ = 'xjlin'
# -*- coding: utf-8 -*-
import time
from page_parsing import url_list
from page_parsing import item_info

while True:
    print(url_list.find().count())
    print(item_info.find().count())
    time.sleep(5)
Example #23
from multiprocessing import Pool
from channel_extract import all_link
from page_parsing import get_link_from, get_item_info_from, url_list

if __name__ == '__main__':
    pool = Pool()
    # pool.map(get_link_from,all_link)  # add the links
    pool.map(get_item_info_from, [i['url'] for i in url_list.find()])
    pool.close()
    pool.join()
Example #24
from multiprocessing import Pool
from channel_extract import channel_list
from page_parsing import get_links_form
from page_parsing import get_item_info
from page_parsing import url_list
from page_parsing import item_info


def get_all_links_from(channel):
    for num in range(1, 101):
        get_links_form(channel, num)


def get_all_item(item_url):
    url = item_url['url']
    crb = item_url['crb']
    get_item_info(url, crb)


if __name__ == '__main__':
    # pool = Pool()
    pool = Pool(processes=40)
    pool.map(get_all_item, url_list.find())
# for url in item_info.find():
#     print(url)
Example #25
from page_parsing import url_list, get_item_info

for item in url_list.find():
    get_item_info(item['url'])
Example #26
import time
from page_parsing import url_list
from page_parsing import item_info

while True:
    # count the number of url_list and item_info
    print('The number of url list', url_list.find().count())
    print('The number of items information', item_info.find().count())
    time.sleep(10)
Example #27
import time
from page_parsing import url_list,item_info

while True:
    print('message:',item_info.find().count())
    print('URL:',url_list.find().count())
    time.sleep(5)
Example #28
#!/usr/bin/env python
#-*- coding: utf-8 -*-

from multiprocessing import Pool
from page_parsing import get_item_info_from, url_list, item_info, get_links_from
from channel_extracing import channel_list

db_urls = [item['url'] for item in url_list.find()]
index_urls = [item['url'] for item in item_info.find()]
x = set(db_urls)
y = set(index_urls)
rest_of_urls = x-y

def get_all_links_from(channel):
    for i in range(1, 100):
        get_links_from(channel, i)


if __name__ == '__main__':
    pool = Pool(processes=6)
    pool.map(get_all_links_from, channel_list)
    pool.close()
    pool.join()
Example #29
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: Jan
@software: PyCharm Community Edition
@time: 2016/2/15 21:21
"""

import time
from page_parsing import url_list, item_info

# Query the record counts of the two collections every 5 seconds
while True:
    url_counts = url_list.find().count()
    info_counts = item_info.find().count()
    now_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print(str(url_counts) + '  ' + str(info_counts) + '  ' + str(now_time))
    time.sleep(5)
Example #30
from multiprocessing import Pool
from channel_extarct import channel_list
from page_parsing import url_list
from page_parsing import get_url_link
from page_parsing import get_item_info
import io
import sys
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')     # change the default encoding of stdout

def get_all_links(channel):
    for num in range(1, 10):
        try:
            get_url_link(channel, num, who_sells=0)
            get_url_link(channel, num, who_sells=1)
        except:
            pass



if __name__ == '__main__':
    pool = Pool()
    # pool.map(get_all_links, channel_list.split())
    try:
        for item_url in url_list.find():
            print(item_url['url'])
            get_item_info(item_url['url'])
    except:
        pass

Example #31
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from multiprocessing import Pool
from page_parsing import get_item_info_from, url_list, item_info, get_links_from
from channel_extracing import channel_list

db_urls = [item["url"] for item in url_list.find()]
index_urls = [item["url"] for item in item_info.find()]
x = set(db_urls)
y = set(index_urls)
rest_of_urls = x - y


def get_all_links_from(channel):
    for i in range(1, 100):
        get_links_from(channel, i)


if __name__ == "__main__":
    pool = Pool(processes=6)
    # pool.map(get_all_links_from, channel_list) # crawl all item links (only needs to run once; no resume support here)
    pool.map(get_item_info_from, rest_of_urls)  # crawl the item detail pages
    pool.close()
    pool.join()