Esempio n. 1
0
from multiprocessing import Pool
from channel import channel_list
from page_parsing import get_links_from


def get_all_links_from(channel):
    """Call ``get_links_from`` on one channel for the numbers 1 through 100.

    Args:
        channel: Value forwarded unchanged as the first argument of
            ``get_links_from`` (presumably a channel URL taken from
            ``channel_list`` -- confirm against the caller).

    The second argument passed to ``get_links_from`` is presumably a page
    number; verify against ``page_parsing``. Nothing is returned.
    """
    first, last = 1, 100
    for page in range(first, last + 1):
        get_links_from(channel, page)


if __name__ == '__main__':
    # Run get_all_links_from once per whitespace-separated entry of
    # channel_list, spread across a pool of worker processes.
    pool = Pool()
    try:
        pool.map(get_all_links_from, channel_list.split())
    finally:
        # Always shut the pool down, even when a task raised: leaving a
        # Pool unclosed can make the interpreter hang on finalization.
        pool.close()
        pool.join()
        'http://bj.58.com/ershoushebei/31020167338937x.shtml?',
        'Accept-Encoding':
        'gzip, deflate, sdch',
        'Accept-Language':
        'zh-CN,zh;q=0.8',
        'Cookie':
        'id58=Cn5lxFhktTCpNxDlYCMMAg==; als=0; bj58_id58s="ekcwdi1XdzdMV1M1ODE2NA=="; gr_user_id=ae910cdd-5030-40dc-aacb-1f30d01c4408; Hm_lvt_3bb04d7a4ca3846dcc66a99c3e861511=1484140260; Hm_lvt_e15962162366a86a6229038443847be7=1484120314,1484140297; __autma=253535702.971785459.1484140260.1484140260.1490019419.2; __utma=253535702.855450639.1484120247.1484120247.1490019419.2; __utmz=253535702.1484120247.1.1.utmcsr=changzhi.58.com|utmccn=(referral)|utmcmd=referral|utmcct=/; wmda_uuid=75afb6b4c62fc5b48cad4372e5a0e8b8; wmda_new_uuid=1; wmda_visited_projects=%3B1409632296065; commontopbar_myfeet_tooltip=end; myfeet_tooltip=end; es_ab=0; 58home=bj; nab=WBHUANGYE_122_25049812; ipcity=changzhi%7C%u957F%u6CBB%7C0; city=bj; sessionid=a41844e4-4fd3-4637-9855-d388b355d828; GA_GTID=0d4000f6-0000-17be-e195-ffd5d992b25b; _ga=GA1.2.855450639.1484120247; _gid=GA1.2.1113197810.1502417295; bj58_new_uv=15; final_history=31033011422654%2C31020167338937%2C27881001184687%2C30819729748678%2C30896584095174; commontopbar_city=1%7C%u5317%u4EAC%7Cbj; abtest=WBHUANGYE_122_25049812; 58tj_uuid=088478e3-567f-4e33-89e3-680f2deb0404; new_session=1; new_uv=18; utm_source=; spm=; init_refer=; Hm_lvt_e2d6b2d0ec536275bb1e37b421085803=1502418928,1502445555; Hm_lpvt_e2d6b2d0ec536275bb1e37b421085803=1502456060; gr_session_id_98e5a48d736e5e14=238c5d6e-cff7-4910-9184-aebbebb937a2'
    }
    ID = url.split('/')[-1][:-7]  # 取ID号
    url_js = 'http://jst1.58.com/counter?infoid={}&userid=&uname=&sid=555789872'.format(
        str(ID))
    context = requests.get(url_js, headers=kv3).text
    views = context.split('=')[-1]
    return views


def get_all_link_from(channel):
    """Process every link that ``get_links_from`` yields for one channel.

    For each number from 1 to 99 and, within it, each ``who`` value 0 then
    1, the result of ``get_links_from(channel, who, number)`` is iterated
    and every element is handed to ``get_item_info`` together with the
    module-level ``kv``.

    Args:
        channel: Value forwarded unchanged to ``get_links_from``.

    NOTE(review): ``who`` presumably distinguishes two seller categories and
    ``kv`` presumably holds request headers -- confirm against their
    definitions, which are outside this block.
    """
    for page in range(1, 100):
        for who in (0, 1):
            for url in get_links_from(channel, who, page):
                get_item_info(who, url, kv)


if __name__ == "__main__":
    # One channel per line of channel_list. Drop every blank line: the
    # previous `channels.remove('')` removed only the first empty entry and
    # raised ValueError when there was none, so a string with both a
    # leading and a trailing newline still produced an empty channel.
    channels = [
        line.strip() for line in channel_list.splitlines() if line.strip()
    ]

    pool = Pool()  # multi-process fan-out, one task per channel
    try:
        pool.map(get_all_link_from, channels)
    finally:
        # Release the worker processes whether or not a task raised.
        pool.close()
        pool.join()
Esempio n. 3
0
def get_all_links(channel):
    """Call ``get_links_from_channel`` on one channel for the numbers 1 to 4.

    ``channel`` is forwarded unchanged; the second argument is presumably a
    page number (confirm against ``get_links_from_channel``).
    """
    for page_no in range(1, 5):
        get_links_from_channel(channel, page_no)

def get_info(item):
    """Process one record unless it is already marked as visited.

    Args:
        item: Mapping with ``'url'`` and ``'_id'`` keys and an optional
            ``'visited'`` key.

    Only a record whose ``'visited'`` value equals 0 is processed; a record
    with any other value, or with no ``'visited'`` key at all, is skipped.
    Processing means calling ``get_info_from_url`` on the record's URL and
    then setting ``visited`` to 1 on the matching ``url_list`` entry.
    """
    if item.get('visited') != 0:
        return

    get_info_from_url(item['url'])
    # Flag the record so that a later run skips it.
    url_list.update_one({'_id':item['_id']},{'$set':{'visited':1}},False,False)



if __name__ == '__main__':
    import logging

    pool = Pool()
    submitted = []  # (url, AsyncResult) pairs, one per queued get_info task
    try:
        # First pass: run get_all_links for every whitespace-separated
        # entry of channel_list and wait for all of them to finish.
        pool.map(get_all_links, channel_list.split())

        # Second pass: queue one get_info task per record returned by
        # url_list.find(). get_info itself skips already-visited records.
        for item in url_list.find():
            submitted.append(
                (item.get('url'), pool.apply_async(get_info, (item,)))
            )
    finally:
        # Stop accepting work and wait for the queued tasks, even if
        # submission was interrupted by an exception.
        pool.close()
        pool.join()

    # apply_async keeps a worker's exception inside its AsyncResult, so
    # discarding the handles hid every failure. Report each one here
    # without aborting the remaining checks.
    for url, outcome in submitted:
        try:
            outcome.get()
        except Exception:
            logging.exception('get_info failed for %s', url)


# for i in range(10):
#     url_list.insert_one({'url':'http://baidu.com'.format(str(1)),'visited':0})