Exemple #1
0
def _load_listing_urls(path):
    """Read the repr()-style list saved by the previous stage as unique URLs.

    The file is expected to look like ``['url1', 'url2']``; brackets and
    single quotes are removed and the remainder is split on commas.
    Surrounding whitespace is stripped and empty entries are dropped, so the
    ", " separator produced by ``repr`` does not leave a leading space on
    every URL and an empty list does not yield a single ``''`` entry.
    """
    with open(path, 'r') as fd:
        all_text = fd.read()
    for junk in ('[', ']', '\''):
        all_text = all_text.replace(junk, '')
    stripped = (part.strip() for part in all_text.split(","))
    return list(set(part for part in stripped if part))


def _fetch_and_store(url):
    """Fetch one detail page and persist it as a JSON file and a DB record.

    Any exception raised by the page parser, the city lookup or the writers
    propagates to the caller.
    """
    # Detail page fields, in the order returned by haoitem.get_items().
    (city, district, title, rental_type, phone_num, contacts, url_now, rent,
     lease, area, heading, community, address, detail, facility, advantage,
     pic) = haoitem.get_items(url)
    # City name as processed by haoitem, used to look up region/province.
    c_name = haoitem.get_cname()
    region, province = get_city_info.get_areas(c_name)
    # Record that is serialized to JSON and handed to the database writer.
    record = {
        "region": region,
        "province": province,
        "city": city,
        "district": district,
        "title": title,
        "rental_type": rental_type,
        "url_now": url_now,
        "rent": rent,
        "lease": lease,
        "area": area.replace(' ', ''),
        "heading": heading,
        "community": community,
        'address': address,
        "contacts": contacts,
        "phone": phone_num,
        "detail": detail,
        "facility": facility,
        "advantage": advantage,
        "pics": pic
    }

    json_text = json.dumps(record, ensure_ascii=False, indent=1)
    IOutils.rtfile_time_with_path(json_text, 'json')
    write_db.data_in(record)


def step2():
    '''
    Crawl the detail pages of every rental listing page and store the data.

    Reads the listing-page URLs from ``chuzu_list.txt``, asks
    ``get_itempage_url.get_url`` for the detail-page URLs of each listing
    page, and fetches/saves every detail page that is not already in the
    list returned by ``list_check.done_lis()``. The collected detail-page
    URL lists are written to ``zf_item_list.txt`` and the failed URLs to
    ``error_list.txt``.

    :return: list of detail-page URLs whose processing raised an exception
    '''

    # Listing pages saved by the previous stage (de-duplicated).
    chuzu_list = _load_listing_urls('chuzu_list.txt')
    # URLs that already have stored data.
    done_list = list_check.done_lis()
    print(done_list)
    # URLs that failed during this run.
    error_list = []

    # One entry (the list of detail URLs) per listing page.
    item_pages = []
    random.shuffle(chuzu_list)
    for i in chuzu_list:
        # Show the listing page currently being processed.
        print(i)
        it_urls = get_itempage_url.get_url(i)
        time.sleep(numpy.random.randint(3, 6))
        if it_urls is None:
            # Nothing to iterate; previously this raised TypeError.
            continue
        # Record once per listing page. list.append() returns None, so its
        # result must not be used as the value that gets saved.
        item_pages.append(it_urls)

        for url in it_urls:
            if url in done_list:
                print(
                    '############################页面已经搞过了,下一个############################'
                )
                time.sleep(numpy.random.randint(3, 5))
                continue

            if 'e.58.com' in url or 'jxjump' in url:
                print('无效地址。。下一个')
                continue

            if 'short.58.com' in url:
                url = url.replace('&end=end', '')
            print(
                '############################当前地址不在已完成列表中############################'
            )
            try:
                _fetch_and_store(url)
            except Exception as exc:
                # Exception (not a bare except) so Ctrl-C / SystemExit still
                # stop the crawl; the real error is shown for diagnosis.
                print(
                    '########################看来有的页面有问题,触发反爬了,休息片刻########################'
                )
                print('#' * 20 + url + '\t' + '#' * 20)
                print(repr(exc))
                error_list.append(url)
                time.sleep(15)
            else:
                time.sleep(numpy.random.randint(3, 6))

    print(item_pages)

    # Save the collected detail-page URL lists.
    with open('zf_item_list.txt', 'w') as file_zf:
        file_zf.write(repr(item_pages))

    # Save the URLs that failed.
    with open('error_list.txt', 'w') as file_err:
        file_err.write(repr(error_list))

    return error_list