Beispiel #1
0
                    time.sleep(2)
                    detail_page_obj = get(detail_address_url, use_proxy=False)
                    if not detail_page_obj:
                        logging.warning('%s: Cannot get page. url: %s' %
                                        (middleman_type, detail_address_url))
                        detail_address_url = None
                        continue
                    page_res_list, next_page_url = parse_page(
                        city_url, detail_page_obj)
                    if next_page_url:
                        detail_address_url = next_page_url[0]
                    else:
                        detail_address_url = None
                    #print 'next', detail_address_url
                    res = record_res(page_res_list, middleman_type)
                    if not res:
                        logging.error("%s: Cannot record res, url: %s" %
                                      (middleman_type, detail_address_url))


if __name__ == '__main__':
    # Script entry point: configure file logging for one middleman and
    # crawl it. Only WARNING and above are recorded, and the log file is
    # opened with filemode 'w', so each run starts from an empty log.
    middleman_type = 'woaiwojia'
    log_format = ('%(asctime)s %(filename)s[line:%(lineno)d] '
                  '%(levelname)s %(message)s')
    log_path = get_log_path(middleman_type)
    logging.basicConfig(
        filename=log_path,
        filemode='w',
        level=logging.WARNING,
        format=log_format,
        datefmt='%a, %d %b %Y %H:%M:%S')
    crawl(middleman_type)
Beispiel #2
0
# Maps every known middleman name to an integer that starts at 0
# (presumably a running per-middleman count; its use is not visible here).
middleman_count_dict = dict.fromkeys(
    ('woaiwojia', 'maitian', 'sohujiaodian',
     'anjuke', 'tuitui99', 'fangtianxia'),
    0)

# Module-wide logging setup, executed at import time: records at INFO and
# above are appended (filemode 'a') to the file that
# get_log_path('middleman') returns.
logging.basicConfig(
    filename=get_log_path('middleman'),
    filemode='a',
    level=logging.INFO,
    datefmt='%a, %d %b %Y %H:%M:%S',
    format=('%(asctime)s %(filename)s[line:%(lineno)d] '
            '%(levelname)s %(message)s'))


def update_db():
    conn = connect_to_db()
    for basedir, subdirs, filenames in os.walk(get_result_path()):
        for filename in filenames:
            middleman = os.path.splitext(filename)[0].lstrip('u_')
            source = middleman_to_source_dict[middleman]
            delete_all(source)
            tag = '房产中介'
            with open(os.path.join(basedir, filename), 'rb') as f_in:
                for line in f_in:
                    line = line.lstrip('\n\r')
                    if not line: