def __init__(self):
    self.conn = get_sql_con()
    self.cursor = self.conn.cursor()
    self.sql = '''
        insert into realtor_list_page_json(jsonData, optionDate)
        values(%s, now())
    '''
    self.sql2 = '''
def execute_spider_close(self):
    conn = get_sql_con()
    # Split the data in the realtor_list_json table and drop the empty rows
    self.splite_list_data(conn)
    # Find propertyIds that already exist but whose lastUpdate and address fields
    # have changed; this should be done with a batch update
    self.update_detail_data(conn, 100)
    # Find propertyIds missing from the detail_page_json table and insert them into it
    self.insert_detail_data(conn)
    # Delete rows that exist in the detail table but not in the split table
    self.delete_not_exit(conn, 100)
    # Push the search conditions into redis
    self.get_detail_url(conn)
    conn.close()
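# The helper methods called above are not shown in this snippet. Below is a minimal
# sketch of what update_detail_data could look like with the batch update the comment
# asks for. The realtor_detail_json / realtor_list_page_json_splite column layout
# (propertyId, lastUpdate, address, isDirty) is an assumption pieced together from the
# other snippets, not confirmed by this code.
def update_detail_data(self, conn, batch_size):
    cursor = conn.cursor()
    # rows whose lastUpdate or address changed compared with the freshly split list data
    cursor.execute('''
        SELECT s.propertyId, s.lastUpdate, s.address
        FROM realtor_list_page_json_splite s
        JOIN realtor_detail_json d ON d.propertyId = s.propertyId
        WHERE d.lastUpdate != s.lastUpdate OR d.address != s.address
    ''')
    rows = cursor.fetchall()
    update_sql = '''
        UPDATE realtor_detail_json
        SET lastUpdate = %s, address = %s, isDirty = '1'
        WHERE propertyId = %s
    '''
    # executemany pushes each chunk in one call instead of one round trip per row
    for i in range(0, len(rows), batch_size):
        batch = [(last_update, address, property_id)
                 for property_id, last_update, address in rows[i:i + batch_size]]
        cursor.executemany(update_sql, batch)
        conn.commit()
    cursor.close()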
def truncate_list_json_and_split_table():
    """
    Truncate the realtor_list_page_json table and the realtor_list_page_json_split table.
    :return:
    """
    from crawl_tools.get_sql_con import get_sql_con
    truncate_realtor_list_str = '''
        TRUNCATE tb_realtor_list_page_json;
    '''
    truncate_realtor_list_splite_str = '''
        TRUNCATE tb_realtor_list_page_json_splite
    '''
    conn = get_sql_con()
    cursor = conn.cursor()
    cursor.execute(truncate_realtor_list_str)
    conn.commit()
    cursor.execute(truncate_realtor_list_splite_str)
    conn.commit()
    conn.close()
    print('Truncated the realtor_list_page_json and realtor_list_page_json_splite tables successfully')
def get_detail_url():
    from crawl_tools.get_sql_con import get_sql_con
    import redis
    pool = redis.ConnectionPool(host='127.0.0.1',
                                # password='******'
                                )
    redis_pool = redis.Redis(connection_pool=pool)
    conn = get_sql_con()
    cursor = conn.cursor()
    sql_string = '''
        SELECT property_id FROM tb_realtor_detail_json limit 10,30
    '''
    cursor.execute(sql_string)
    for result in cursor.fetchall():
        redis_pool.lpush('realtor:property_id', 'http://{}'.format(result[0]))
        # redis_pool.lpush('realtor:property_id', result[0])
    cursor.close()
    conn.commit()
    conn.close()
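# get_detail_url() above only seeds the 'realtor:property_id' redis list; this snippet
# does not show who reads it. A minimal sketch of the consuming side, assuming the
# scrapy_redis extension is used — the spider name and parse body are illustrative
# placeholders, not taken from this project:
from scrapy_redis.spiders import RedisSpider

class RealtorDetailSpider(RedisSpider):
    name = 'realtor_detail_from_redis'   # hypothetical spider name
    redis_key = 'realtor:property_id'    # matches the key used by lpush above

    def parse(self, response):
        # each value popped from the redis list is scheduled as a request URL
        yield {'url': response.url, 'body_length': len(response.body)}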
def get_detail_url():
    conn = get_sql_con()
    cursor = conn.cursor()
    sql_string = '''
        SELECT propertyId FROM realtor_detail_json
        WHERE isDirty = '1' OR detailJson IS NULL
    '''
    url_lists = []
    cursor.execute(sql_string)
    for result in cursor.fetchall():
        # print(result)
        url = 'https://mapi-ng.rdc.moveaws.com/api/v1/properties/{}?client_id=rdc_mobile_native%2C9.3.7%2Candroid'.format(result[0])
        url_lists.append(url)
    print(len(url_lists))
    url_lists_string = ','.join(url_lists)
    cursor.close()
    conn.commit()
    conn.close()
    return url_lists_string
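# The commented-out "-a start_urls=..." lines in the run script below suggest the
# comma-joined string returned above is handed to a spider as a crawl argument.
# A minimal sketch of a spider that accepts it and splits it back into a list —
# the spider name and parse body are assumptions, not taken from this project:
import scrapy

class RealtorDetailUrlSpider(scrapy.Spider):
    name = 'realtor_detail_urls'   # hypothetical spider name

    def __init__(self, start_urls='', *args, **kwargs):
        super().__init__(*args, **kwargs)
        # undo the ','.join() performed by get_detail_url()
        self.start_urls = start_urls.split(',') if start_urls else []

    def parse(self, response):
        yield {'property_json': response.text}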
def __init__(self):
    self.conn = get_sql_con()
    self.cursor = self.conn.cursor()
    self.sql = '''
def __init__(self):
    self.conn = get_sql_con()
import datetime
from scrapy.cmdline import execute
from crawl_tools.get_sql_con import get_sql_con

truncate_realtor_list_str = '''
    TRUNCATE realtor_list_page_json;
'''
truncate_realtor_list_splite_str = '''
    TRUNCATE realtor_list_page_json_splite
'''
conn = get_sql_con()
cursor = conn.cursor()
cursor.execute(truncate_realtor_list_str)
conn.commit()
cursor.execute(truncate_realtor_list_splite_str)
conn.commit()
conn.close()
print('Truncated the realtor_list_page_json and realtor_list_page_json_splite tables successfully')

scrapy_start_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
execute(['scrapy', 'crawl',
         # 'realtor',
         # 'realtor_app',
         # 'realtor_property_web',
         'realtor_app_list_page',
         # "-a",
         # "start_urls={}".format(start_urls),