Example #1
0
def check_data():
    '''
    数据检查
    :return:
    '''
    data_df = pd.DataFrame()
    info_df = pd.DataFrame()
    table_info = conf.qiye_info_table
    table_data = conf.qiye_data_table
    need_url = []
    sql_info = '''SELECT DISTINCT url FROM {} '''.format(table_info)
    sql_data = '''SELECT DISTINCT url FROM {}  where url is not null '''.format(
        table_data)
    res_info = dbhandler.get_date(sql_info, table_info)
    res_data = dbhandler.get_date(sql_data, table_data)
    if res_info:
        info_df = pd.DataFrame(list(res_info), columns=['url'])
        info_df['url'] = info_df['url'].apply(str)
    if res_data:
        data_df = pd.DataFrame(list(res_data), columns=['url'])
        data_df['url'] = data_df['url'].apply(str)
    print(len(list(data_df['url'])))
    for url in list(data_df['url']):
        if url not in list(info_df['url']):
            need_url.append(url)
    print(need_url)
    print(len(need_url))
    sql = '''update {} set status=0 where url in {}'''.format(
        table_data, tuple(need_url))
    up_bo = dbhandler.exec_sql(sql, table_data)
    print(up_bo)
Example #2
0
def need_parsing_url():
    table_name = conf.qiye_data_table
    sql = 'select url from {} where status=0 limit 50 '.format(table_name)
    res = dbhandler.get_date(sql, table_name)
    if res:
        data_df = pd.DataFrame(list(res), columns=['url'])
        data_df['url'] = data_df['url'].apply(str)
    return data_df
Example #3
0
def get_data():
    '''

    @return:
    '''
    sql = '''SELECT url,book_name,book_author,book_isdn,score,com_num from douban_book;'''
    res = dbhandler.get_date(sql, conf.douban_book_table)
    pd.DataFrame(list(res),
                 columns=[
                     'url', 'book_name', 'book_author', 'book_isdn', 'score',
                     'com_num'
                 ]).to_excel('./豆瓣书籍数据.xlsx', index=False)
Example #4
0
def get_book_already():
    '''
    获取已经爬取过的数据url
    @return:
    '''
    df = pd.DataFrame()
    table_name = conf.douban_book_table
    sql = '''SELECT DISTINCT url FROM {}'''.format(table_name)
    res = dbhandler.get_date(sql, table_name)
    if res:
        df = pd.DataFrame(list(res), columns=['url'])
    return df
Example #5
0
def all_shop_phone(pw):
    '''
    获取全部店铺的url
    @return:
    '''
    df = pd.DataFrame()
    table_name = conf.dzdp_shop_table
    sql = '''SELECT DISTINCT shop_name,url,city from {} where shop_type='{}' '''.format(table_name, pw)
    res = dbhandler.get_date(sql, table_name)
    if res:
        df = pd.DataFrame(list(res), columns=['shop_name', 'url', 'city'])
    return df
Example #6
0
def get_all_shop():
    '''
    获取商铺列表中的所有商铺
    @return:
    '''
    dataDf = pd.DataFrame()
    tablaName = conf.dzdp_shop_table
    sql = '''select DISTINCT shop_name,shop_id,url from {} '''.format(tablaName)
    res = dbhandler.get_date(sql, tablaName)
    if res:
        dataDf = pd.DataFrame(list(res), columns=['shop_name', 'shop_id', 'url'])
        dataDf['shop_id'] = dataDf['shop_id'].astype(str)
    return dataDf
Example #7
0
def get_cities():
    '''
    获取数据库中所有的城市
    @return:
    '''
    city_list = []
    table_name = conf.area_division_table
    sql = '''select distinct city_name from {}'''.format(table_name)
    res = dbhandler.get_date(sql, conf.area_division_table)
    if res:
        df = pd.DataFrame(list(res), columns=['city_name'])
        city_list = df['city_name'].tolist()
    return city_list
Example #8
0
def al_prov_city(s_type, pw):
    '''
    获取数据库中已经计算过的城市
    @return:
    '''
    df = pd.DataFrame()
    table_name = conf.gaodemap_baidu_data_table
    sql = '''SELECT DISTINCT coun_code,coun_name FROM {} where s_type={} and shop_type='{}' '''.format(
        table_name, s_type, pw)
    res = dbhandler.get_date(sql, table_name)
    if res:
        df = pd.DataFrame(list(res), columns=['coun_code', 'coun_name'])
    return df
Example #9
0
def get_pro_city(pro_name):
    '''
    获取省份的城市信息
    @param pro_name: 省份名称
    @return:
    '''
    dataDf = pd.DataFrame()
    table_name = conf.area_division_table
    sql = '''SELECT DISTINCT prov_name,city_name from {} WHERE prov_name='{}' '''.format(table_name, pro_name)
    res = dbhandler.get_date(sql, table_name)
    if res:
        dataDf = pd.DataFrame(list(res), columns=['pro_name', 'city_name'])
        dataDf['city_name'] = dataDf['city_name'].apply(lambda x: x.replace('市', ''))
    return dataDf
Example #10
0
def already_cityArea(city_name):
    '''
    获取该城市已经采集过数据的行政区
    @param city_name: 城市名称
    @return:
    '''
    city_area_df = pd.DataFrame()
    tableName = conf.dzdp_shop_table
    sql = '''select distinct city,region from {} where city ='{}' '''.format(tableName, city_name)
    res = dbhandler.get_date(sql, tableName)
    if res:
        city_area_df = pd.DataFrame(list(res), columns=['city_name', 'area_name'])

    return city_area_df
Example #11
0
def already_shop_phone(pw):
    '''
    获取以及计算获取过手机号的店铺
    @return:
    '''
    url_list = []
    table_name = conf.dzdp_shop_phone_table
    sql = '''
        SELECT DISTINCT(url) from {} where shop_type='{}' '''.format(table_name, pw)
    res = dbhandler.get_date(sql, table_name)
    if res:
        url_df = pd.DataFrame(list(res), columns=['url'])
        url_list = url_df['url'].tolist()
    return url_list
Example #12
0
def get_needshop_info2():
    '''
    获取已经采集过的商铺手机号信息
    @return:
    '''
    dataDf = pd.DataFrame()
    tableName = conf.dzdp_shop_phone_table
    sql = '''select DISTINCT a.shop_name,a.city,a.region,a.shop_id,a.url from {} a where a.shop_id not in (SELECT DISTINCT shop_id from {})'''.format(
        conf.dzdp_shop_table, tableName)
    print(sql)
    res = dbhandler.get_date(sql, tableName)
    if res:
        dataDf = pd.DataFrame(list(res), columns=['shop_name', 'city', 'region', 'shop_id', 'url'])
        dataDf['shop_id'] = dataDf['shop_id'].astype(str)
    return dataDf
Example #13
0
def get_need_city(pro_list):
    '''
    获取需要的城市
    @return:
    '''
    df = pd.DataFrame()
    table_name = conf.area_division_table
    if 1 == len(pro_list):
        sql = '''SELECT DISTINCT prov_code,prov_name,city_code,city_name,coun_code,coun_name FROM {} 
        where prov_name in {} order by prov_name'''.format(table_name, pro_list[0])
    else:
        sql = '''SELECT DISTINCT prov_code,prov_name,city_code,city_name,coun_code,coun_name FROM {} 
        where prov_name in {} order by prov_name'''.format(table_name, tuple(pro_list))
    res = dbhandler.get_date(sql, table_name)
    if res:
        df = pd.DataFrame(list(res), columns=['prov_code', 'prov_name', 'city_code',
                                              'city_name', 'coun_code', 'coun_name'])
    return df