Esempio n. 1
0
def find_how_many_is_spider():
    """Return how many rows of ZHUBO_LIVE_TABLE match ``is_live = 1``."""
    db = MYSQL()
    live_rows = db.select_from_table(ZHUBO_LIVE_TABLE, "is_live = 1")
    # Count by iterating so this works whether the rows arrive as a list
    # or as a lazy iterable.
    return sum(1 for _ in live_rows)
Esempio n. 2
0
def delete_live_goods_temp(live_id):
    """Delete the rows of LIVE_GOODS_TEMP_TABLE that belong to *live_id*.

    Any exception raised while deleting is logged (the live id first, then
    the exception) and swallowed; nothing is returned.
    """
    db = MYSQL()
    try:
        condition = "live_id={}".format(live_id)
        db.delete_from_table(LIVE_GOODS_TEMP_TABLE, condition)
    except Exception as err:
        logging.error(live_id)
        logging.error(err)
Esempio n. 3
0
def spider_basic(user_id):
    """Scrape one broadcaster's live-room page and store a basic-info row.

    The room page for *user_id* is fetched with ``session_get`` and the
    ``liveId`` is extracted from its text.  The dicts returned by
    ``get_goods_list`` and ``get_info`` are merged, enriched with
    ``zhubo_id``, ``live_id``, ``is_live``, ``crawl_time`` and ``live_url``
    and inserted into LIVE_BASIC_TABLE.

    If the page contains no ``liveId`` the problem is logged and nothing is
    inserted (previously this crashed with an AttributeError on ``None``).

    :param user_id: broadcaster account id used to build the request URLs.
    :return: None
    """
    MYSQL_COON = MYSQL()

    url_info = "https://taobaolive.taobao.com/api/broadcaster_info/1.0?accountId={}".format(
        user_id)
    url_goods_list = "https://taobaolive.taobao.com/api/item_list/1.0?type=0&liveId="
    url_live = "https://taobaolive.taobao.com/room/index.htm?userId={}".format(
        user_id)

    SESSION.headers.update({
        "User-Agent": random.choice(USER_AGENTS),
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive',
        'Accept-Encoding': 'gzip, deflate',
        # The referer is the room page itself.
        "Referer": url_live,
    })

    response_text = session_get(url_live).text

    # re.search returns None when the pattern is absent; guard before
    # calling .group() so one bad page does not raise.
    live_id_match = re.search(r'liveId":(\d+)', response_text)
    if live_id_match is None:
        logging.error("No liveId found on {}".format(url_live))
        return
    live_id = live_id_match.group(1)

    goods_json_dict = get_goods_list(url_goods_list, live_id)
    info_json_dict = get_info(url_info)

    # Keys from info_json_dict win over goods_json_dict on collision.
    live_basic_info = dict(goods_json_dict, **info_json_dict)
    live_basic_info["zhubo_id"] = user_id
    live_basic_info["live_id"] = live_id
    # A liveId of "0" is stored as "not live"; anything else as live.
    live_basic_info["is_live"] = 0 if live_id == "0" else 1
    live_basic_info["crawl_time"] = datetime.now()
    live_basic_info["live_url"] = url_live

    logging.info("Spider one item into {}".format(LIVE_BASIC_TABLE))
    MYSQL_COON.insert_into_table(live_basic_info, LIVE_BASIC_TABLE)
Esempio n. 4
0
def get_zhubo_id_list():
    """Return the ``zhubo_id`` of every row selected from ZHUBO_INFO_TABLE.

    The table is queried with an empty condition list.
    """
    db = MYSQL()
    info_rows = db.select_from_table(ZHUBO_INFO_TABLE, [])
    return [info_row["zhubo_id"] for info_row in info_rows]
Esempio n. 5
0
def insert_to_db(results, which_table):
    """Insert every item of *results* into *which_table*, one at a time.

    A failing insert is logged (the item, then the exception) and does not
    stop the remaining items from being processed.

    :param results: iterable of items handed to ``insert_into_table``.
    :param which_table: destination table.
    """
    db = MYSQL()

    def _store(item):
        # One item per try block so a bad item cannot abort the batch.
        try:
            db.insert_into_table(item, which_table)
            logging.info("spider one item into {}".format(which_table))
        except Exception as err:
            logging.error(str(item))
            logging.error(err)

    for item in results:
        _store(item)
Esempio n. 6
0
def get_goods_id_list():
    """Return the ``goods_id`` of every row selected from LIVE_GOODS_TABLE.

    The table is queried with an empty condition list.
    """
    db = MYSQL()
    goods_rows = db.select_from_table(LIVE_GOODS_TABLE, [])
    return [goods_row["goods_id"] for goods_row in goods_rows]
Esempio n. 7
0
def get_goods_id_list_from_temp(live_id):
    """Return the ``goods_id`` values stored in LIVE_GOODS_TEMP_TABLE for *live_id*.

    :param live_id: value substituted into the ``live_id=...`` condition.
    """
    db = MYSQL()
    condition = "live_id={}".format(live_id)
    temp_rows = db.select_from_table(LIVE_GOODS_TEMP_TABLE, condition)
    return [temp_row["goods_id"] for temp_row in temp_rows]
Esempio n. 8
0
def get_rows():
    """Yield the ``goods_id`` of each row in live_taobao_webstar_crawl_live_goods.

    This is a generator.  The MYSQL handle is closed when iteration ends
    for any reason: exhaustion, an exception, or the consumer abandoning
    the generator early.  Previously ``close_db`` ran only after full
    exhaustion, so the handle leaked in the other two cases.
    """
    MYSQL_COON = MYSQL()
    try:
        # Called with the table name only, as before (the original unpacked
        # an empty list as extra arguments).
        rows = MYSQL_COON.select_from_table(
            "live_taobao_webstar_crawl_live_goods")
        for row in rows:
            yield row["goods_id"]
    finally:
        MYSQL_COON.close_db()
Esempio n. 9
0
    def __init__(self, from_table, from_table_condition, need_to_update,
                 which_module, into_table, need_to_return, which_need_in_row,
                 update_into_table):
        """Store the task configuration and create a ``MYSQL()`` instance.

        Every argument is kept unchanged on the instance under its own name;
        the MYSQL instance is stored as ``self.MYSQL_CONN``.
        """
        super(TASK_OBJECT, self).__init__()

        # Where the input rows come from.
        self.from_table = from_table
        self.from_table_condition = from_table_condition
        self.which_need_in_row = which_need_in_row
        self.need_to_update = need_to_update

        # What to run on them.
        self.which_module = which_module
        self.need_to_return = need_to_return

        # Where the results go.
        self.into_table = into_table
        self.update_into_table = update_into_table

        self.MYSQL_CONN = MYSQL()
Esempio n. 10
0
def update_zhubo_from_db():
    """Yield the ``zhubo_id`` of stored non-live broadcasters that are now live.

    Rows are selected from ZHUBO_LIVE_TABLE with ``is_live != 1``.  For each
    one ``get_live_id`` is called; a result other than ``"0"`` counts as
    live and the id is yielded.  Both outcomes are logged.
    """
    db = MYSQL()
    offline_rows = db.select_from_table(ZHUBO_LIVE_TABLE, "is_live != 1")

    for offline_row in offline_rows:
        zhubo_id = offline_row["zhubo_id"]
        if get_live_id(zhubo_id) == str(0):
            logging.info("{} is not living!".format(zhubo_id))
            continue
        logging.info("{} is living!.........".format(zhubo_id))
        yield zhubo_id
Esempio n. 11
0
def from_live_goods_to_temp(live_id):
    """Copy the goods of *live_id* from LIVE_GOODS_TABLE to LIVE_GOODS_TEMP_TABLE.

    Only the ``goods_id`` and ``live_id`` of each selected row are carried
    over; the resulting list of dicts is handed to ``insert_to_db``.

    :param live_id: value substituted into the ``live_id=...`` condition.
    """
    db = MYSQL()
    goods_rows = db.select_from_table(LIVE_GOODS_TABLE,
                                      "live_id={}".format(live_id))
    temp_items = [
        {"goods_id": goods_row["goods_id"], "live_id": goods_row["live_id"]}
        for goods_row in goods_rows
    ]
    insert_to_db(temp_items, LIVE_GOODS_TEMP_TABLE)
Esempio n. 12
0
class TASK_OBJECT(object):
    '''
    Generic task: read rows from a table, run a worker over one value of
    each row in a thread pool, and optionally store the worker results.

    This class has 8 parameters.
    from_table: the table the task reads its input rows from
    from_table_condition: condition passed to select_from_table for that read
    need_to_update: when truthy, each row is first passed through
        get_update_state and rows that come back falsy are skipped
    which_module: the callable run on every selected value
    into_table: the table the worker results are inserted into
    need_to_return: when truthy, worker results are collected and inserted
    which_need_in_row: key of the row value that is handed to which_module
    update_into_table: when truthy, results are written with
        insert_into_table_with_replace instead of insert_into_table
    '''
    def __init__(self, from_table, from_table_condition, need_to_update,
                 which_module, into_table, need_to_return, which_need_in_row,
                 update_into_table):
        super(TASK_OBJECT, self).__init__()

        self.MYSQL_CONN = MYSQL()
        self.from_table = from_table
        self.from_table_condition = from_table_condition
        self.need_to_update = need_to_update
        self.which_module = which_module
        self.into_table = into_table
        self.need_to_return = need_to_return
        self.which_need_in_row = which_need_in_row
        self.update_into_table = update_into_table

    def get_rows(self):
        '''Yield the input rows, refreshed through get_update_state if requested.

        Falsy rows (before or after the refresh) are dropped.
        '''
        rows = self.MYSQL_CONN.select_from_table(self.from_table,
                                                 self.from_table_condition)

        for row in rows:
            if self.need_to_update:
                row = get_update_state(row)
            if row:
                yield row

    def multiprocess_task(self, new_list):
        '''Run which_module over new_list in a pool of THREAD_NUM threads.

        Returns the list of results when need_to_return is truthy, otherwise
        None. The pool is always closed and joined, even when a worker
        raises, so its threads are not leaked.
        '''
        pool = ThreadPool(THREAD_NUM)
        try:
            results = pool.map(self.which_module, new_list)
        finally:
            pool.close()
            pool.join()
        if self.need_to_return:
            return results
        return None

    def insert_to_db(self, results):
        '''Insert every result into into_table, logging failures per item.

        A failing insert is logged (the item, then the exception) and does
        not stop the remaining items.
        '''
        if self.update_into_table:
            insert = self.MYSQL_CONN.insert_into_table_with_replace
        else:
            insert = self.MYSQL_CONN.insert_into_table

        for each_result in results:
            try:
                insert(each_result, self.into_table)
            except Exception as e:
                logging.error(str(each_result))
                logging.error(e)

    def _run_batch(self, batch):
        '''Process one batch and store its results when they are collected.'''
        results = self.multiprocess_task(batch)
        if self.need_to_return:
            self.insert_to_db(results)

    def task_main(self):
        '''Process all input rows in batches of THREAD_NUM values.'''
        batch = []

        for row in self.get_rows():
            batch.append(row[self.which_need_in_row])
            if len(batch) >= THREAD_NUM:
                self._run_batch(batch)
                batch = []
        # Flush the final, partially filled batch.
        if batch:
            self._run_batch(batch)
Esempio n. 13
0
#coding:utf-8
import logging
import sys
import json
import re
sys.path.append("..")
from MySql_InterFace.mysql_interface import MYSQL
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
from spider_zhubo import spider_zhubo

MySQL_COON = MYSQL()


def main():
    b = []
    a = MySQL_COON.select_from_table("live_taobao_webstar_crawl_live_basic",
                                     *b)
    n = 1
    m = 1
    list_url = []
    for row in a:
        #print row["daren_url"]
        user_info_json = row["user_info_json"]
        user_info_json = json.loads(user_info_json)
        zhubo_url = user_info_json["result"]["model"]["broadCaster"]["jumpUrl"]
        #zhubo_url = re.sub(r"//","",zhubo_url)
        #print zhubo_url
        n = n + 1
        list_url.append(zhubo_url)
        if n % 10 == 0:
Esempio n. 14
0
def end_liveing(zhubo_id):
    """Record in ZHUBO_LIVE_TABLE that *zhubo_id* is no longer live.

    Logs the state change, then writes ``is_live`` = ``"0"`` through
    ``insert_into_table_exist_update`` with the ``is_live=0`` clause
    (presumably an insert-or-update; confirm against the MYSQL wrapper).
    """
    db = MYSQL()
    logging.info("{} zhubo is not living!".format(zhubo_id))
    db.insert_into_table_exist_update(
        {"zhubo_id": zhubo_id, "is_live": "0"},
        ZHUBO_LIVE_TABLE,
        "is_live=0")
Esempio n. 15
0
def start_liveing(zhubo_id):
    """Record in ZHUBO_LIVE_TABLE that *zhubo_id* is currently live.

    Writes ``is_live`` = ``"1"`` through ``insert_into_table_exist_update``
    with the ``is_live=1`` clause (presumably an insert-or-update; confirm
    against the MYSQL wrapper).
    """
    live_state = {"zhubo_id": zhubo_id, "is_live": "1"}
    db = MYSQL()
    db.insert_into_table_exist_update(
        live_state,
        ZHUBO_LIVE_TABLE,
        "is_live=1")