def __init__(self, page_queue, models_queue, table_name, *args, **kwargs):
    """Wire up the consumer thread: queues, target table, DB engine, HTTP client."""
    super().__init__(*args, **kwargs)
    self.table_name = table_name
    # Input queue of listing pages; output queue of model-detail hrefs.
    self.pages_queue = page_queue
    self.models_queue = models_queue
    self.req = Request(logger)
    self.engine = get_sql_conn()
# ---- Esempio n. 2 (scraped example separator; vote count: 0) ----
class NoticeListHandle(object):
    """Scrape notice (batch) listing tables from cn357.com and save them to CSV."""

    def __init__(self, save_path):
        self.base_url = "https://www.cn357.com"
        self.req = Request(logger)
        self.save_path = save_path

    def single_notice(self, notice):
        """Fetch every page of one notice batch and write the rows to one CSV.

        Page 1 creates the file (with header); pages 2..max_page append
        header-less so the CSV stays well-formed.
        """
        save_filename = os.path.join(self.save_path, "公众型号各批次数据.csv")
        df, max_page = self.page_one(notice)
        if df is None:
            # Request failed; nothing to save for this batch.
            return
        logger.debug(
            f"url:{self.req.url},max page:{max_page}, page item:{len(df)}")
        df.to_csv(save_filename, encoding='utf-8-sig', index=False)
        for i in range(2, int(max_page) + 1):
            df_tmp, _ = self.page_one(notice + f"_{i}")
            if df_tmp is None:
                continue
            df_tmp.to_csv(save_filename,
                          mode='a',
                          encoding='utf-8-sig',
                          index=False,
                          header=False)

    def page_one(self, notice):
        """Parse one listing page.

        Returns (DataFrame with an added ``href`` column, max page number).
        Returns (None, 0) when the request fails.
        """
        self.req.url = self.base_url + notice
        res = self.req.get()
        if not res or res.status_code != 200:
            # Same failure handling as start(); previously an unchecked
            # response crashed deeper in the parsing below.
            logger.error(f"{self.req.url}:{res}")
            return None, 0
        html = etree.HTML(res.text)
        row_href = html.xpath(
            "//table[@class='listTable uiLinkList']/tr/td/a/@href")
        # Single-page listings have fewer than 2 pagination links; guard the
        # [-2] index (same convention as Producer.page_one).
        page_list = html.xpath("//span[@class='pageList']/a/text()")
        max_page = 0 if len(page_list) < 2 else page_list[-2]

        df = pd.read_html(res.text)[0]
        df['href'] = row_href

        return df, max_page

    def start(self):
        """Walk the public-model notice index and process each batch link."""
        self.req.url = self.base_url + "/notice_list/"
        res = self.req.get()
        if not res or res.status_code != 200:
            logger.error(f"{self.req.url}:{res}")
            return
        # Parse the index HTML and collect batch hrefs.
        html = etree.HTML(res.text)
        result = html.xpath(
            "//div[@class='lotList uiLinkList clear']//a/@href")

        for notice in result:
            self.single_notice(notice)
def req_notice_list(queue: Queue):
    """Fetch the public-model notice index page and enqueue each batch href."""
    req = Request(logger)
    req.url = BASE_URL + "/notice_list/"
    res = req.get()
    if not res or res.status_code != 200:
        logger.error(f"{req.url}:{res}")
        return
    # Parse the HTML and push every notice link onto the queue.
    doc = etree.HTML(res.text)
    hrefs = doc.xpath("//div[@class='lotList uiLinkList clear']//a/@href")
    for href in hrefs:
        queue.put(href)
class Producer(threading.Thread):
    """Producer thread: for each public-model notice batch, record its pages.

    Pulls notice hrefs from ``notices_queue`` and pushes one href per page
    onto ``pages_queue`` — page 1 is the bare notice href, page N is
    ``<notice>_<N>``.
    """

    def __init__(self, notice_queue, page_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.notices_queue = notice_queue
        self.pages_queue = page_queue
        self.req = Request(logger)

    def run(self) -> None:
        # empty()/get() across threads is racy; the timeout keeps a losing
        # thread from blocking forever on a drained queue.
        while not self.notices_queue.empty():
            notice = self.notices_queue.get(timeout=10)
            try:
                # Light random delay so we don't hammer the site.
                time.sleep(random.uniform(1, 3))
                self.pages(notice)
            except Exception as e:
                logger.exception("{}:{}".format(notice, str(e)))

        logger.debug('Producer finished!!!')

    def pages(self, notice):
        """Enqueue every page href of one notice batch."""
        self.pages_queue.put(notice)  # page 1 is the bare href
        max_page = self.page_one(notice)
        logger.debug(f"url:{self.req.url},max page:{max_page}")

        for i in range(2, int(max_page) + 1):
            self.pages_queue.put(notice + f"_{i}")

    def page_one(self, page):
        """Return the max page number shown on one listing page.

        Returns 0 for single-page listings (fewer than 2 pagination links);
        otherwise the second-to-last link text, which on this site is the
        last page number. (Previously mis-annotated as ``-> pd.DataFrame``.)
        """
        self.req.url = BASE_URL + page
        res = self.req.get()
        html = etree.HTML(res.text)
        page_list = html.xpath("//span[@class='pageList']/a/text()")
        max_page = 0 if len(page_list) < 2 else page_list[-2]

        return max_page
# ---- Esempio n. 5 (scraped example separator; vote count: 0) ----
    def run(self):
        """Read an HTTP request from this client connection, process it via
        ``httpserver.proses`` and send the response back."""
        rcv = ""               # accumulated header text
        rbd = ""               # accumulated body text (after the header)
        time_to_read = False   # True once the header terminator is seen
        left_to_read = 1       # body bytes still expected (1 until header parsed)
        req = Request()
        while True:
            try:
                data = self.connection.recv(32)
                if data:
                    d = data.decode()

                    # Before the header terminator, bytes belong to the
                    # header; afterwards they belong to the body.
                    if time_to_read:
                        # NOTE(review): print() looks like a debug leftover.
                        print(rbd)
                        rbd += d
                    else:
                        rcv += d

                    # End of the HTTP header: parse it and work out how much
                    # body (Content-Length) remains to be read.
                    if rcv[-4:] == '\r\n\r\n':
                        req.load_header(rcv)
                        time_to_read = True
                        left_to_read = req.headers.get('content-length')
                        left_to_read = 0 if left_to_read == None else int(
                            left_to_read)

                    if len(rbd) >= left_to_read:
                        # End of the request: process it and reply.
                        req.load_body(rbd)
                        logging.warning("data dari client: {}".format(rcv))
                        hasil = httpserver.proses(req)
                        hasil = hasil + "\r\n\r\n"
                        logging.warning("balas ke  client: {}".format(hasil))
                        self.connection.sendall(hasil.encode())
                        rcv = ""
                        # NOTE(review): rbd is not reset here and the socket
                        # is closed while the loop keeps running — confirm
                        # whether keep-alive handling is intended.
                        self.connection.close()

                else:
                    break
            except OSError as e:
                # NOTE(review): silently swallowing socket errors can spin
                # this loop — confirm intended behavior.
                pass
        self.connection.close()
# ---- Esempio n. 6 (scraped example separator; vote count: 0) ----
from req import Request
from menu import Menu

# Module-level collaborators shared by the menu flow below.
requisicao = Request()
menu = Menu()

# Menu sentinel values used by Moedas.
INTERVALO_NAO_ZERO_OU_NEGATIVO = 0
VOLTAR = 9


class Moedas:
    def escolha_menu(self, option: int):
        """Dispatch a main-menu choice: 0 -> single currency, 1 -> interval.

        Raises KeyError for unknown options. Returns '' when the user
        enters the "back" sentinel (VOLTAR); otherwise the handler result.
        """
        self.option = option
        handlers = {
            0: lambda _unused: self.escolhas_moeda(self.tmp_moeda),
            1: lambda _unused: self.escolhas_moeda_intervalo(self.tmp_moeda),
        }
        if self.option not in handlers:
            raise KeyError
        print(menu.mostra_moedas_menu())
        self.tmp_moeda = int(input('Escolha:'))
        if self.tmp_moeda == VOLTAR:
            return ''
        return handlers[self.option](self.tmp_moeda)

    def escolhas_moeda(self, entrada_tipo_moeda: int):
        self.moeda = entrada_tipo_moeda
        self.tmp_moeda = ''
        self.moeda_dic = {
# ---- Esempio n. 7 (scraped example separator; vote count: 0) ----
 def __init__(self, save_path):
     """Initialize the notice-list handler with the CSV output directory."""
     self.base_url = "https://www.cn357.com"
     self.req = Request(logger)
     self.save_path = save_path
 def __init__(self, notice_queue, page_queue, *args, **kwargs):
     """Producer-thread init: wire the input/output queues and HTTP client."""
     super().__init__(*args, **kwargs)
     self.notices_queue = notice_queue
     self.pages_queue = page_queue
     self.req = Request(logger)
class PageConsumer(threading.Thread):
    """Consumer of listing pages; also produces model-detail hrefs.

    Pulls page hrefs from ``pages_queue``, parses each listing table into a
    DataFrame appended to ``table_name``, and pushes every model-detail href
    onto ``models_queue`` for downstream consumers.
    """

    def __init__(self, page_queue, models_queue, table_name, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.pages_queue = page_queue
        self.models_queue = models_queue
        self.engine = get_sql_conn()
        self.table_name = table_name
        self.req = Request(logger)

    def page_one(self, page):
        """Parse one listing page.

        Returns a DataFrame of the listing table with ``href`` and
        ``page_id`` columns added, or None when the request fails.
        (Previously annotated ``-> pd.DataFrame`` although None is possible.)
        """
        self.req.url = BASE_URL + page
        res = self.req.get()
        if not res or res.status_code != 200:
            return None

        html = etree.HTML(res.text)
        row_href = html.xpath(
            "//table[@class='listTable uiLinkList']/tr/td/a/@href")

        df = pd.read_html(res.text)[0]
        df['href'] = row_href

        # Page number: hrefs presumably look like "/notice_NNN" (page 1) or
        # "/notice_NNN_<page>" — confirm against Producer.pages, which
        # appends the "_<page>" suffix.
        sp_list = page.split('_')
        page_id = 1 if len(sp_list) == 2 else sp_list[-1]
        df['page_id'] = page_id
        df.drop_duplicates(inplace=True)
        # Feed every model-detail href to downstream consumers.
        for href in row_href:
            self.models_queue.put(href)

        return df

    def run(self) -> None:
        # empty()/get() across threads is racy; the timeout keeps a losing
        # thread from blocking forever on a drained queue.
        while not self.pages_queue.empty():
            page_id = self.pages_queue.get(timeout=10)
            try:
                time.sleep(random.uniform(1, 3))  # polite crawl delay
                df = self.page_one(page_id)
                if df is None:
                    # Request failed: previously this fell through to
                    # None.to_sql and surfaced as a misleading AttributeError.
                    logger.error(f"{page_id}-PageConsumer failure")
                    continue
                df.to_sql(self.table_name,
                          self.engine,
                          if_exists='append',
                          index=False)
                # Drop the frame promptly to keep memory flat across pages.
                del df
            except Exception as e:
                logger.error(f"{page_id}-PageConsumer failure")
                logger.exception("{}:{}".format(page_id, str(e)))

        logger.debug('PageConsumer finished!!!')

        # Release pooled DB connections when this thread is done.
        try:
            self.engine.dispose()
        except Exception as e:
            logger.warning(str(e))