def __init__(self, page_queue, models_queue, table_name, *args, **kwargs):
    """Wire this worker thread to its queues, DB engine, and target table.

    page_queue   -- queue of listing-page paths to consume
    models_queue -- queue that receives model-detail hrefs
    table_name   -- destination SQL table for parsed rows
    """
    super().__init__(*args, **kwargs)
    # Destination table and a per-thread DB engine.
    self.table_name = table_name
    self.engine = get_sql_conn()
    # Shared work queues.
    self.pages_queue = page_queue
    self.models_queue = models_queue
    # HTTP helper bound to the module logger.
    self.req = Request(logger)
class NoticeListHandle(object):
    """Crawl cn357.com public vehicle-model notice batches into one CSV.

    ``start()`` fetches the notice index; each notice's listing pages are
    parsed with pandas and appended to a single CSV under ``save_path``.
    """

    def __init__(self, save_path):
        self.base_url = "https://www.cn357.com"
        self.req = Request(logger)
        self.save_path = save_path

    def single_notice(self, notice):
        """Fetch every page of one notice batch and append the rows to the CSV."""
        save_filename = os.path.join(self.save_path, "公众型号各批次数据.csv")
        df, max_page = self.page_one(notice)
        if df is None:
            # Request failed; nothing to persist for this notice.
            return
        logger.debug(
            f"url:{self.req.url},max page:{max_page}, page item:{len(df)}")
        # Append when the file already holds earlier batches; the previous
        # unconditional default mode ('w') overwrote every batch but the last.
        first_write = not os.path.exists(save_filename)
        df.to_csv(save_filename, mode='w' if first_write else 'a',
                  encoding='utf-8-sig', index=False, header=first_write)
        for i in range(2, int(max_page) + 1):
            df_tmp, _ = self.page_one(notice + f"_{i}")
            if df_tmp is None:
                continue
            df_tmp.to_csv(save_filename, mode='a', encoding='utf-8-sig',
                          index=False, header=False)

    def page_one(self, notice):
        """Fetch and parse one listing page.

        Returns ``(DataFrame, max_page)`` on success, ``(None, 0)`` on a
        failed request. ``max_page`` is 0 when the page shows no pager
        (single-page notice).
        """
        self.req.url = self.base_url + notice
        res = self.req.get()
        if not res or res.status_code != 200:
            logger.error(f"{self.req.url}:{res}")
            return None, 0
        html = etree.HTML(res.text)
        row_href = html.xpath(
            "//table[@class='listTable uiLinkList']/tr/td/a/@href")
        # Second-to-last pager link carries the last page number; a
        # single-page notice has no pager, so guard the [-2] index
        # (the old code raised IndexError there).
        page_list = html.xpath("//span[@class='pageList']/a/text()")
        max_page = 0 if len(page_list) < 2 else page_list[-2]
        df = pd.read_html(res.text)[0]
        df['href'] = row_href
        return df, max_page

    def start(self):
        """Entry point: fetch the notice index and crawl each notice batch."""
        self.req.url = self.base_url + "/notice_list/"
        res = self.req.get()
        if not res or res.status_code != 200:
            logger.error(f"{self.req.url}:{res}")
            return
        # Parse the index HTML for notice hrefs.
        html = etree.HTML(res.text)
        result = html.xpath(
            "//div[@class='lotList uiLinkList clear']//a/@href")
        for notice in result:
            self.single_notice(notice)
def req_notice_list(queue: Queue):
    """Fetch the public-model notice index and push each notice href onto *queue*."""
    req = Request(logger)
    req.url = BASE_URL + "/notice_list/"
    res = req.get()
    # Bail out (with a log entry) on any failed or non-200 response.
    if not res or res.status_code != 200:
        logger.error(f"{req.url}:{res}")
        return
    tree = etree.HTML(res.text)
    hrefs = tree.xpath("//div[@class='lotList uiLinkList clear']//a/@href")
    for href in hrefs:
        queue.put(href)
class Producer(threading.Thread):
    """Producer of page tasks.

    For each notice batch taken from ``notices_queue``, discovers how many
    listing pages it has and enqueues one task per page on ``pages_queue``.
    """

    def __init__(self, notice_queue, page_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.notices_queue = notice_queue
        self.pages_queue = page_queue
        self.req = Request(logger)

    def run(self) -> None:
        from queue import Empty  # local import: top-of-file imports not visible here
        while not self.notices_queue.empty():
            try:
                # empty()/get() races with sibling threads; catching Empty
                # keeps the thread from dying with an uncaught exception
                # (the old get() sat outside any try block).
                notice = self.notices_queue.get(timeout=10)
            except Empty:
                break
            try:
                # Polite random delay between requests.
                time.sleep(random.uniform(1, 3))
                self.pages(notice)
            except Exception as e:
                logger.exception("{}:{}".format(notice, str(e)))
        logger.debug('Producer finished!!!')

    def pages(self, notice):
        """Enqueue the first page plus every additional page of one notice batch."""
        self.pages_queue.put(notice)
        max_page = self.page_one(notice)
        logger.debug(f"url:{self.req.url},max page:{max_page}")
        # Pages beyond the first are addressed as "<notice>_<n>".
        for i in range(2, int(max_page) + 1):
            self.pages_queue.put(notice + f"_{i}")

    def page_one(self, page) -> int:
        """Fetch one listing page and return its maximum page number.

        Returns 0 on a failed request or when the page has no pager
        (single-page notice). The old annotation (-> pd.DataFrame) was wrong:
        this method has always returned a page count, not a DataFrame.
        """
        self.req.url = BASE_URL + page
        res = self.req.get()
        # Guard the response before touching res.text (the old code crashed
        # on a None / non-200 response).
        if not res or res.status_code != 200:
            logger.error(f"{self.req.url}:{res}")
            return 0
        html = etree.HTML(res.text)
        # Second-to-last pager link is the last page number; single-page
        # notices render no pager at all.
        page_list = html.xpath("//span[@class='pageList']/a/text()")
        return 0 if len(page_list) < 2 else int(page_list[-2])
def run(self):
    """Read one HTTP request from ``self.connection`` and send back the reply.

    Accumulates decoded header bytes in ``rcv`` until the blank line
    (CRLF CRLF), parses them via ``req.load_header``, then switches to body
    mode and accumulates ``rbd`` until ``content-length`` bytes have arrived.
    The assembled request is handed to ``httpserver.proses`` and its result
    written back to the client.
    """
    rcv = ""  # decoded header bytes received so far
    rbd = ""  # decoded body bytes received so far
    time_to_read = False  # False: reading headers; True: reading body
    # Body bytes still expected; the initial 1 keeps the "body complete"
    # check below False until the header has actually been parsed.
    left_to_read = 1
    req = Request()
    while True:
        try:
            data = self.connection.recv(32)
            if data:
                d = data.decode()
                if time_to_read:
                    print(rbd)
                    rbd += d
                else:
                    rcv += d
                    # Blank line terminates the header section.
                    if rcv[-4:] == '\r\n\r\n':
                        req.load_header(rcv)
                        time_to_read = True
                        left_to_read = req.headers.get('content-length')
                        # Missing content-length means no body is expected.
                        left_to_read = 0 if left_to_read == None else int(
                            left_to_read)
                if len(rbd) >= left_to_read:
                    # end of command, process the string
                    req.load_body(rbd)
                    logging.warning("data dari client: {}".format(rcv))
                    hasil = httpserver.proses(req)
                    hasil = hasil + "\r\n\r\n"
                    logging.warning("balas ke client: {}".format(hasil))
                    self.connection.sendall(hasil.encode())
                    rcv = ""
                    # NOTE(review): the socket is closed here but the loop
                    # keeps calling recv(); later OSErrors are swallowed
                    # below, so this thread may spin after a response —
                    # confirm the intended connection lifecycle.
                    self.connection.close()
            else:
                # Peer closed the connection: stop reading.
                break
        except OSError as e:
            # Best-effort: socket errors are ignored and the loop continues.
            pass
    self.connection.close()
from req import Request from menu import Menu requisicao = Request() menu = Menu() INTERVALO_NAO_ZERO_OU_NEGATIVO = 0 VOLTAR = 9 class Moedas: def escolha_menu(self, option: int): self.option = option escolha_dic = { 0: lambda temp: self.escolhas_moeda(self.tmp_moeda), 1: lambda temp: self.escolhas_moeda_intervalo(self.tmp_moeda), } if self.option not in escolha_dic.keys(): raise KeyError else: print(menu.mostra_moedas_menu()) self.tmp_moeda = int(input('Escolha:')) if self.tmp_moeda == VOLTAR: return '' else: return escolha_dic[self.option](self.tmp_moeda) def escolhas_moeda(self, entrada_tipo_moeda: int): self.moeda = entrada_tipo_moeda self.tmp_moeda = '' self.moeda_dic = {
def __init__(self, save_path):
    """Remember the output directory and set up the crawl target and HTTP helper."""
    self.save_path = save_path
    # Site root every notice path is joined onto.
    self.base_url = "https://www.cn357.com"
    # HTTP helper bound to the module logger.
    self.req = Request(logger)
def __init__(self, notice_queue, page_queue, *args, **kwargs):
    """Wire the producer thread to its input (notices) and output (pages) queues."""
    super().__init__(*args, **kwargs)
    # HTTP helper bound to the module logger.
    self.req = Request(logger)
    # Input: notice batches to expand. Output: one task per listing page.
    self.notices_queue = notice_queue
    self.pages_queue = page_queue
class PageConsumer(threading.Thread):
    """Consumer of listing pages; also produces model-detail tasks.

    Pops page paths off ``pages_queue``, parses each listing table into a
    DataFrame, appends the rows to ``table_name`` via SQL, and pushes every
    model-detail href found on the page onto ``models_queue``.
    """

    def __init__(self, page_queue, models_queue, table_name, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.pages_queue = page_queue
        self.models_queue = models_queue
        self.engine = get_sql_conn()
        self.table_name = table_name
        self.req = Request(logger)

    def page_one(self, page) -> pd.DataFrame:
        """Parse one listing page into a DataFrame.

        Returns None on a failed or non-200 response. Side effect: every
        model-detail href found is pushed onto ``models_queue``.
        """
        self.req.url = BASE_URL + page
        res = self.req.get()
        if not res or res.status_code != 200:
            return None
        html = etree.HTML(res.text)
        row_href = html.xpath(
            "//table[@class='listTable uiLinkList']/tr/td/a/@href")
        df = pd.read_html(res.text)[0]
        df['href'] = row_href
        # Page number: the first page is the bare notice path; later pages
        # carry a "_<n>" suffix appended by the producer. Cast to int so the
        # SQL column holds one type instead of mixing int 1 with str "2","3",…
        # (assumes the bare notice path itself contains exactly one '_' —
        # TODO confirm against the notice index hrefs).
        sp_list = page.split('_')
        page_id = 1 if len(sp_list) == 2 else int(sp_list[-1])
        df['page_id'] = page_id
        df.drop_duplicates(inplace=True)
        # Feed the detail-page consumers.
        for href in row_href:
            self.models_queue.put(href)
        return df

    def run(self) -> None:
        from queue import Empty  # local import: top-of-file imports not visible here
        while not self.pages_queue.empty():
            try:
                # empty()/get() races with sibling consumers; catching Empty
                # lets the thread exit cleanly instead of dying with an
                # uncaught exception (the old get() sat outside any try).
                page_id = self.pages_queue.get(timeout=10)
            except Empty:
                break
            try:
                # Polite random delay between requests.
                time.sleep(random.uniform(1, 3))
                df = self.page_one(page_id)
                if df is None:
                    # Failed request: nothing to persist. The old code fell
                    # through to None.to_sql and raised AttributeError.
                    logger.error(f"{page_id}-PageConsumer failure")
                    continue
                df.to_sql(self.table_name, self.engine,
                          if_exists='append', index=False)
                del df
            except Exception as e:
                # Record the failed page so it can be retried later.
                logger.error(f"{page_id}-PageConsumer failure")
                logger.exception("{}:{}".format(page_id, str(e)))
        logger.debug('PageConsumer finished!!!')
        try:
            self.engine.dispose()
        except Exception as e:
            logger.warning(str(e))