class FaceSpider:
    """Spider that pulls image URLs from the URL manager and saves the figures to disk."""

    def __init__(self):
        # Collaborators: HTML fetcher and the source of image URLs.
        self.downloader = HtmlDownloader()
        self.urlmanager = UrlManager()

    def crawl(self, path):
        """Save every image URL known to the URL manager under *path*."""
        pending = self.urlmanager.img_urls
        self.downloader.save_fig(pending, path)
def parse(self, url):
    """Fetch a proxy-list page at *url*, extract (ip, port, protocol) triples
    from its table cells and insert them into the iprecord table.

    Each table row spans 7 <td> cells; the fields of interest sit at
    offsets 0 (ip), 1 (port) and 3 (protocol) within each row.
    """
    # BUG FIX: the original called HtmlDownloader.download(url) unbound,
    # binding url as self; use the instance downloader like the other spiders.
    response = self.downloader.download(url)
    tree = etree.HTML(response)
    nodes = tree.xpath("//td/text()")
    # Step by 7 and stop early enough that nodes[start + 3] is always in
    # range (the original could IndexError on a trailing partial row).
    rows = [
        (nodes[start], nodes[start + 1], nodes[start + 3])
        for start in range(0, max(len(nodes) - 3, 0), 7)
    ]
    if not rows:
        # Nothing extracted: the original would have issued an invalid
        # "insert ... values" statement with no tuple list.
        return
    # NOTE(review): string-built SQL is injection-prone; kept because the
    # signature of self.db.execute is unknown here — TODO switch to a
    # parameterized executemany if the driver supports it.
    values = ",".join("('%s','%s','%s')" % row for row in rows)
    sql = "insert into iprecord(ip,port,protocol) values" + values
    print(sql)
    self.db.execute(sql)
class MenetSpider:
    """Spider that downloads one catalogue page of drug records and parses it
    into a pandas DataFrame."""

    # Column schema of the parsed records; also used for the failure placeholder.
    _COLUMNS = ['编码', "药品名称", "生产企业", "批文文号", "商品名", "剂型", "规格",
                "进口国产", "批准日期"]

    def __init__(self):
        # Collaborators: fetcher, URL source, HTML parser and result writer.
        self.downloader = HtmlDownloader()
        self.urlmanager = UrlManager()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, i):
        """Download and parse page *i*.

        Returns the parsed DataFrame on success; on any failure returns a
        single all-zero placeholder row so downstream concatenation keeps
        working.
        """
        try:
            print(f"Process {i} is running")
            url = self.urlmanager.get_new_url(i)
            html = self.downloader.download(url)
            return self.parser.parser(html)
        # Narrowed from a bare except: so KeyboardInterrupt/SystemExit still
        # propagate and abort the run.
        except Exception:
            print(f"crawl failed at {i}")
            # BUG FIX: pd.DataFrame([0]*9, columns=<9 names>) raises
            # ValueError (shape (9, 1) vs 9 columns) — the placeholder must
            # be one row of nine zeros, i.e. a nested list.
            return pd.DataFrame([[0] * 9], columns=self._COLUMNS)
def __init__(self):
    """Wire up the spider's collaborators."""
    # Fetches raw HTML / binary content for a URL.
    self.downloader = HtmlDownloader()
    # Supplies the URLs to crawl.
    self.urlmanager = UrlManager()
def __init__(self):
    """Wire up the full crawl pipeline: fetch, URL supply, parse, output."""
    # Fetches raw HTML for a URL.
    self.downloader = HtmlDownloader()
    # Supplies the URLs to crawl.
    self.urlmanager = UrlManager()
    # Extracts structured data from downloaded HTML.
    self.parser = HtmlParser()
    # Persists the parsed results.
    self.output = DataOutput()