import os
import re
import logging

from bs4 import BeautifulSoup

# Project helpers (getdatefrompath, parse_author, parse_body, parse_url,
# strip_n, d2sql, get_request, get_special_date, get_title, get_twitter_info,
# get_weixin_info, get_github_info) are assumed to come from the project's
# own utility modules.


def parse_item(fname):
    """
    Parse a single saved weibo page.
    :param fname: path of the saved HTML file
    :return: yields (day, tag, url, title, root_domain, domain, url_path, weibo_id)
    """
    cur_day = getdatefrompath(fname)
    if cur_day is None:
        return
    if not os.path.exists(fname):
        return
    with open(fname, mode='rb') as html_hd:
        soup = BeautifulSoup(html_hd, "lxml")
        for div in soup.find_all(id='singleweibo'):
            weibo_id = ""
            weibo_author = div.find(id="singleweiboauthor")
            if weibo_author and weibo_author.p:
                try:
                    weibo_id = parse_author(weibo_author.p.text, cur_day)
                except Exception as e:
                    logging.error("[PARSE_AUTHOR_FAILED]: %s %s %s"
                                  % (cur_day, str(e), weibo_author.p))
            weibo_body = div.find(id="singleweibobody")
            if not weibo_body:
                continue
            try:
                r = parse_body(weibo_body.p, cur_day)
                if r:
                    tag, urls, title = r[0], r[1], r[2]
                    for url in urls or []:
                        o, ext = parse_url(url)
                        domain = o.netloc
                        url_path = o.path
                        root_domain = ext.domain + "." + ext.suffix
                        yield (cur_day, tag, url, title,
                               root_domain, domain, url_path, weibo_id)
            except Exception as e:
                logging.error("[PARSE_BODY_FAILED]: %s %s %s"
                              % (cur_day, str(e), weibo_body.p))
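# A minimal driver sketch for the file-based parse_item() above: walk a
# directory of saved pages and dump every extracted tuple. The directory
# name "html" and this helper's name are illustrative assumptions, not
# part of the original code.
def demo_parse_saved_pages(page_dir="html"):
    for name in sorted(os.listdir(page_dir)):
        for row in parse_item(os.path.join(page_dir, name)):
            print("\t".join(str(col) for col in row))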
def parse_item(html_hd):
    """
    Parse a single weekly page.
    :param html_hd: open file handle (or raw HTML) of the page
    :return: yields (ts, tag, url, title, root_domain, domain, url_path)
    """
    soup = BeautifulSoup(html_hd, "lxml")
    # The issue date range, e.g. "2014/03/03-2014/03/09", sits in a
    # <blockquote>; keep the first date as the timestamp.
    day = soup.find("blockquote").text
    m = re.search(r'(\d{4})/(\d{2})/(\d{2})', day)
    if not m:
        return
    ts = m.group(1) + m.group(2) + m.group(3)
    page = soup.find(id="content")
    for div in page.find_all("div", class_='single'):
        sts = div.stripped_strings
        tag = next(sts)  # next(), not the Python 2-only .next()
        if "[" in tag:
            tag = tag[1:-1]  # strip the surrounding brackets from the tag
        title = next(sts)
        for a in div.find_all("a"):
            url = a["href"]
            o, ext = parse_url(url)
            domain = o.netloc
            url_path = o.path
            root_domain = ext.domain + "." + ext.suffix
            yield (ts, tag, url, title, root_domain, domain, url_path)
def scraw(so, proxy=None, delta=3):
    """
    Scrape the sec-wiki daily list and store fresh entries.
    :param so: sqlite cursor/connection used to execute the generated SQL
    :param proxy: optional proxy handed down to the fetch helpers
    :param delta: how many days back (including today) to keep
    """
    # Dates of the last `delta` days, newest first, e.g. ["2019-03-04", ...].
    ts_list = [get_special_date(d, format="%Y-%m-%d")
               for d in range(0, -delta, -1)]
    url = "https://www.sec-wiki.com/?2019-03-04"
    r = get_request(url)
    if not r:
        return
    try:
        soup = BeautifulSoup(r.content, 'lxml')
    except Exception as e:
        logging.error("GET %s failed : %s" % (url, repr(e)))
        return
    # The page lists entries for several days; keep only rows whose date
    # falls inside ts_list.
    for row in soup.find_all("span", class_='dropcap'):
        cur_ts = row.get_text()
        if cur_ts not in ts_list:
            continue
        a = row.next_sibling
        if not a:
            continue
        url = a.get("href")
        o, ext = parse_url(url)
        domain = o.netloc
        cur_ts = re.sub("-", "", cur_ts)
        title = strip_n(a.get_text())
        overview = {
            'ts': cur_ts,
            'url': url,
            'title': title,
            'domain': domain,
            'domain_name': str(get_title(domain, proxy=proxy)),
        }
        sql = d2sql(overview, table="secwiki_today_detail",
                    action="INSERT OR IGNORE ")
        if sql:
            try:
                so.execute(sql)
            except Exception as e:
                logging.error("[secwiki_today_sql]: sql(%s) error(%s)"
                              % (sql, str(e)))
        print("{ts}\t{url}\t{title}\t{domain}\t{domain_name}".format(**overview))
        # Enrich well-known sources with their own detail tables.
        tag = overview.get("tag", "")
        sql = ""
        if "://twitter.com" in url:
            d = get_twitter_info(url, title, ts=cur_ts, tag=tag, proxy=proxy)
            if d:
                sql = d2sql(d, table="twitter")
        elif "weixin.qq.com" in url:
            d = get_weixin_info(url, cur_ts, tag)
            if d:
                sql = d2sql(d, table="weixin")
        elif "//github.com" in url:
            d = get_github_info(url, title, ts=cur_ts, tag=tag)
            if d:
                sql = d2sql(d, table='github')
        if sql:
            try:
                so.execute(sql)
            except Exception as e:
                logging.error("[sql]: %s %s" % (sql, str(e)))
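# A minimal sketch of wiring scraw() above to sqlite3. The database file
# name "secwiki.db" and this helper's name are assumptions; the target
# tables (secwiki_today_detail, twitter, weixin, github) are expected to
# exist already.
def demo_scraw(db_path="secwiki.db"):
    import sqlite3
    conn = sqlite3.connect(db_path)
    try:
        scraw(conn.cursor(), proxy=None, delta=3)  # last three days
        conn.commit()
    finally:
        conn.close()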
def parse_item(fname, so=None, proxy=None):
    """
    Parse a single saved weibo page and store per-source details.
    :param fname: path of the saved HTML file
    :param so: optional sqlite cursor/connection for the detail tables
    :param proxy: optional proxy handed down to the fetch helpers
    :return: yields (day, tag, url, title, root_domain, domain, url_path, weibo_id)
    """
    cur_day = getdatefrompath(fname)
    if cur_day is None:
        return
    if not os.path.exists(fname):
        return
    with open(fname, mode='rb') as html_hd:
        soup = BeautifulSoup(html_hd, "lxml")
        for div in soup.find_all(id='singleweibo'):
            weibo_id = ""
            weibo_author = div.find(id="singleweiboauthor")
            if weibo_author and weibo_author.p:
                try:
                    weibo_id = parse_author(weibo_author.p.text, cur_day)
                except Exception as e:
                    logging.error("[PARSE_AUTHOR_FAILED]: %s %s %s"
                                  % (cur_day, str(e), weibo_author.p))
            weibo_body = div.find(id="singleweibobody")
            if not weibo_body:
                continue
            try:
                r = parse_body(weibo_body.p, cur_day)
                if not r:
                    continue
                tag, urls, title = r[0], r[1], r[2]
                for url in urls or []:
                    o, ext = parse_url(url)
                    domain = o.netloc
                    url_path = o.path
                    root_domain = ext.domain + "." + ext.suffix
                    result = (cur_day, tag, url, title,
                              root_domain, domain, url_path, weibo_id)
                    title = strip_n(title)
                    # Enrich well-known sources with their own detail tables.
                    sql = ""
                    if "://twitter.com" in url:
                        d = get_twitter_info(url, title, ts=cur_day,
                                             tag=tag, proxy=proxy)
                        if d:
                            sql = d2sql(d, table="twitter")
                    elif "weixin.qq.com" in url:
                        d = get_weixin_info(url, cur_day, tag)
                        if d:
                            sql = d2sql(d, table="weixin")
                    elif "//github.com" in url:
                        d = get_github_info(url, title, ts=cur_day, tag=tag)
                        if d:
                            sql = d2sql(d, table='github')
                    if sql and so is not None:
                        try:
                            so.execute(sql)
                        except Exception as e:
                            logging.error("[sql]: %s %s" % (sql, str(e)))
                    yield result
            except Exception as e:
                logging.error("[PARSE_BODY_FAILED]: %s %s %s"
                              % (cur_day, str(e), weibo_body.p))
def parse_item(html_hd, so=None, proxy=None):
    """
    Parse a single weekly page, refresh cached domain names, and store
    per-source details.
    :param html_hd: open file handle (or raw HTML) of the page
    :param so: optional sqlite cursor/connection
    :param proxy: optional proxy handed down to the fetch helpers
    :return: yields (ts, tag, url, title, root_domain, domain, url_path)
    """
    soup = BeautifulSoup(html_hd, "lxml")
    # The issue date range, e.g. "2014/03/03-2014/03/09", sits in a
    # <blockquote>; keep the first date as the timestamp.
    day = soup.find("blockquote").text
    m = re.search(r'(\d{4})/(\d{2})/(\d{2})', day)
    if not m:
        return
    ts = m.group(1) + m.group(2) + m.group(3)
    page = soup.find(id="content")
    for div in page.find_all("div", class_='single'):
        sts = div.stripped_strings
        tag = next(sts)
        if "[" in tag:
            tag = tag[1:-1]  # strip the surrounding brackets from the tag
        title = next(sts)
        for a in div.find_all("a"):
            url = a["href"]
            o, ext = parse_url(url)
            domain = o.netloc
            url_path = o.path
            root_domain = ext.domain + "." + ext.suffix
            title = strip_n(title)
            # Look up a human-readable name for the domain and cache it
            # back into secwiki_detail.
            domain_name = ""
            try:
                domain_name = get_title(domain)
            except Exception as e:
                logging.error("[get_domain_name]: %s %s" % (domain_name, str(e)))
            if domain_name and so is not None:
                # Drop double/single quotes so the name can be embedded
                # into the SQL string literal.
                domain_name = re.sub('\x22', '', domain_name)
                domain_name = re.sub('\x27', '', domain_name)
                update_sql = ("update {table} set domain_name='{title}' "
                              "where domain='{domain}';").format(
                                  table="secwiki_detail",
                                  title=domain_name,
                                  domain=domain)
                try:
                    so.execute(update_sql)
                    print(update_sql)
                except Exception as e:
                    logging.error("[update_sql]: %s %s" % (update_sql, str(e)))
            # Enrich well-known sources with their own detail tables.
            sql = ""
            if "://twitter.com" in url:
                d = get_twitter_info(url, title, ts=ts, tag=tag, proxy=proxy)
                if d:
                    sql = d2sql(d, table="twitter")
            elif "weixin.qq.com" in url:
                d = get_weixin_info(url, ts, tag)
                if d:
                    sql = d2sql(d, table="weixin")
            elif "//github.com" in url:
                d = get_github_info(url, title, ts=ts, tag=tag)
                if d:
                    sql = d2sql(d, table='github')
            if sql and so is not None:
                try:
                    print(sql)
                    so.execute(sql)
                except Exception as e:
                    logging.error("[sql]: %s %s" % (sql, str(e)))
            yield (ts, tag, url, title, root_domain, domain, url_path)
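# A minimal sketch of running the db-backed parse_item() above over one
# saved weekly page. "weekly.html", "secwiki.db" and this helper's name are
# illustrative assumptions; secwiki_detail and the per-source tables are
# expected to exist already. The generator must be exhausted for the SQL
# side effects to run, which the for-loop below does.
def demo_parse_weekly(page="weekly.html", db_path="secwiki.db"):
    import sqlite3
    conn = sqlite3.connect(db_path)
    try:
        with open(page, mode="rb") as fh:
            for row in parse_item(fh, so=conn.cursor()):
                print("\t".join(row))
        conn.commit()
    finally:
        conn.close()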