def scraw(so, proxy=None, delta=3):
    """Scrape the sec-wiki daily page and store new entries.

    Fetches the sec-wiki index page, keeps only rows whose date falls
    within the last `delta` days, inserts an overview row into
    ``secwiki_today_detail`` and, for twitter/weixin/github links, a
    detail row into the matching table.

    :param so: DB wrapper exposing ``execute(sql)``.
    :param proxy: optional proxy passed to the title/twitter fetchers.
    :param delta: how many days back (including today) to accept.
    :return: None
    """
    # Dates rendered the way the page shows them ("%Y-%m-%d").
    # NOTE: loop variable renamed from `delta` — the original shadowed
    # (and, in Py2, clobbered) the `delta` parameter.
    ts_list = [
        get_special_date(d, format="%Y-%m-%d")
        for d in range(0, 0 - delta, -1)
    ]
    # NOTE(review): URL carries a hard-coded date query ("?2019-03-04");
    # presumably the page shows recent items regardless — confirm.
    url = "https://www.sec-wiki.com/?2019-03-04"
    r = get_request(url)
    if r:
        try:
            soup = BeautifulSoup(r.content, 'lxml')
        except Exception as e:
            logging.error("GET %s failed : %s" % (url, repr(e)))
            return
        if soup:
            rows = soup.find_all("span", class_='dropcap')
            if rows:
                for row in rows:
                    if row:
                        cur_ts = row.get_text()
                        if cur_ts in ts_list:
                            a = row.next_sibling
                            if a:
                                url = a.get("href")
                                o, ext = parse_url(url)
                                domain = o.netloc
                                # DB stores compact dates (YYYYMMDD).
                                cur_ts = re.sub("-", "", cur_ts)
                                title = strip_n(a.get_text())
                                overview = {}
                                overview['ts'] = cur_ts
                                overview['url'] = url
                                overview['title'] = title
                                overview['domain'] = domain
                                overview["domain_name"] = \
                                    str(get_title(overview["domain"],
                                                  proxy=proxy))
                                if overview:
                                    sql = d2sql(overview,
                                                table="secwiki_today_detail",
                                                action="INSERT OR IGNORE ")
                                    if sql:
                                        try:
                                            so.execute(sql)
                                        except Exception as e:
                                            logging.error(
                                                "[secwiki_today_sql]: "
                                                "sql(%s) error(%s)"
                                                % (sql, str(e)))
                                    st = "{ts}\t{url}" \
                                         "\t{title}\t{domain}\t{domain_name}".format(
                                             ts=overview.get("ts"),
                                             domain=overview.get("domain"),
                                             title=overview.get("title"),
                                             domain_name=overview.get("domain_name"),
                                             url=overview.get("url")
                                         )
                                    # Parenthesized single-arg print works
                                    # identically under Py2 and Py3.
                                    print(st)
                                    url = overview.get("url")
                                    ts = overview.get("ts")
                                    tag = overview.get("tag", "")
                                    title = overview.get("title")
                                    sql = ""
                                    # Per-site detail fetch + insert.
                                    if url.find("://twitter.com") != -1:
                                        d = get_twitter_info(url, title,
                                                             ts=ts, tag=tag,
                                                             proxy=proxy)
                                        if d:
                                            sql = d2sql(d, table="twitter")
                                    elif url.find("weixin.qq.com") != -1:
                                        d = get_weixin_info(url, ts, tag)
                                        if d:
                                            sql = d2sql(d, table="weixin")
                                    elif url.find("//github.com") != -1:
                                        d = get_github_info(url, title,
                                                            ts=ts, tag=tag)
                                        if d:
                                            sql = d2sql(d, table='github')
                                    if sql:
                                        try:
                                            so.execute(sql)
                                        except Exception as e:
                                            logging.error("[sql]: %s %s"
                                                          % (sql, str(e)))
def get_network_id(so, source="weixin", proxy=None, retry=3, timeout=10):
    """Re-fetch per-source details for URLs already stored in the DB.

    Selects distinct rows matching the source's URL keyword from both
    ``secwiki_detail`` and ``xuanwu_detail``, re-fetches the detail
    record and REPLACEs it into the per-source table.

    :param so: DB wrapper exposing ``query(sql)`` and ``execute(sql)``.
    :param source: one of weixin/github/twitter/zhihu/weibo/medium.
    :param proxy: proxy forwarded to the twitter fetcher.
    :param retry: retry count forwarded to the twitter fetcher.
    :param timeout: timeout forwarded to the twitter fetcher.
    :return: None
    """
    if source == "weixin":
        keyword = "://mp.weixin.qq.com/"
    elif source == "github":
        keyword = "https://github.com/"
    elif source == "twitter":
        keyword = "//twitter.com"
    elif source == "zhihu":
        keyword = "://zhuanlan.zhihu.com"
    elif source == "weibo":
        keyword = "://weibo.com"
    elif source == "medium":
        keyword = "medium.com"
    else:
        # Unknown source: nothing to do.
        return
    # get urls
    for info_source in ["secwiki", "xuanwu"]:
        sql = ("select distinct url,title,ts,tag from {source}_detail "
               "where url like '%{keyword}%'").format(
                   keyword=keyword, source=info_source)
        result = so.query(sql)
        for item in result:
            url = item[0]
            title = item[1]
            ts = item[2]
            tag = item[3]
            # Some rows hold two concatenated URLs; keep only the first.
            pos = url.find('http', 2)
            if pos != -1:
                # BUGFIX: original sliced the row tuple (item[0:pos]),
                # yielding a tuple that later blew up in re.sub and
                # silently skipped the row.
                url = url[0:pos]
            if not url:
                continue
            # Strip quotes so the value is safe to embed in SQL below.
            try:
                url = re.sub('\x22', '', url)
                url = re.sub('\x27', '', url)
            except Exception as e:
                logging.error("[URL_ERROR]: url(%s) e(%s)" % (url, str(e)))
                continue
            try:
                title = re.sub('\x22', '', title)
                title = re.sub('\x27', '', title)
            except Exception as e:
                # BUGFIX: original logged `url` in the title error slot.
                logging.error("[title_ERROR]: title(%s) e(%s)"
                              % (title, str(e)))
                continue
            update_sql = ""
            if source == "weixin":
                details = get_weixin_info(url, ts=ts, tag=tag)
                if details:
                    update_sql = d2sql(details, table='weixin',
                                       action='replace')
            elif source == "github":
                details = get_github_info(url, title, ts=ts, tag=tag)
                if details:
                    update_sql = d2sql(details, table='github',
                                       action='replace')
            elif source == "twitter":
                details = get_twitter_info(url, title, ts=ts, tag=tag,
                                           proxy=proxy, retry=retry,
                                           timeout=timeout)
                if details:
                    update_sql = d2sql(details, table='twitter',
                                       action='replace')
            else:
                # zhihu/weibo/medium have no detail fetcher yet: just echo.
                print(url)
                print(title)
            if update_sql:
                try:
                    print(update_sql)
                    so.execute(update_sql)
                except Exception as e:
                    logging.error("[replace_failed]: %s e(%s)"
                                  % (update_sql, str(e)))
def parse_item(fname, so=None, proxy=None):
    """Parse one saved weibo HTML file and yield one record per URL.

    Each yielded record is
    ``(cur_day, tag, url, title, root_domain, domain, url_path, weibo_id)``.
    As a side effect, twitter/weixin/github links get a detail row
    inserted through ``so.execute``.

    :param fname: path to the saved HTML page; its path encodes the date.
    :param so: DB wrapper exposing ``execute(sql)``.
    :param proxy: proxy forwarded to the twitter fetcher.
    :return: generator of result tuples.
    """
    cur_day = getdatefrompath(fname)
    if cur_day is None:
        return
    if os.path.exists(fname):
        # BUGFIX: `with` guarantees the handle is closed even when
        # parsing raises (the original leaked it on exceptions).
        with open(fname, mode='rb') as html_hd:
            soup = BeautifulSoup(html_hd, "lxml")
            divs = soup.find_all(id='singleweibo')
            for div in divs:
                if div:
                    weibo_id = ""
                    weibo_author = div.find(id="singleweiboauthor")
                    if weibo_author:
                        if weibo_author.p:
                            try:
                                weibo_id = parse_author(weibo_author.p.text,
                                                        cur_day)
                            except Exception as e:
                                logging.error(
                                    "[PARSE_AUTHOR_FAILED]: %s %s %s"
                                    % (cur_day, str(e), weibo_author.p))
                    weibo_body = div.find(id="singleweibobody")
                    if weibo_body:
                        try:
                            r = parse_body(weibo_body.p, cur_day)
                            if r:
                                tag = r[0]
                                urls = r[1]
                                title = r[2]
                                if urls:
                                    for url in urls:
                                        o, ext = parse_url(url)
                                        domain = o.netloc
                                        url_path = o.path
                                        root_domain = (ext.domain + "."
                                                       + ext.suffix)
                                        result = (cur_day, tag, url, title,
                                                  root_domain, domain,
                                                  url_path, weibo_id)
                                        title = strip_n(title)
                                        sql = ""
                                        # Per-site detail fetch + insert.
                                        if url.find("://twitter.com") != -1:
                                            d = get_twitter_info(
                                                url, title, ts=cur_day,
                                                tag=tag, proxy=proxy)
                                            if d:
                                                sql = d2sql(d,
                                                            table="twitter")
                                        elif url.find("weixin.qq.com") != -1:
                                            d = get_weixin_info(url, cur_day,
                                                                tag)
                                            if d:
                                                sql = d2sql(d,
                                                            table="weixin")
                                        elif url.find("//github.com") != -1:
                                            d = get_github_info(
                                                url, title, ts=cur_day,
                                                tag=tag)
                                            if d:
                                                sql = d2sql(d,
                                                            table='github')
                                        if sql:
                                            try:
                                                so.execute(sql)
                                            except Exception as e:
                                                logging.error(
                                                    "[sql]: %s %s"
                                                    % (sql, str(e)))
                                        yield result
                        except Exception as e:
                            logging.error("[PARSE_BODY_FAILED]: %s %s %s"
                                          % (cur_day, str(e), weibo_body.p))
def scraw(so, proxy=None, delta=2):
    """Scrape https://sec.today/pulses/ and store recent entries.

    Parses each pulse card (English title, Chinese title, source domain,
    tags, timestamp), keeps entries dated within the last `delta` days,
    inserts an overview row into ``xuanwu_today_detail`` and, for
    twitter/weixin/github links, a detail row into the matching table.

    :param so: DB wrapper exposing ``execute(sql)``.
    :param proxy: optional proxy passed to helper fetchers.
    :param delta: how many days back (including today) to accept.
    :return: None
    """
    # Accepted dates in compact form ("%Y%m%d"); loop variable renamed
    # so it no longer shadows the `delta` parameter.
    ts_list = [
        get_special_date(d, format="%Y%m%d")
        for d in range(0, 0 - delta, -1)
    ]
    url = "https://sec.today/pulses/"
    r = get_request(url)
    if r:
        try:
            soup = BeautifulSoup(r.content, 'lxml')
        except Exception as e:
            logging.error("GET %s failed : %s" % (url, repr(e)))
            return
        if soup:
            rows = soup.find_all("div", class_='card-body')
            if rows:
                for row in rows:
                    if row:
                        overview = {}
                        card_title = row.find("h5", class_="card-title")
                        if card_title:
                            card_title_text = strip_n(card_title.get_text())
                            card_title_url = card_title.find(
                                "a", class_="text-dark").get("href")
                            overview["title_english"] = card_title_text
                            sec_url = "https://sec.today%s" % card_title_url
                            # Resolve the sec.today redirect to the real URL.
                            url_details = get_redirect_url(
                                sec_url, root_dir="data/sec_url",
                                issql=False, proxy=proxy)
                            if url_details:
                                overview["url"] = url_details.get("url")
                                overview["domain"] = url_details.get("domain")
                            else:
                                overview["url"] = sec_url
                        card_text_chinese = row.find("p",
                                                     class_="card-text my-1")
                        if card_text_chinese:
                            card_text_chinese = strip_n(
                                card_text_chinese.find("q").get_text())
                            overview["title"] = card_text_chinese
                        card_text = row.find(
                            "small", class_=re.compile(r"card-subtitle"))
                        if card_text:
                            card_text_domain = strip_n(card_text.get_text())
                            domain = parse_domain_tag(card_text_domain)
                            if domain:
                                overview["domain"] = domain
                                overview["domain_name"] = str(
                                    get_title(overview["domain"],
                                              proxy=proxy))
                            card_text_types = card_text.find_all(
                                "span", class_=re.compile(r"badge-tag"))
                            if card_text_types:
                                tags = []
                                for card_text_type in card_text_types:
                                    card_text_type = strip_n(
                                        card_text_type.get_text())
                                    if card_text_type:
                                        tags.append(card_text_type)
                                overview["tag"] = ",".join(tags)
                        card_text_ts = row.find("cite")
                        if card_text_ts:
                            card_text_ts = strip_n(card_text_ts.get_text())
                            domain_ts = parse_sec_today_url(card_text_ts)
                            if domain_ts:
                                domain, ts = domain_ts
                            else:
                                # No parsable date: assume today.
                                ts = get_special_date()
                            overview["ts"] = ts
                            # Skip entries outside the wanted date window.
                            if ts not in ts_list:
                                continue
                        if overview:
                            sql = d2sql(overview,
                                        table="xuanwu_today_detail",
                                        action="INSERT OR IGNORE ")
                            if sql:
                                try:
                                    so.execute(sql)
                                except Exception as e:
                                    logging.error(
                                        "[sec_total_sql]: sql(%s) error(%s)"
                                        % (sql, str(e)))
                            st = "{ts}\t{tag}\t{url}" \
                                 "\t{title}\t{title_english}\t{domain}\t{domain_name}".format(
                                     ts=overview.get("ts"),
                                     tag=overview.get("tag"),
                                     domain=overview.get("domain"),
                                     title=overview.get("title"),
                                     title_english=overview.get("title_english"),
                                     domain_name=overview.get("domain_name"),
                                     url=overview.get("url")
                                 )
                            print(st)
                            url = overview.get("url")
                            ts = overview.get("ts")
                            tag = overview.get("tag")
                            title = overview.get("title")
                            sql = ""
                            # BUGFIX: url is None when the card had no
                            # title anchor; guard before .find() to avoid
                            # an AttributeError.
                            if url:
                                if url.find("://twitter.com") != -1:
                                    d = get_twitter_info(url, title, ts=ts,
                                                         tag=tag,
                                                         proxy=proxy)
                                    if d:
                                        sql = d2sql(d, table="twitter")
                                elif url.find("weixin.qq.com") != -1:
                                    d = get_weixin_info(url, ts, tag)
                                    if d:
                                        sql = d2sql(d, table="weixin")
                                elif url.find("//github.com") != -1:
                                    d = get_github_info(url, title, ts=ts,
                                                        tag=tag)
                                    if d:
                                        sql = d2sql(d, table='github')
                            if sql:
                                try:
                                    so.execute(sql)
                                except Exception as e:
                                    logging.error("[sql]: %s %s"
                                                  % (sql, str(e)))
def parse_item(html_hd, so=None, proxy=None):
    """Parse one secwiki weekly page and yield one record per URL.

    Extracts the week's date from the page's blockquote, then for every
    link in each "single" div yields
    ``(ts, tag, url, title, root_domain, domain, url_path)``.
    Side effects: updates ``secwiki_detail.domain_name`` and inserts
    twitter/weixin/github detail rows through ``so.execute``.

    :param html_hd: open file handle (or markup) for BeautifulSoup.
    :param so: DB wrapper exposing ``execute(sql)``.
    :param proxy: proxy forwarded to the title/twitter fetchers.
    :return: generator of result tuples.
    """
    soup = BeautifulSoup(html_hd, "lxml")
    # find_day
    # The header shows a range like 2014/03/03-2014/03/09; take the first
    # date as the week's timestamp (YYYYMMDD).
    day = soup.find("blockquote").text
    p = re.compile(r'(\d{4})\/(\d{2})\/(\d{2})')
    m = re.search(p, day)
    if m:
        ts = m.group(1) + m.group(2) + m.group(3)
    else:
        return
    page = soup.find(id="content")
    for div in page.find_all("div", class_='single'):
        sts = div.stripped_strings
        # next(sts) is the Py2/Py3-portable spelling of sts.next().
        tag = next(sts)
        if tag.find("[") != -1:
            # Strip the surrounding brackets: "[tag]" -> "tag".
            tag = tag[1:-1]
        title = next(sts)
        # url
        for url in div.find_all("a"):
            url = url["href"]
            o, ext = parse_url(url)
            domain = o.netloc
            url_path = o.path
            root_domain = ext.domain + "." + ext.suffix
            title = strip_n(title)
            domain_name = ""
            try:
                # CONSISTENCY: sibling scrapers pass proxy to get_title.
                domain_name = get_title(domain, proxy=proxy)
            except Exception as e:
                logging.error("[get_domain_name]: %s %s"
                              % (domain_name, str(e)))
            if domain_name:
                # Strip quotes so the value is safe to embed in SQL.
                domain_name = re.sub('\x22', '', domain_name)
                domain_name = re.sub('\x27', '', domain_name)
                # FIXME(security): string-built SQL; should use a
                # parameterized query if the DB wrapper supports it.
                update_sql = ("update {table} set domain_name='{title}' "
                              "where domain='{domain}';").format(
                                  table="secwiki_detail",
                                  title=domain_name,
                                  domain=domain)
                try:
                    so.execute(update_sql)
                    print(update_sql)
                except Exception as e:
                    logging.error("[update_sql]: %s str(%s)"
                                  % (update_sql, str(e)))
            sql = ""
            # Per-site detail fetch + insert.
            if url.find("://twitter.com") != -1:
                d = get_twitter_info(url, title, ts=ts, tag=tag,
                                     proxy=proxy)
                if d:
                    sql = d2sql(d, table="twitter")
            elif url.find("weixin.qq.com") != -1:
                d = get_weixin_info(url, ts, tag)
                if d:
                    sql = d2sql(d, table="weixin")
            elif url.find("//github.com") != -1:
                d = get_github_info(url, title, ts=ts, tag=tag)
                if d:
                    sql = d2sql(d, table='github')
            if sql:
                try:
                    print(sql)
                    so.execute(sql)
                except Exception as e:
                    logging.error("[sql]: %s %s" % (sql, str(e)))
            result = (ts, tag, url, title, root_domain, domain, url_path)
            yield result