def getmissdate():
    """
    Get the list of dates missing from the local archive.
    :return: list of date strings in %Y%m%d format
    """
    days = []
    cur_fname_lists = []
    for fname in glob.glob(r'data/xuanwu/*/*/*/index.html'):
        cur_fname_lists.append(fname)
    start, end = getstartendfrompath(cur_fname_lists)
    cur_day = get_special_date()
    print "cur_day({cur_day}), last_day({last_day})".format(cur_day=cur_day, last_day=end)
    if end != 0:
        last_day = datetime.datetime.strptime(str(end), '%Y%m%d')
        cur_day = datetime.datetime.strptime(str(cur_day), '%Y%m%d')
        delta = cur_day - last_day
        for i in range(1, delta.days + 1):
            n_day = last_day + datetime.timedelta(days=i)
            n_day = n_day.strftime('%Y%m%d')
            days.append(n_day)
    return days
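
# Hedged sketch: getstartendfrompath() is defined elsewhere in this repo.
# A minimal reconstruction of what getmissdate() appears to rely on, assuming
# archive paths look like data/xuanwu/<YYYY>/<MM>/<DD>/index.html and that
# (0, 0) signals an empty archive (getmissdate() checks `end != 0`). The
# underscore name marks this as an illustrative guess, not the real helper.
def _getstartendfrompath_sketch(fname_lists):
    days = []
    for fname in fname_lists:
        # pull the YYYY/MM/DD components out of each archive path
        m = re.search(r'(\d{4})[/\\](\d{2})[/\\](\d{2})', fname)
        if m:
            days.append(int("".join(m.groups())))
    if not days:
        return 0, 0
    return min(days), max(days)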
def update_github():
    """
    Commit and push the latest scrape results to GitHub.
    :return:
    """
    # %M is minutes; the original format string "%H:%m:%S" repeated the month field
    ts = get_special_date(format="%Y-%m-%d %H:%M:%S")
    cmd = "git add . && git commit -m '%s' && git push origin master" % ts
    ret = os.system(cmd)
    if ret != 0:
        print "%s failed" % cmd
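
# Hedged sketch: get_special_date() is imported from elsewhere in this repo.
# From its call sites in this file it appears to return today's date shifted
# by `delta` days, rendered with `format` (negative delta = days in the past,
# default %Y%m%d). A minimal guess at its behavior, not the real definition:
def _get_special_date_sketch(delta=0, format="%Y%m%d"):
    day = datetime.datetime.now() + datetime.timedelta(days=delta)
    return day.strftime(format)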
def draw_readme(year=None):
    """
    Render the monthly, yearly and all-time README reports.
    :return:
    """
    if year is None:
        year_month = get_special_date(delta=0, format="%Y%m")
        year_year = get_special_date(delta=0, format="%Y")
        year_all = str(year_year)[0:2]  # e.g. "20", the all-time report key
        for y in [year_month, year_year, year_all]:
            fpath = draw_readme_item(year=y)
            if len(str(y)) == 6:
                fpath_month = fpath
        fpath_default = "README.md"
        shutil.copyfile(fpath_month, fpath_default)
        print(fpath)
    else:
        # render the report for the specified month only
        fpath = draw_readme_item(year=year)
        print(fpath)
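
# Example: with the current month being 2019-03, the loop above renders three
# reports via draw_readme_item() below -- README_201903.md (monthly),
# README_2019.md (yearly) and README_20.md (the all-time report that the
# generated pages link to) -- and then copies the monthly report over README.md.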
def draw_readme(fpath=None):
    """
    :return:
    """
    # NOTE: this second draw_readme definition shadows the draw_readme(year=None)
    # above; it renders only the current month's report into README.md.
    if fpath is None:
        fpath = "README.md"
    tables_rets = []
    so = SQLiteOper("data/scrap.db")
    year = get_special_date(delta=0, format="%Y%m")

    # update the pie charts
    main_pie(year)

    # update the weixin/github tables
    sources = ["weixin", "github_org", "github_private"]
    d = {
        "weixin": "微信公众号",
        "github_org": "组织github账号",
        "github_private": "私人github账号"
    }
    for source in sources:
        rets = draw_table(so, top=100, source=source, year=year)
        if rets:
            markdown_rets = markdown_table(rets)
            if markdown_rets:
                tables_rets.append("# %s 推荐" % d.get(source, source))
                for markdown_ret in markdown_rets:
                    tables_rets.append(markdown_ret)
                tables_rets.append(os.linesep)

    with codecs.open(fpath, mode='wb') as fr:
        fr.write("# [数据年报](README_YEAR.md)")
        fr.write(os.linesep)
        fr.write('# %s 信息源与信息类型占比' % year)
        fr.write(os.linesep)
        fr.write(
            '![{year}-信息源占比-secwiki](data/img/domain/{year}-信息源占比-secwiki.png)'
            .format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        fr.write(
            '![{year}-信息源占比-xuanwu](data/img/domain/{year}-信息源占比-xuanwu.png)'
            .format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        fr.write(
            '![{year}-信息类型占比-secwiki](data/img/tag/{year}-信息类型占比-secwiki.png)'
            .format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        fr.write(
            '![{year}-信息类型占比-xuanwu](data/img/tag/{year}-信息类型占比-xuanwu.png)'
            .format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        fr.write(
            '![{year}-最喜欢语言占比](data/img/language/{year}-最喜欢语言占比.png)'
            .format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        st = os.linesep.join(tables_rets)
        fr.write(st)
        fr.write(os.linesep)
        fr.write(os.linesep)
        fr.write('# 日更新程序')
        fr.write(os.linesep)
        fr.write('`python update_daily.py`')
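
# Hedged sketch: markdown_table() is defined elsewhere in this repo. Assuming
# draw_table() returns a list of row tuples with the header first, a minimal
# renderer producing the markdown lines appended above might look like this
# (illustrative only; the real helper may format columns differently):
def _markdown_table_sketch(rets):
    if not rets:
        return None
    header = rets[0]
    lines = ["| " + " | ".join(str(c) for c in header) + " |",
             "|" + "---|" * len(header)]  # GitHub-style separator row
    for row in rets[1:]:
        lines.append("| " + " | ".join(str(c) for c in row) + " |")
    return lines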
def scraw(so, proxy=None, delta=3):
    """
    Scrape the last `delta` days of entries from sec-wiki.com.
    :param so: SQLiteOper handle
    :param proxy: optional HTTP proxy
    :param delta: how many days back to keep
    :return:
    """
    # loop variable renamed from `delta` to avoid shadowing the parameter
    ts_list = [
        get_special_date(i, format="%Y-%m-%d") for i in range(0, 0 - delta, -1)
    ]
    # NOTE: the hardcoded query string is left over from the original source;
    # rows are filtered against ts_list below regardless.
    url = "https://www.sec-wiki.com/?2019-03-04"
    r = get_request(url)
    if r:
        try:
            soup = BeautifulSoup(r.content, 'lxml')
        except Exception as e:
            logging.error("GET %s failed : %s" % (url, repr(e)))
            return
        if soup:
            rows = soup.find_all("span", class_='dropcap')
            if rows:
                for row in rows:
                    if row:
                        cur_ts = row.get_text()
                        if cur_ts in ts_list:
                            a = row.next_sibling
                            if a:
                                url = a.get("href")
                                o, ext = parse_url(url)
                                domain = o.netloc
                                cur_ts = re.sub("-", "", cur_ts)
                                title = strip_n(a.get_text())
                                overview = {}
                                overview['ts'] = cur_ts
                                overview['url'] = url
                                overview['title'] = title
                                overview['domain'] = domain
                                overview["domain_name"] = \
                                    str(get_title(overview["domain"], proxy=proxy))
                                if overview:
                                    sql = d2sql(overview,
                                                table="secwiki_today_detail",
                                                action="INSERT OR IGNORE ")
                                    if sql:
                                        try:
                                            so.execute(sql)
                                        except Exception as e:
                                            logging.error(
                                                "[secwiki_today_sql]: "
                                                "sql(%s) error(%s)" % (sql, str(e)))
                                    st = "{ts}\t{url}" \
                                         "\t{title}\t{domain}\t{domain_name}".format(
                                             ts=overview.get("ts"),
                                             domain=overview.get("domain"),
                                             title=overview.get("title"),
                                             domain_name=overview.get("domain_name"),
                                             url=overview.get("url"))
                                    print st

                                    # mirror twitter/weixin/github entries into
                                    # their dedicated tables
                                    url = overview.get("url")
                                    ts = overview.get("ts")
                                    tag = overview.get("tag", "")
                                    title = overview.get("title")
                                    sql = ""
                                    if url.find("://twitter.com") != -1:
                                        d = get_twitter_info(url, title, ts=ts,
                                                             tag=tag, proxy=proxy)
                                        if d:
                                            sql = d2sql(d, table="twitter")
                                    elif url.find("weixin.qq.com") != -1:
                                        d = get_weixin_info(url, ts, tag)
                                        if d:
                                            sql = d2sql(d, table="weixin")
                                    elif url.find("//github.com") != -1:
                                        d = get_github_info(url, title, ts=ts, tag=tag)
                                        if d:
                                            sql = d2sql(d, table='github')
                                    if sql:
                                        try:
                                            so.execute(sql)
                                        except Exception as e:
                                            logging.error("[sql]: %s %s" % (sql, str(e)))
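
# Hedged sketch: d2sql() is defined elsewhere in this repo. From its call
# sites it maps a dict of column -> value into an SQL statement for the given
# table; callers above pass action="INSERT OR IGNORE " (trailing space) or
# omit it. A minimal guess -- the real helper likely escapes values more
# carefully than the quote-doubling shown here:
def _d2sql_sketch(d, table, action="INSERT "):
    cols = list(d.keys())
    vals = ["'%s'" % str(d[c]).replace("'", "''") for c in cols]
    return "%sINTO %s (%s) VALUES (%s)" % (
        action, table, ",".join(cols), ",".join(vals))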
def scraw(so, proxy=None, delta=2):
    """
    Scrape the last `delta` days of pulses from sec.today.
    :param so: SQLiteOper handle
    :param proxy: optional HTTP proxy
    :param delta: how many days back to keep
    :return:
    """
    # NOTE: this definition shadows the sec-wiki scraw above if both live in
    # the same module.
    ts_list = [
        get_special_date(i, format="%Y%m%d") for i in range(0, 0 - delta, -1)
    ]
    url = "https://sec.today/pulses/"
    r = get_request(url)
    if r:
        try:
            soup = BeautifulSoup(r.content, 'lxml')
        except Exception as e:
            logging.error("GET %s failed : %s" % (url, repr(e)))
            return
        if soup:
            rows = soup.find_all("div", class_='card-body')
            if rows:
                for row in rows:
                    if row:
                        overview = {}
                        card_title = row.find("h5", class_="card-title")
                        if card_title:
                            card_title_text = strip_n(card_title.get_text())
                            card_title_url = card_title.find(
                                "a", class_="text-dark").get("href")
                            overview["title_english"] = card_title_text
                            sec_url = "https://sec.today%s" % card_title_url
                            url_details = get_redirect_url(
                                sec_url, root_dir="data/sec_url",
                                issql=False, proxy=proxy)
                            if url_details:
                                overview["url"] = url_details.get("url")
                                overview["domain"] = url_details.get("domain")
                            else:
                                overview["url"] = sec_url
                        card_text_chinese = row.find("p", class_="card-text my-1")
                        if card_text_chinese:
                            card_text_chinese = strip_n(
                                card_text_chinese.find("q").get_text())
                            overview["title"] = card_text_chinese
                        card_text = row.find(
                            "small", class_=re.compile(r"card-subtitle"))
                        if card_text:
                            card_text_domain = strip_n(card_text.get_text())
                            domain = parse_domain_tag(card_text_domain)
                            if domain:
                                overview["domain"] = domain
                                overview["domain_name"] = str(
                                    get_title(overview["domain"], proxy=proxy))
                            card_text_types = card_text.find_all(
                                "span", class_=re.compile(r"badge-tag"))
                            if card_text_types:
                                tags = []
                                for card_text_type in card_text_types:
                                    card_text_type = strip_n(
                                        card_text_type.get_text())
                                    if card_text_type:
                                        tags.append(card_text_type)
                                overview["tag"] = ",".join(tags)
                        card_text_ts = row.find("cite")
                        if card_text_ts:
                            card_text_ts = strip_n(card_text_ts.get_text())
                            domain_ts = parse_sec_today_url(card_text_ts)
                            if domain_ts:
                                domain, ts = domain_ts
                            else:
                                ts = get_special_date()
                            overview["ts"] = ts
                            # skip pulses outside the requested date window
                            if ts not in ts_list:
                                continue
                        if overview:
                            sql = d2sql(overview,
                                        table="xuanwu_today_detail",
                                        action="INSERT OR IGNORE ")
                            if sql:
                                try:
                                    so.execute(sql)
                                except Exception as e:
                                    logging.error(
                                        "[sec_total_sql]: sql(%s) error(%s)"
                                        % (sql, str(e)))
                            st = "{ts}\t{tag}\t{url}" \
                                 "\t{title}\t{title_english}\t{domain}\t{domain_name}".format(
                                     ts=overview.get("ts"),
                                     tag=overview.get("tag"),
                                     domain=overview.get("domain"),
                                     title=overview.get("title"),
                                     title_english=overview.get("title_english"),
                                     domain_name=overview.get("domain_name"),
                                     url=overview.get("url"))
                            print st

                            # mirror twitter/weixin/github entries into their
                            # dedicated tables
                            url = overview.get("url")
                            ts = overview.get("ts")
                            tag = overview.get("tag")
                            title = overview.get("title")
                            sql = ""
                            if url.find("://twitter.com") != -1:
                                d = get_twitter_info(url, title, ts=ts,
                                                     tag=tag, proxy=proxy)
                                if d:
                                    sql = d2sql(d, table="twitter")
                            elif url.find("weixin.qq.com") != -1:
                                d = get_weixin_info(url, ts, tag)
                                if d:
                                    sql = d2sql(d, table="weixin")
                            elif url.find("//github.com") != -1:
                                d = get_github_info(url, title, ts=ts, tag=tag)
                                if d:
                                    sql = d2sql(d, table='github')
                            if sql:
                                try:
                                    so.execute(sql)
                                except Exception as e:
                                    logging.error("[sql]: %s %s" % (sql, str(e)))
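
# Hedged sketch: get_request() is defined elsewhere in this repo. Both scraw()
# variants treat its result as a requests-style response (they read r.content),
# so a minimal wrapper might look like the following. The proxy/timeout
# handling here is an assumption, not the repo's actual implementation:
def _get_request_sketch(url, proxy=None, timeout=30):
    import requests  # assumed dependency; the real helper may differ
    try:
        proxies = {"http": proxy, "https": proxy} if proxy else None
        r = requests.get(url, proxies=proxies, timeout=timeout)
        if r.status_code == 200:
            return r
    except Exception as e:
        logging.error("GET %s failed: %s" % (url, repr(e)))
    return None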
def draw_readme_item(year=None, fpath=None):
    """
    :param year: month ("%Y%m"), year ("%Y") or century prefix ("20") to report on
    :param fpath: output markdown path; defaults to README_<year>.md
    :return: path of the generated report
    """
    tables_rets = []
    so = SQLiteOper("data/scrap.db")
    if year is None:
        year = get_special_date(delta=0, format="%Y%m")
    if fpath is None:
        fpath = 'README_%s.md' % year

    # update the pie charts
    main_pie(year)

    # update the weixin/github/medium/zhihu tables
    sources = [
        "weixin", "github_org", "github_private", "medium_xuanwu",
        "medium_secwiki", "zhihu_xuanwu", "zhihu_secwiki"
    ]
    d = {
        "weixin": "微信公众号",
        "github_org": "组织github账号",
        "github_private": "私人github账号"
    }
    for source in sources:
        rets = draw_table(so, top=100, source=source, year=year)
        if rets:
            markdown_rets = markdown_table(rets)
            if markdown_rets:
                tables_rets.append("# %s 推荐" % d.get(source, source))
                for markdown_ret in markdown_rets:
                    tables_rets.append(markdown_ret)
                tables_rets.append(os.linesep)

    with codecs.open(fpath, mode='wb') as fr:
        fr.write('# [数据--所有](README_20.md)')
        fr.write(os.linesep)
        fr.write('# [数据--年度](README_{year_year}.md)'.format(
            year_year=year[0:4]))
        fr.write(os.linesep)
        fr.write('# %s 信息源与信息类型占比' % year)
        fr.write(os.linesep)
        fr.write(
            '![{year}-信息源占比-secwiki](data/img/domain/{year}-信息源占比-secwiki.png)'
            .format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        fr.write(
            '![{year}-信息源占比-xuanwu](data/img/domain/{year}-信息源占比-xuanwu.png)'
            .format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        # fr.write('![{year}-信息类型占比-secwiki](data/img/tag/{year}-信息类型占比-secwiki.png)'
        #          .format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        fr.write(
            '![{year}-信息类型占比-xuanwu](data/img/tag/{year}-信息类型占比-xuanwu.png)'
            .format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        fr.write(
            '![{year}-最喜欢语言占比](data/img/language/{year}-最喜欢语言占比.png)'
            .format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        st = os.linesep.join(tables_rets)
        fr.write(st)
        fr.write(os.linesep)
        fr.write(os.linesep)
        fr.write('# 日更新程序')
        fr.write(os.linesep)
        fr.write('`python update_daily.py`')
    return fpath
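
# Hedged sketch: the README output above points to `python update_daily.py`
# as the daily entry point. That script is not shown here; assuming it simply
# chains the steps defined in this module, a minimal driver might be:
#
# if __name__ == '__main__':
#     so = SQLiteOper("data/scrap.db")
#     scraw(so)          # pull the latest entries into the database
#     draw_readme()      # regenerate README_*.md and README.md
#     update_github()    # commit and push the refreshed data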