Example #1
def getmissdate():
    """
    获取缺失的日期
    :return:
    """

    days = []
    cur_fname_lists = []

    for fname in glob.glob(r'data/xuanwu/*/*/*/index.html'):
        cur_fname_lists.append(fname)
    start, end = getstartendfrompath(cur_fname_lists)

    cur_day = get_special_date()
    print "cur_day({cur_day}), last_day({last_day})".format(cur_day=cur_day,
                                                            last_day=end)
    if end != 0:
        last_day = datetime.datetime.strptime(str(end), '%Y%m%d')
        cur_day = datetime.datetime.strptime(str(cur_day), '%Y%m%d')
        delta = cur_day - last_day
        for i in range(1, delta.days + 1):
            n_day = last_day + datetime.timedelta(days=i)
            n_day = n_day.strftime('%Y%m%d')
            days.append(n_day)
    return days
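The heart of getmissdate is the date arithmetic between the newest crawled day and today (get_special_date and getstartendfrompath are project helpers not shown here). A minimal standalone sketch of that gap enumeration, assuming the same %Y%m%d convention and using hypothetical sample values:

import datetime

def dates_between(last_day, cur_day, fmt="%Y%m%d"):
    """Every date strictly after last_day up to and including cur_day."""
    last = datetime.datetime.strptime(str(last_day), fmt)
    cur = datetime.datetime.strptime(str(cur_day), fmt)
    return [(last + datetime.timedelta(days=i)).strftime(fmt)
            for i in range(1, (cur - last).days + 1)]

# Hypothetical values, not taken from the repository's data.
print(dates_between(20190301, 20190304))  # ['20190302', '20190303', '20190304']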
Example #2
def update_github():
    """

    :return:
    """
    ts = get_special_date(format="%Y-%m-%d %H:%M:%S")
    cmd = "git add . && git commit -m '%s' && git push origin master" % (ts)

    ret = os.system(cmd)
    if ret != 0:
        print "%s failed" % cmd
Example #3
def draw_readme(year=None):
    """

    :return:
    """
    if year is None:
        year_month = get_special_date(delta=0, format="%Y%m")
        year_year = get_special_date(delta=0, format="%Y")
        year_all = str(year_year)[0:2]
        for y in [year_month, year_year, year_all]:
            fpath = draw_readme_item(year=y)
            if len(str(y)) == 6:
                fpath_month = fpath
                fpath_default = "README.md"
                shutil.copyfile(fpath_month, fpath_default)

            print(fpath)

    else:
        # render the report for the specified month only
        fpath = draw_readme_item(year=year)
        print(fpath)
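Taken together, the year=None branch rebuilds three reports through draw_readme_item: the current month (%Y%m), the current year (%Y) and an all-time report keyed by the first two digits of the year, then promotes the monthly file to README.md. A short usage sketch (the month value is hypothetical):

# Rebuild the monthly, yearly and all-time reports and refresh README.md.
draw_readme()

# Rebuild only one month's report, e.g. March 2019.
draw_readme(year="201903")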
Example #4
def draw_readme(fpath=None):
    """

    :return:
    """

    if fpath is None:
        fpath = "README.md"

    tables_rets = []
    so = SQLiteOper("data/scrap.db")
    year = get_special_date(delta=0, format="%Y%m")
    # update
    main_pie(year)

    # update weixin,github
    sources = ["weixin", "github_org", "github_private"]

    d = {
        "weixin": "微信公众号",
        "github_org": "组织github账号",
        "github_private": "私人github账号"
    }

    for source in sources:
        rets = draw_table(so, top=100, source=source, year=year)
        if rets:

            markdown_rets = markdown_table(rets)
            if markdown_rets:
                tables_rets.append("# %s 推荐" % d.get(source, source))
                for markdown_ret in markdown_rets:
                    tables_rets.append(markdown_ret)
                tables_rets.append(os.linesep)

    with codecs.open(fpath, mode='wb') as fr:
        fr.write("# [数据年报](README_YEAR.md)")
        fr.write(os.linesep)
        fr.write('# %s 信息源与信息类型占比' % year)
        fr.write(os.linesep)
        fr.write(
            '![{year}-信息源占比-secwiki](data/img/domain/{year}-信息源占比-secwiki.png)'
            .format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        fr.write(
            '![{year}-信息源占比-xuanwu](data/img/domain/{year}-信息源占比-xuanwu.png)'.
            format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        fr.write(
            '![{year}-信息类型占比-secwiki](data/img/tag/{year}-信息类型占比-secwiki.png)'.
            format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        fr.write(
            '![{year}-信息类型占比-xuanwu](data/img/tag/{year}-信息类型占比-xuanwu.png)'.
            format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)

        fr.write(
            '![{year}-最喜欢语言占比](data/img/language/{year}-最喜欢语言占比.png)'.format(
                year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)

        st = os.linesep.join(tables_rets)
        fr.write(st)
        fr.write(os.linesep)
        fr.write(os.linesep)

        fr.write('# 日更新程序')
        fr.write(os.linesep)
        fr.write('`python update_daily.py`')
Example #5
def scraw(so, proxy=None, delta=3):
    """
    Scrape the last `delta` days of entries from the sec-wiki front page and
    store them in secwiki_today_detail plus the per-source tables
    (twitter, weixin, github).

    :param so: SQLiteOper handle on data/scrap.db
    :param proxy: optional proxy passed to the fetch helpers
    :param delta: how many days back from today to keep
    :return: None
    """
    ts_list = [
        get_special_date(delta, format="%Y-%m-%d")
        for delta in range(0, 0 - delta, -1)
    ]

    url = "https://www.sec-wiki.com/?2019-03-04"
    r = get_request(url)
    if r:
        try:
            soup = BeautifulSoup(r.content, 'lxml')

        except Exception as e:
            logging.error("GET %s  failed : %s" % (url, repr(e)))
            return
        if soup:
            rows = soup.find_all("span", class_='dropcap')

            if rows:

                for row in rows:

                    if row:

                        cur_ts = row.get_text()
                        if cur_ts in ts_list:
                            a = row.next_sibling
                            if a:
                                url = a.get("href")

                                o, ext = parse_url(url)
                                domain = o.netloc
                                cur_ts = re.sub("-", "", cur_ts)

                                title = strip_n(a.get_text())
                                overview = {}
                                overview['ts'] = cur_ts
                                overview['url'] = url
                                overview['title'] = title
                                overview['domain'] = domain
                                overview["domain_name"] = \
                                    str(get_title(overview["domain"], proxy=proxy))

                                if overview:
                                    sql = d2sql(overview,
                                                table="secwiki_today_detail",
                                                action="INSERT OR IGNORE ")

                                    if sql:
                                        try:
                                            so.execute(sql)
                                        except Exception as e:
                                            logging.error(
                                                "[secwiki_today_sql]: "
                                                "sql(%s) error(%s)" %
                                                (sql, str(e)))

                                    st = "{ts}\t{url}" \
                                         "\t{title}\t{domain}\t{domain_name}".format(
                                        ts=overview.get("ts"),
                                        domain=overview.get("domain"),
                                        title=overview.get("title"),
                                        domain_name=overview.get("domain_name"),
                                        url=overview.get("url")
                                    )
                                    print st

                                    url = overview.get("url")
                                    ts = overview.get("ts")
                                    tag = overview.get("tag", "")
                                    title = overview.get("title")

                                    sql = ""

                                    if url.find("://twitter.com") != -1:

                                        d = get_twitter_info(url,
                                                             title,
                                                             ts=ts,
                                                             tag=tag,
                                                             proxy=proxy)

                                        if d:
                                            sql = d2sql(d, table="twitter")

                                    elif url.find("weixin.qq.com") != -1:
                                        d = get_weixin_info(url, ts, tag)

                                        if d:
                                            sql = d2sql(d, table="weixin")
                                    elif url.find("//github.com") != -1:
                                        d = get_github_info(url,
                                                            title,
                                                            ts=ts,
                                                            tag=tag)

                                        if d:
                                            sql = d2sql(d, table='github')

                                    if sql:
                                        try:
                                            #print sql
                                            so.execute(sql)
                                        except Exception as e:
                                            logging.error("[sql]: %s %s" %
                                                          (sql, str(e)))
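Both scraw variants end with the same dispatch: the entry's URL decides which extractor and SQLite table the row goes to. A small hedged sketch of that routing on its own, mirroring the substring checks above (the sample URLs are hypothetical):

def route_by_domain(url):
    # Returns the table/extractor name used in scraw, or None for other sources.
    if url.find("://twitter.com") != -1:
        return "twitter"
    elif url.find("weixin.qq.com") != -1:
        return "weixin"
    elif url.find("//github.com") != -1:
        return "github"
    return None

assert route_by_domain("https://twitter.com/example/status/1") == "twitter"
assert route_by_domain("https://mp.weixin.qq.com/s/abc") == "weixin"
assert route_by_domain("https://github.com/example/repo") == "github"
assert route_by_domain("https://example.com/post") is None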
Example #6
def scraw(so, proxy=None, delta=2):
    """
    Scrape the last `delta` days of pulses from https://sec.today/pulses/ and
    store them in xuanwu_today_detail plus the per-source tables
    (twitter, weixin, github).

    :param so: SQLiteOper handle on data/scrap.db
    :param proxy: optional proxy passed to the fetch helpers
    :param delta: how many days back from today to keep
    :return: None
    """
    ts_list = [
        get_special_date(delta, format="%Y%m%d")
        for delta in range(0, 0 - delta, -1)
    ]

    url = "https://sec.today/pulses/"
    r = get_request(url)
    if r:
        try:
            soup = BeautifulSoup(r.content, 'lxml')

        except Exception as e:
            logging.error("GET %s  failed : %s" % (url, repr(e)))
            return
        if soup:
            rows = soup.find_all("div", class_='card-body')

            if rows:

                for row in rows:

                    if row:

                        overview = {}

                        card_title = row.find("h5", class_="card-title")

                        if card_title:
                            card_title_text = strip_n(card_title.get_text())
                            card_title_url = card_title.find(
                                "a", class_="text-dark").get("href")
                            overview["title_english"] = card_title_text
                            sec_url = "https://sec.today%s" % card_title_url

                            url_details = get_redirect_url(
                                sec_url,
                                root_dir="data/sec_url",
                                issql=False,
                                proxy=proxy)
                            #url_details = None
                            if url_details:
                                overview["url"] = url_details.get("url")
                                overview["domain"] = url_details.get("domain")
                            else:
                                overview["url"] = sec_url

                        card_text_chinese = row.find("p",
                                                     class_="card-text my-1")
                        if card_text_chinese:
                            card_text_chinese = strip_n(
                                card_text_chinese.find("q").get_text())
                            overview["title"] = card_text_chinese

                        card_text = row.find(
                            "small", class_=re.compile(r"card-subtitle"))
                        if card_text:

                            card_text_domain = strip_n(card_text.get_text())
                            domain = parse_domain_tag(card_text_domain)
                            if domain:
                                overview["domain"] = domain
                                overview["domain_name"] = str(
                                    get_title(overview["domain"], proxy=proxy))

                            card_text_types = card_text.find_all(
                                "span", class_=re.compile(r"badge-tag"))
                            if card_text_types:
                                tags = []
                                for card_text_type in card_text_types:
                                    card_text_type = strip_n(
                                        card_text_type.get_text())
                                    if card_text_type:
                                        tags.append(card_text_type)
                                overview["tag"] = ",".join(tags)

                        card_text_ts = row.find("cite")
                        if card_text_ts:
                            card_text_ts = strip_n(card_text_ts.get_text())
                            domain_ts = parse_sec_today_url(card_text_ts)
                            # print card_text_ts, domain_ts

                            if domain_ts:
                                domain, ts = domain_ts
                            else:
                                ts = get_special_date()

                            overview["ts"] = ts
                            if ts not in ts_list:

                                continue

                        if overview:
                            sql = d2sql(overview,
                                        table="xuanwu_today_detail",
                                        action="INSERT OR IGNORE ")

                            if sql:
                                try:

                                    so.execute(sql)
                                except Exception as e:
                                    logging.error(
                                        "[sec_total_sql]: sql(%s) error(%s)" %
                                        (sql, str(e)))

                            st = "{ts}\t{tag}\t{url}" \
                                 "\t{title}\t{title_english}\t{domain}\t{domain_name}".format(
                                ts=overview.get("ts"),
                                tag=overview.get("tag"),
                                domain=overview.get("domain"),
                                title=overview.get("title"),
                                title_english=overview.get("title_english"),
                                domain_name=overview.get("domain_name"),
                                url=overview.get("url")
                            )
                            print st
                            # print sql

                            url = overview.get("url")
                            ts = overview.get("ts")
                            tag = overview.get("tag")
                            title = overview.get("title")

                            sql = ""

                            if url.find("://twitter.com") != -1:

                                d = get_twitter_info(url,
                                                     title,
                                                     ts=ts,
                                                     tag=tag,
                                                     proxy=proxy)

                                if d:
                                    sql = d2sql(d, table="twitter")

                            elif url.find("weixin.qq.com") != -1:
                                d = get_weixin_info(url, ts, tag)

                                if d:
                                    sql = d2sql(d, table="weixin")
                            elif url.find("//github.com") != -1:
                                d = get_github_info(url, title, ts=ts, tag=tag)

                                if d:
                                    sql = d2sql(d, table='github')

                            if sql:
                                try:
                                    # print sql
                                    so.execute(sql)
                                except Exception as e:
                                    logging.error("[sql]: %s %s" %
                                                  (sql, str(e)))
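Rows are written with d2sql(..., action="INSERT OR IGNORE "), so re-crawling the same day does not create duplicate entries. A minimal sqlite3 sketch of that pattern, assuming a hypothetical table layout (the real schema and the d2sql helper are not shown in these examples):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("""CREATE TABLE secwiki_today_detail (
                    ts TEXT, url TEXT PRIMARY KEY, title TEXT,
                    domain TEXT, domain_name TEXT)""")

row = ("20190304", "https://example.com/post", "sample title",
       "example.com", "Example Site")
sql = ("INSERT OR IGNORE INTO secwiki_today_detail "
       "(ts, url, title, domain, domain_name) VALUES (?, ?, ?, ?, ?)")
conn.execute(sql, row)
conn.execute(sql, row)  # the duplicate is silently skipped
print(conn.execute("SELECT COUNT(*) FROM secwiki_today_detail").fetchone()[0])  # prints 1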
Example #7
def draw_readme_item(year=None, fpath=None):
    """

    :param year:
    :param fpath:
    :return:
    """

    tables_rets = []
    so = SQLiteOper("data/scrap.db")
    if year is None:
        year = get_special_date(delta=0, format="%Y%m")

    if fpath is None:
        fpath = 'README_%s.md' % year
    # update
    main_pie(year)

    # update weixin,github
    sources = [
        "weixin", "github_org", "github_private", "medium_xuanwu",
        "medium_secwiki", "zhihu_xuanwu", "zhihu_secwiki"
    ]

    d = {
        "weixin": "微信公众号",
        "github_org": "组织github账号",
        "github_private": "私人github账号"
    }

    for source in sources:
        rets = draw_table(so, top=100, source=source, year=year)
        if rets:

            markdown_rets = markdown_table(rets)
            if markdown_rets:
                tables_rets.append("# %s 推荐" % d.get(source, source))
                for markdown_ret in markdown_rets:
                    tables_rets.append(markdown_ret)
                tables_rets.append(os.linesep)

    with codecs.open(fpath, mode='wb') as fr:
        fr.write('# [数据--所有](README_20.md)')
        fr.write(os.linesep)
        fr.write(
            '# [数据--年度](README_{year_year}.md)'.format(year_year=year[0:4]))
        fr.write(os.linesep)
        fr.write('# %s 信息源与信息类型占比' % year)
        fr.write(os.linesep)
        fr.write(
            '![{year}-信息源占比-secwiki](data/img/domain/{year}-信息源占比-secwiki.png)'
            .format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        fr.write(
            '![{year}-信息源占比-xuanwu](data/img/domain/{year}-信息源占比-xuanwu.png)'.
            format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        # fr.write('![{year}-信息类型占比-secwiki](data/img/tag/{year}-信息类型占比-secwiki.png)'.
        #        format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)
        fr.write(
            '![{year}-信息类型占比-xuanwu](data/img/tag/{year}-信息类型占比-xuanwu.png)'.
            format(year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)

        fr.write(
            '![{year}-最喜欢语言占比](data/img/language/{year}-最喜欢语言占比.png)'.format(
                year=year))
        fr.write(os.linesep)
        fr.write(os.linesep)

        st = os.linesep.join(tables_rets)
        fr.write(st)
        fr.write(os.linesep)
        fr.write(os.linesep)

        fr.write('# 日更新程序')
        fr.write(os.linesep)
        fr.write('`python update_daily.py`')
    return fpath