Code Example #1
import logging

from bs4 import BeautifulSoup  # pip install beautifulsoup4 lxml


def scraw():
    # Fetch the sec.today pulse list; get_request() is a project helper
    # that returns the HTTP response on success and None on failure.
    url = "https://sec.today/pulses/"
    r = get_request(url)
    if r:
        try:
            soup = BeautifulSoup(r.content, 'lxml')
        except Exception as e:
            logging.error("GET %s failed: %s" % (url, repr(e)))
            return
        if soup:
            pass  # parsing is elided in this version; Code Example #3 fills it in
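These snippets lean on project helpers that are not shown (`get_request`, `strip_n`, `d2sql`, and friends). As a point of reference, here is a minimal sketch of what `get_request` is assumed to do, built on the `requests` library; the real helper's signature, retry, and caching behavior may well differ:

import logging

import requests  # pip install requests


def get_request(url, proxy=None, timeout=15):
    # Hypothetical reconstruction of the unshown helper: fetch `url`,
    # optionally through a proxy, and return the Response on success
    # or None on failure, so callers can use a plain `if r:` guard.
    proxies = {"http": proxy, "https": proxy} if proxy else None
    try:
        r = requests.get(url, timeout=timeout, proxies=proxies)
        r.raise_for_status()
        return r
    except Exception as e:
        logging.error("GET %s failed: %s" % (url, repr(e)))
        return None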
Code Example #2
import logging
import re

from bs4 import BeautifulSoup


def scraw(so, proxy=None, delta=3):
    """Scrape the sec-wiki daily digest into SQLite.

    :param so: cursor-like object; so.execute(sql) runs one statement
    :param proxy: optional proxy passed through to the HTTP helpers
    :param delta: number of days (counting back from today) to keep
    :return: None
    """
    # Dates to keep: today, yesterday, ... `delta` days in total.
    ts_list = [
        get_special_date(i, format="%Y-%m-%d")
        for i in range(0, 0 - delta, -1)
    ]

    # The query string looks like a leftover from testing; entries are
    # filtered against ts_list below rather than by the URL.
    url = "https://www.sec-wiki.com/?2019-03-04"
    r = get_request(url)
    if not r:
        return
    try:
        soup = BeautifulSoup(r.content, 'lxml')
    except Exception as e:
        logging.error("GET %s failed: %s" % (url, repr(e)))
        return
    if not soup:
        return

    # Each entry is a date badge (<span class="dropcap">) followed by
    # the link to the article.
    for row in soup.find_all("span", class_='dropcap'):
        cur_ts = row.get_text()
        if cur_ts not in ts_list:
            continue
        a = row.next_sibling
        if not a:
            continue

        url = a.get("href")
        o, ext = parse_url(url)
        domain = o.netloc
        cur_ts = re.sub("-", "", cur_ts)  # "2019-03-04" -> "20190304"

        title = strip_n(a.get_text())
        overview = {
            'ts': cur_ts,
            'url': url,
            'title': title,
            'domain': domain,
            'domain_name': str(get_title(domain, proxy=proxy)),
        }

        sql = d2sql(overview,
                    table="secwiki_today_detail",
                    action="INSERT OR IGNORE ")
        if sql:
            try:
                so.execute(sql)
            except Exception as e:
                logging.error("[secwiki_today_sql]: sql(%s) error(%s)" %
                              (sql, str(e)))

        st = "{ts}\t{url}\t{title}\t{domain}\t{domain_name}".format(
            ts=overview.get("ts"),
            url=overview.get("url"),
            title=overview.get("title"),
            domain=overview.get("domain"),
            domain_name=overview.get("domain_name"),
        )
        print(st)

        url = overview.get("url")
        ts = overview.get("ts")
        tag = overview.get("tag", "")
        title = overview.get("title")

        # Route well-known sources into their own detail tables.
        sql = ""
        if url.find("://twitter.com") != -1:
            d = get_twitter_info(url, title, ts=ts, tag=tag, proxy=proxy)
            if d:
                sql = d2sql(d, table="twitter")
        elif url.find("weixin.qq.com") != -1:
            d = get_weixin_info(url, ts, tag)
            if d:
                sql = d2sql(d, table="weixin")
        elif url.find("//github.com") != -1:
            d = get_github_info(url, title, ts=ts, tag=tag)
            if d:
                sql = d2sql(d, table='github')

        if sql:
            try:
                so.execute(sql)
            except Exception as e:
                logging.error("[sql]: %s %s" % (sql, str(e)))
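Two more helpers do the heavy lifting above: `get_special_date` builds the date window and `d2sql` turns a dict into an INSERT statement. Minimal sketches follow; both are assumptions, not the project's actual code. Note that callers pass `action` with a trailing space ("INSERT OR IGNORE "), so the sketch concatenates it directly with "INTO":

import datetime


def get_special_date(delta=0, format="%Y-%m-%d"):
    # Hypothetical reconstruction: today's date shifted by `delta` days
    # (negative values reach into the past), rendered with `format`.
    day = datetime.date.today() + datetime.timedelta(days=delta)
    return day.strftime(format)


def d2sql(d, table, action="INSERT "):
    # Hypothetical reconstruction: build a single SQLite statement such as
    #   INSERT OR IGNORE INTO secwiki_today_detail (ts, url, ...) VALUES (...)
    # from a column->value dict. Real code should prefer parameterized
    # queries; string-built SQL is sketched here only to match the callers.
    if not d:
        return ""
    cols = ", ".join(d.keys())
    vals = ", ".join("'%s'" % str(v).replace("'", "''") for v in d.values())
    return "%sINTO %s (%s) VALUES (%s)" % (action, table, cols, vals)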
Code Example #3
import logging
import re

from bs4 import BeautifulSoup


def scraw(so, proxy=None, delta=2):
    """Scrape the sec.today pulse list into SQLite.

    :param so: cursor-like object; so.execute(sql) runs one statement
    :param proxy: optional proxy passed through to the HTTP helpers
    :param delta: number of days (counting back from today) to keep
    :return: None
    """
    # Dates to keep: today, yesterday, ... `delta` days in total.
    ts_list = [
        get_special_date(i, format="%Y%m%d")
        for i in range(0, 0 - delta, -1)
    ]

    url = "https://sec.today/pulses/"
    r = get_request(url)
    if not r:
        return
    try:
        soup = BeautifulSoup(r.content, 'lxml')
    except Exception as e:
        logging.error("GET %s failed: %s" % (url, repr(e)))
        return
    if not soup:
        return

    for row in soup.find_all("div", class_='card-body'):
        overview = {}

        # Title block: the English title plus a sec.today redirect link.
        card_title = row.find("h5", class_="card-title")
        if card_title:
            overview["title_english"] = strip_n(card_title.get_text())
            card_title_url = card_title.find(
                "a", class_="text-dark").get("href")
            sec_url = "https://sec.today%s" % card_title_url

            # Resolve the redirect to the real article URL when possible.
            url_details = get_redirect_url(sec_url,
                                           root_dir="data/sec_url",
                                           issql=False,
                                           proxy=proxy)
            if url_details:
                overview["url"] = url_details.get("url")
                overview["domain"] = url_details.get("domain")
            else:
                overview["url"] = sec_url

        # Chinese summary, if present.
        card_text_chinese = row.find("p", class_="card-text my-1")
        if card_text_chinese:
            overview["title"] = strip_n(
                card_text_chinese.find("q").get_text())

        # Subtitle: source domain plus tag badges.
        card_text = row.find("small", class_=re.compile(r"card-subtitle"))
        if card_text:
            domain = parse_domain_tag(strip_n(card_text.get_text()))
            if domain:
                overview["domain"] = domain
                overview["domain_name"] = str(
                    get_title(domain, proxy=proxy))

            card_text_types = card_text.find_all(
                "span", class_=re.compile(r"badge-tag"))
            if card_text_types:
                tags = []
                for card_text_type in card_text_types:
                    card_text_type = strip_n(card_text_type.get_text())
                    if card_text_type:
                        tags.append(card_text_type)
                overview["tag"] = ",".join(tags)

        # Timestamp; skip entries that fall outside the ts_list window.
        card_text_ts = row.find("cite")
        if card_text_ts:
            domain_ts = parse_sec_today_url(strip_n(card_text_ts.get_text()))
            if domain_ts:
                domain, ts = domain_ts
            else:
                ts = get_special_date()
            overview["ts"] = ts
            if ts not in ts_list:
                continue

        if not overview:
            continue

        sql = d2sql(overview,
                    table="xuanwu_today_detail",
                    action="INSERT OR IGNORE ")
        if sql:
            try:
                so.execute(sql)
            except Exception as e:
                logging.error("[sec_total_sql]: sql(%s) error(%s)" %
                              (sql, str(e)))

        st = ("{ts}\t{tag}\t{url}\t{title}\t{title_english}"
              "\t{domain}\t{domain_name}").format(
            ts=overview.get("ts"),
            tag=overview.get("tag"),
            url=overview.get("url"),
            title=overview.get("title"),
            title_english=overview.get("title_english"),
            domain=overview.get("domain"),
            domain_name=overview.get("domain_name"),
        )
        print(st)

        # Default to "" so cards without a resolvable link don't crash
        # the url.find() checks below.
        url = overview.get("url", "")
        ts = overview.get("ts")
        tag = overview.get("tag", "")  # default to "", as in Code Example #2
        title = overview.get("title")

        # Route well-known sources into their own detail tables.
        sql = ""
        if url.find("://twitter.com") != -1:
            d = get_twitter_info(url, title, ts=ts, tag=tag, proxy=proxy)
            if d:
                sql = d2sql(d, table="twitter")
        elif url.find("weixin.qq.com") != -1:
            d = get_weixin_info(url, ts, tag)
            if d:
                sql = d2sql(d, table="weixin")
        elif url.find("//github.com") != -1:
            d = get_github_info(url, title, ts=ts, tag=tag)
            if d:
                sql = d2sql(d, table='github')

        if sql:
            try:
                so.execute(sql)
            except Exception as e:
                logging.error("[sql]: %s %s" % (sql, str(e)))
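Finally, a sketch of how either `scraw` might be driven. It assumes `so` is a plain `sqlite3` cursor and that the target tables already exist; the original project's setup and scheduling code is not shown:

import sqlite3

conn = sqlite3.connect("sec_news.db")  # hypothetical database path
so = conn.cursor()  # `so.execute(sql)` matches how the snippets use it
scraw(so, proxy=None, delta=2)
conn.commit()
conn.close()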