Example 1
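All four examples share the same standard-library and third-party imports, shown once below. The remaining helpers (get_request, get_special_date, parse_url, strip_n, get_title, d2sql, getdatefrompath, parse_author, parse_body, parse_domain_tag, parse_sec_today_url, get_redirect_url and the get_*_info scrapers) belong to the surrounding project and are assumed to be importable alongside these functions.

import logging
import os
import re

from bs4 import BeautifulSoup
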
def scraw(so, proxy=None, delta=3):
    """

    :param so:
    :param proxy:
    :return:
    """
    # dates for today and the previous delta - 1 days, in "%Y-%m-%d" form;
    # the loop variable is renamed so it no longer shadows the delta argument
    ts_list = [
        get_special_date(d, format="%Y-%m-%d")
        for d in range(0, 0 - delta, -1)
    ]

    url = "https://www.sec-wiki.com/?2019-03-04"
    r = get_request(url)
    if r:
        try:
            soup = BeautifulSoup(r.content, 'lxml')

        except Exception as e:
            logging.error("GET %s  failed : %s" % (url, repr(e)))
            return
        if soup:
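            # each digest entry is a date inside <span class="dropcap">;
            # its next sibling is the <a> element linking to the article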
            rows = soup.find_all("span", class_='dropcap')

            if rows:

                for row in rows:

                    if row:

                        cur_ts = row.get_text()
                        if cur_ts in ts_list:
                            a = row.next_sibling
                            if a:
                                url = a.get("href")

                                o, ext = parse_url(url)
                                domain = o.netloc
                                cur_ts = re.sub("-", "", cur_ts)

                                title = strip_n(a.get_text())
                                overview = {}
                                overview['ts'] = cur_ts
                                overview['url'] = url
                                overview['title'] = title
                                overview['domain'] = domain
                                overview["domain_name"] = \
                                    str(get_title(overview["domain"], proxy=proxy))

                                if overview:
                                    sql = d2sql(overview,
                                                table="secwiki_today_detail",
                                                action="INSERT OR IGNORE ")

                                    if sql:
                                        try:
                                            so.execute(sql)
                                        except Exception as e:
                                            logging.error(
                                                "[secwiki_today_sql]: "
                                                "sql(%s) error(%s)" %
                                                (sql, str(e)))

                                    st = "{ts}\t{url}" \
                                         "\t{title}\t{domain}\t{domain_name}".format(
                                        ts=overview.get("ts"),
                                        domain=overview.get("domain"),
                                        title=overview.get("title"),
                                        domain_name=overview.get("domain_name"),
                                        url=overview.get("url")
                                    )
                                    print(st)

                                    url = overview.get("url")
                                    ts = overview.get("ts")
                                    tag = overview.get("tag", "")
                                    title = overview.get("title")

                                    sql = ""

                                    if url.find("://twitter.com") != -1:

                                        d = get_twitter_info(url,
                                                             title,
                                                             ts=ts,
                                                             tag=tag,
                                                             proxy=proxy)

                                        if d:
                                            sql = d2sql(d, table="twitter")

                                    elif url.find("weixin.qq.com") != -1:
                                        d = get_weixin_info(url, ts, tag)

                                        if d:
                                            sql = d2sql(d, table="weixin")
                                    elif url.find("//github.com") != -1:
                                        d = get_github_info(url,
                                                            title,
                                                            ts=ts,
                                                            tag=tag)

                                        if d:
                                            sql = d2sql(d, table='github')

                                    if sql:
                                        try:
                                            so.execute(sql)
                                        except Exception as e:
                                            logging.error("[sql]: %s %s" %
                                                          (sql, str(e)))
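
A minimal invocation sketch: the `INSERT OR IGNORE` action passed to d2sql suggests an SQLite backend, so `so` is assumed here to be a sqlite3 cursor, and the database file name is hypothetical.

import sqlite3

conn = sqlite3.connect("spider.db")   # hypothetical database file
scraw(conn.cursor(), proxy=None, delta=3)
conn.commit()
conn.close()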
Example 2
def parse_item(fname, so=None, proxy=None):
    """
    解析单个页面
    :param page:
    :return:
    """

    cur_day = getdatefrompath(fname)

    if cur_day is None:
        return

    if os.path.exists(fname):
        html_hd = open(fname, mode='rb')

        soup = BeautifulSoup(html_hd, "lxml")

        # each post sits in its own <div id="singleweibo">
        divs = soup.find_all(id='singleweibo')
        for div in divs:

            if div:

                weibo_id = ""

                weibo_author = div.find(id="singleweiboauthor")
                if weibo_author:
                    if weibo_author.p:
                        try:
                            weibo_id = parse_author(weibo_author.p.text,
                                                    cur_day)
                        except Exception as e:

                            logging.error("[PARSE_AUTHOR_FAILED]: %s %s %s" %
                                          (cur_day, str(e), weibo_author.p))

                weibo_body = div.find(id="singleweibobody")
                if weibo_body:
                    try:
                        r = parse_body(weibo_body.p, cur_day)
                        if r:
                            tag = r[0]
                            urls = r[1]
                            title = r[2]
                            if urls:
                                for url in urls:
                                    o, ext = parse_url(url)
                                    domain = o.netloc
                                    url_path = o.path
                                    root_domain = ext.domain + "." + ext.suffix

                                    result = (cur_day, tag, url, title,
                                              root_domain, domain, url_path,
                                              weibo_id)

                                    title = strip_n(title)
                                    sql = ""

                                    if url.find("://twitter.com") != -1:

                                        d = get_twitter_info(url,
                                                             title,
                                                             ts=cur_day,
                                                             tag=tag,
                                                             proxy=proxy)

                                        if d:
                                            sql = d2sql(d, table="twitter")

                                    elif url.find("weixin.qq.com") != -1:
                                        d = get_weixin_info(url, cur_day, tag)

                                        if d:
                                            sql = d2sql(d, table="weixin")
                                    elif url.find("//github.com") != -1:
                                        d = get_github_info(url,
                                                            title,
                                                            ts=cur_day,
                                                            tag=tag)

                                        if d:
                                            sql = d2sql(d, table='github')
                                    if sql:
                                        try:
                                            so.execute(sql)
                                        except Exception as e:
                                            logging.error("[sql]: %s %s" %
                                                          (sql, str(e)))

                                    yield result
                    except Exception as e:

                        logging.error("[PARSE_BODY_FAILED]: %s %s %s" %
                                      (cur_day, str(e), weibo_body.p))

        html_hd.close()
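
parse_item is a generator, so nothing is parsed (and no SQL is executed) until the caller iterates it. A usage sketch, with a hypothetical file path whose name is assumed to carry the date expected by getdatefrompath:

import sqlite3

conn = sqlite3.connect("spider.db")   # hypothetical database file
for result in parse_item("pages/2014-03-03.html", so=conn.cursor()):
    ts, tag, url, title, root_domain, domain, url_path, weibo_id = result
    print(ts, tag, url)
conn.commit()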
Example 3
def scraw(so, proxy=None, delta=2):
    """

    :param so:
    :param proxy:
    :return:
    """
    # dates for today and the previous delta - 1 days, in "%Y%m%d" form;
    # the loop variable is renamed so it no longer shadows the delta argument
    ts_list = [
        get_special_date(d, format="%Y%m%d")
        for d in range(0, 0 - delta, -1)
    ]

    url = "https://sec.today/pulses/"
    r = get_request(url)
    if r:
        try:
            soup = BeautifulSoup(r.content, 'lxml')

        except Exception as e:
            logging.error("GET %s  failed : %s" % (url, repr(e)))
            return
        if soup:
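            # every pulse is rendered as a card; the title, Chinese
            # translation, tags and timestamp all live in its card-body div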
            rows = soup.find_all("div", class_='card-body')

            if rows:

                for row in rows:

                    if row:

                        overview = {}

                        card_title = row.find("h5", class_="card-title")

                        if card_title:
                            card_title_text = strip_n(card_title.get_text())
                            card_title_url = card_title.find(
                                "a", class_="text-dark").get("href")
                            overview["title_english"] = card_title_text
                            sec_url = "https://sec.today%s" % card_title_url

                            url_details = get_redirect_url(
                                sec_url,
                                root_dir="data/sec_url",
                                issql=False,
                                proxy=proxy)
                            if url_details:
                                overview["url"] = url_details.get("url")
                                overview["domain"] = url_details.get("domain")
                            else:
                                overview["url"] = sec_url

                        card_text_chinese = row.find("p",
                                                     class_="card-text my-1")
                        if card_text_chinese:
                            card_text_chinese = strip_n(
                                card_text_chinese.find("q").get_text())
                            overview["title"] = card_text_chinese

                        card_text = row.find(
                            "small", class_=re.compile(r"card-subtitle"))
                        if card_text:

                            card_text_domain = strip_n(card_text.get_text())
                            domain = parse_domain_tag(card_text_domain)
                            if domain:
                                overview["domain"] = domain
                                overview["domain_name"] = str(
                                    get_title(overview["domain"], proxy=proxy))

                            card_text_types = card_text.find_all(
                                "span", class_=re.compile(r"badge-tag"))
                            if card_text_types:
                                tags = []
                                for card_text_type in card_text_types:
                                    card_text_type = strip_n(
                                        card_text_type.get_text())
                                    if card_text_type:
                                        tags.append(card_text_type)
                                overview["tag"] = ",".join(tags)

                        card_text_ts = row.find("cite")
                        if card_text_ts:
                            card_text_ts = strip_n(card_text_ts.get_text())
                            domain_ts = parse_sec_today_url(card_text_ts)

                            if domain_ts:
                                domain, ts = domain_ts
                            else:
                                ts = get_special_date()

                            overview["ts"] = ts
                            # skip entries outside the lookback window
                            if ts not in ts_list:
                                continue

                        if overview:
                            sql = d2sql(overview,
                                        table="xuanwu_today_detail",
                                        action="INSERT OR IGNORE ")

                            if sql:
                                try:

                                    so.execute(sql)
                                except Exception as e:
                                    logging.error(
                                        "[sec_total_sql]: sql(%s) error(%s)" %
                                        (sql, str(e)))

                            st = "{ts}\t{tag}\t{url}" \
                                 "\t{title}\t{title_english}\t{domain}\t{domain_name}".format(
                                ts=overview.get("ts"),
                                tag=overview.get("tag"),
                                domain=overview.get("domain"),
                                title=overview.get("title"),
                                title_english=overview.get("title_english"),
                                domain_name=overview.get("domain_name"),
                                url=overview.get("url")
                            )
                            print(st)

                            url = overview.get("url")
                            ts = overview.get("ts")
                            tag = overview.get("tag")
                            title = overview.get("title")

                            sql = ""

                            if url.find("://twitter.com") != -1:

                                d = get_twitter_info(url,
                                                     title,
                                                     ts=ts,
                                                     tag=tag,
                                                     proxy=proxy)

                                if d:
                                    sql = d2sql(d, table="twitter")

                            elif url.find("weixin.qq.com") != -1:
                                d = get_weixin_info(url, ts, tag)

                                if d:
                                    sql = d2sql(d, table="weixin")
                            elif url.find("//github.com") != -1:
                                d = get_github_info(url, title, ts=ts, tag=tag)

                                if d:
                                    sql = d2sql(d, table='github')

                            if sql:
                                try:
                                    so.execute(sql)
                                except Exception as e:
                                    logging.error("[sql]: %s %s" %
                                                  (sql, str(e)))
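
Usage mirrors Example 1. A sketch with a proxy supplied; the proxy value's format is an assumption here, since it is simply forwarded to get_request and the get_*_info helpers.

import sqlite3

conn = sqlite3.connect("spider.db")                    # hypothetical file
scraw(conn.cursor(), proxy="127.0.0.1:1080", delta=2)  # proxy format assumed
conn.commit()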
Example 4
def parse_item(html_hd, so=None, proxy=None):
    """
    解析单个页面
    :param page:
    :return:
    """

    soup = BeautifulSoup(html_hd, "lxml")
    # extract the digest date from the blockquote header,
    # e.g. "2014/03/03-2014/03/09" -> ts "20140303"
    day = soup.find("blockquote").text
    m = re.search(r'(\d{4})/(\d{2})/(\d{2})', day)
    if m:
        ts = m.group(1) + m.group(2) + m.group(3)
    else:
        return

    page = soup.find(id="content")

    for div in page.find_all("div", class_='single'):

        # the first stripped string is the bracketed tag, the second the title
        sts = div.stripped_strings
        tag = next(sts)
        if tag.find("[") != -1:
            tag = tag[1:-1]

        title = next(sts)

        # record every link found inside this entry
        for url in div.find_all("a"):
            url = url["href"]
            o, ext = parse_url(url)
            domain = o.netloc
            url_path = o.path
            root_domain = ext.domain + "." + ext.suffix

            title = strip_n(title)

            domain_name = ""
            try:
                domain_name = get_title(domain)
            except Exception as e:
                logging.error("[get_domain_name]: %s %s" %
                              (domain_name, str(e)))
            if domain_name:

                # strip quote characters so they cannot break the
                # string-formatted UPDATE statement below
                domain_name = re.sub('\x22', '', domain_name)
                domain_name = re.sub('\x27', '', domain_name)
                update_sql = ("update {table} set domain_name='{title}' "
                              "where domain='{domain}';").format(
                                  table="secwiki_detail",
                                  title=domain_name,
                                  domain=domain)
                try:
                    so.execute(update_sql)
                    print(update_sql)
                except Exception as e:
                    logging.error("[update_sql]: %s str(%s)" %
                                  (update_sql, str(e)))

            sql = ""

            if url.find("://twitter.com") != -1:

                d = get_twitter_info(url, title, ts=ts, tag=tag, proxy=proxy)

                if d:
                    sql = d2sql(d, table="twitter")

            elif url.find("weixin.qq.com") != -1:
                d = get_weixin_info(url, ts, tag)

                if d:
                    sql = d2sql(d, table="weixin")
            elif url.find("//github.com") != -1:
                d = get_github_info(url, title, ts=ts, tag=tag)

                if d:
                    sql = d2sql(d, table='github')

            if sql:
                try:
                    print(sql)
                    so.execute(sql)
                except Exception as e:
                    logging.error("[sql]: %s %s" % (sql, str(e)))

            result = (ts, tag, url, title, root_domain, domain, url_path)

            yield result
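
Unlike Example 2, this parse_item takes an already-open file handle, so the caller owns the file's lifetime. A sketch with a hypothetical archive path:

import sqlite3

conn = sqlite3.connect("spider.db")                   # hypothetical file
with open("pages/2014-03-03.html", "rb") as html_hd:  # hypothetical path
    for ts, tag, url, title, root_domain, domain, url_path in \
            parse_item(html_hd, so=conn.cursor()):
        print(ts, tag, url)
conn.commit()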