import os
import re
import logging

from bs4 import BeautifulSoup

# Project helpers (getdatefrompath, parse_author, parse_body, parse_url,
# strip_n, d2sql, get_request, get_special_date, get_title, get_twitter_info,
# get_weixin_info, get_github_info) are assumed to come from the project's
# own utility modules.


def parse_item(fname):
    """
    Parse a single saved weibo page.
    :param fname: path of the saved HTML file
    :return: yields (day, tag, url, title, root_domain, domain, url_path, weibo_id)
    """
    cur_day = getdatefrompath(fname)
    if cur_day is None:
        return
    if not os.path.exists(fname):
        return
    with open(fname, mode='rb') as html_hd:
        soup = BeautifulSoup(html_hd, "lxml")
        for div in soup.find_all(id='singleweibo'):
            weibo_id = ""
            weibo_author = div.find(id="singleweiboauthor")
            if weibo_author and weibo_author.p:
                try:
                    weibo_id = parse_author(weibo_author.p.text, cur_day)
                except Exception as e:
                    logging.error("[PARSE_AUTHOR_FAILED]: %s %s %s"
                                  % (cur_day, str(e), weibo_author.p))
            weibo_body = div.find(id="singleweibobody")
            if not weibo_body:
                continue
            try:
                r = parse_body(weibo_body.p, cur_day)
                if r:
                    tag, urls, title = r[0], r[1], r[2]
                    for url in urls or []:
                        o, ext = parse_url(url)
                        domain = o.netloc
                        url_path = o.path
                        root_domain = ext.domain + "." + ext.suffix
                        yield (cur_day, tag, url, title,
                               root_domain, domain, url_path, weibo_id)
            except Exception as e:
                logging.error("[PARSE_BODY_FAILED]: %s %s %s"
                              % (cur_day, str(e), weibo_body.p))
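# A minimal driver sketch for the file-based parse_item() above: walk a
# directory of saved pages and dump every extracted tuple. The directory
# name "html" and this helper's name are illustrative assumptions, not
# part of the original code.
def demo_parse_saved_pages(page_dir="html"):
    for name in sorted(os.listdir(page_dir)):
        for row in parse_item(os.path.join(page_dir, name)):
            print("\t".join(str(col) for col in row))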
def parse_item(html_hd):
    """
    Parse a single weekly page.
    :param html_hd: open file handle (or raw HTML) of the page
    :return: yields (ts, tag, url, title, root_domain, domain, url_path)
    """
    soup = BeautifulSoup(html_hd, "lxml")
    # The issue date range, e.g. "2014/03/03-2014/03/09", sits in a
    # <blockquote>; keep the first date as the timestamp.
    day = soup.find("blockquote").text
    m = re.search(r'(\d{4})/(\d{2})/(\d{2})', day)
    if not m:
        return
    ts = m.group(1) + m.group(2) + m.group(3)
    page = soup.find(id="content")
    for div in page.find_all("div", class_='single'):
        sts = div.stripped_strings
        tag = next(sts)  # next(), not the Python 2-only .next()
        if "[" in tag:
            tag = tag[1:-1]  # strip the surrounding brackets from the tag
        title = next(sts)
        for a in div.find_all("a"):
            url = a["href"]
            o, ext = parse_url(url)
            domain = o.netloc
            url_path = o.path
            root_domain = ext.domain + "." + ext.suffix
            yield (ts, tag, url, title, root_domain, domain, url_path)
def scraw(so, proxy=None, delta=3):
    """
    Scrape the sec-wiki daily list and store fresh entries.
    :param so: sqlite cursor/connection used to execute the generated SQL
    :param proxy: optional proxy handed down to the fetch helpers
    :param delta: how many days back (including today) to keep
    """
    # Dates of the last `delta` days, newest first, e.g. ["2019-03-04", ...].
    ts_list = [get_special_date(d, format="%Y-%m-%d")
               for d in range(0, -delta, -1)]
    url = "https://www.sec-wiki.com/?2019-03-04"
    r = get_request(url)
    if not r:
        return
    try:
        soup = BeautifulSoup(r.content, 'lxml')
    except Exception as e:
        logging.error("GET %s failed : %s" % (url, repr(e)))
        return
    # The page lists entries for several days; keep only rows whose date
    # falls inside ts_list.
    for row in soup.find_all("span", class_='dropcap'):
        cur_ts = row.get_text()
        if cur_ts not in ts_list:
            continue
        a = row.next_sibling
        if not a:
            continue
        url = a.get("href")
        o, ext = parse_url(url)
        domain = o.netloc
        cur_ts = re.sub("-", "", cur_ts)
        title = strip_n(a.get_text())
        overview = {
            'ts': cur_ts,
            'url': url,
            'title': title,
            'domain': domain,
            'domain_name': str(get_title(domain, proxy=proxy)),
        }
        sql = d2sql(overview, table="secwiki_today_detail",
                    action="INSERT OR IGNORE ")
        if sql:
            try:
                so.execute(sql)
            except Exception as e:
                logging.error("[secwiki_today_sql]: sql(%s) error(%s)"
                              % (sql, str(e)))
        print("{ts}\t{url}\t{title}\t{domain}\t{domain_name}".format(**overview))
        # Enrich well-known sources with their own detail tables.
        tag = overview.get("tag", "")
        sql = ""
        if "://twitter.com" in url:
            d = get_twitter_info(url, title, ts=cur_ts, tag=tag, proxy=proxy)
            if d:
                sql = d2sql(d, table="twitter")
        elif "weixin.qq.com" in url:
            d = get_weixin_info(url, cur_ts, tag)
            if d:
                sql = d2sql(d, table="weixin")
        elif "//github.com" in url:
            d = get_github_info(url, title, ts=cur_ts, tag=tag)
            if d:
                sql = d2sql(d, table='github')
        if sql:
            try:
                so.execute(sql)
            except Exception as e:
                logging.error("[sql]: %s %s" % (sql, str(e)))
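# A minimal sketch of wiring scraw() above to sqlite3. The database file
# name "secwiki.db" and this helper's name are assumptions; the target
# tables (secwiki_today_detail, twitter, weixin, github) are expected to
# exist already.
def demo_scraw(db_path="secwiki.db"):
    import sqlite3
    conn = sqlite3.connect(db_path)
    try:
        scraw(conn.cursor(), proxy=None, delta=3)  # last three days
        conn.commit()
    finally:
        conn.close()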
def parse_item(fname, so=None, proxy=None):
    """
    Parse a single saved weibo page and store per-source details.
    :param fname: path of the saved HTML file
    :param so: optional sqlite cursor/connection for the detail tables
    :param proxy: optional proxy handed down to the fetch helpers
    :return: yields (day, tag, url, title, root_domain, domain, url_path, weibo_id)
    """
    cur_day = getdatefrompath(fname)
    if cur_day is None:
        return
    if not os.path.exists(fname):
        return
    with open(fname, mode='rb') as html_hd:
        soup = BeautifulSoup(html_hd, "lxml")
        for div in soup.find_all(id='singleweibo'):
            weibo_id = ""
            weibo_author = div.find(id="singleweiboauthor")
            if weibo_author and weibo_author.p:
                try:
                    weibo_id = parse_author(weibo_author.p.text, cur_day)
                except Exception as e:
                    logging.error("[PARSE_AUTHOR_FAILED]: %s %s %s"
                                  % (cur_day, str(e), weibo_author.p))
            weibo_body = div.find(id="singleweibobody")
            if not weibo_body:
                continue
            try:
                r = parse_body(weibo_body.p, cur_day)
                if not r:
                    continue
                tag, urls, title = r[0], r[1], r[2]
                for url in urls or []:
                    o, ext = parse_url(url)
                    domain = o.netloc
                    url_path = o.path
                    root_domain = ext.domain + "." + ext.suffix
                    result = (cur_day, tag, url, title,
                              root_domain, domain, url_path, weibo_id)
                    title = strip_n(title)
                    # Enrich well-known sources with their own detail tables.
                    sql = ""
                    if "://twitter.com" in url:
                        d = get_twitter_info(url, title, ts=cur_day,
                                             tag=tag, proxy=proxy)
                        if d:
                            sql = d2sql(d, table="twitter")
                    elif "weixin.qq.com" in url:
                        d = get_weixin_info(url, cur_day, tag)
                        if d:
                            sql = d2sql(d, table="weixin")
                    elif "//github.com" in url:
                        d = get_github_info(url, title, ts=cur_day, tag=tag)
                        if d:
                            sql = d2sql(d, table='github')
                    if sql and so is not None:
                        try:
                            so.execute(sql)
                        except Exception as e:
                            logging.error("[sql]: %s %s" % (sql, str(e)))
                    yield result
            except Exception as e:
                logging.error("[PARSE_BODY_FAILED]: %s %s %s"
                              % (cur_day, str(e), weibo_body.p))
def parse_item(html_hd, so=None, proxy=None):
    """
    Parse a single weekly page, refresh cached domain names, and store
    per-source details.
    :param html_hd: open file handle (or raw HTML) of the page
    :param so: optional sqlite cursor/connection
    :param proxy: optional proxy handed down to the fetch helpers
    :return: yields (ts, tag, url, title, root_domain, domain, url_path)
    """
    soup = BeautifulSoup(html_hd, "lxml")
    # The issue date range, e.g. "2014/03/03-2014/03/09", sits in a
    # <blockquote>; keep the first date as the timestamp.
    day = soup.find("blockquote").text
    m = re.search(r'(\d{4})/(\d{2})/(\d{2})', day)
    if not m:
        return
    ts = m.group(1) + m.group(2) + m.group(3)
    page = soup.find(id="content")
    for div in page.find_all("div", class_='single'):
        sts = div.stripped_strings
        tag = next(sts)
        if "[" in tag:
            tag = tag[1:-1]  # strip the surrounding brackets from the tag
        title = next(sts)
        for a in div.find_all("a"):
            url = a["href"]
            o, ext = parse_url(url)
            domain = o.netloc
            url_path = o.path
            root_domain = ext.domain + "." + ext.suffix
            title = strip_n(title)
            # Look up a human-readable name for the domain and cache it
            # back into secwiki_detail.
            domain_name = ""
            try:
                domain_name = get_title(domain)
            except Exception as e:
                logging.error("[get_domain_name]: %s %s" % (domain_name, str(e)))
            if domain_name and so is not None:
                # Drop double/single quotes so the name can be embedded
                # into the SQL string literal.
                domain_name = re.sub('\x22', '', domain_name)
                domain_name = re.sub('\x27', '', domain_name)
                update_sql = ("update {table} set domain_name='{title}' "
                              "where domain='{domain}';").format(
                                  table="secwiki_detail",
                                  title=domain_name,
                                  domain=domain)
                try:
                    so.execute(update_sql)
                    print(update_sql)
                except Exception as e:
                    logging.error("[update_sql]: %s %s" % (update_sql, str(e)))
            # Enrich well-known sources with their own detail tables.
            sql = ""
            if "://twitter.com" in url:
                d = get_twitter_info(url, title, ts=ts, tag=tag, proxy=proxy)
                if d:
                    sql = d2sql(d, table="twitter")
            elif "weixin.qq.com" in url:
                d = get_weixin_info(url, ts, tag)
                if d:
                    sql = d2sql(d, table="weixin")
            elif "//github.com" in url:
                d = get_github_info(url, title, ts=ts, tag=tag)
                if d:
                    sql = d2sql(d, table='github')
            if sql and so is not None:
                try:
                    print(sql)
                    so.execute(sql)
                except Exception as e:
                    logging.error("[sql]: %s %s" % (sql, str(e)))
            yield (ts, tag, url, title, root_domain, domain, url_path)
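# A minimal sketch of running the db-backed parse_item() above over one
# saved weekly page. "weekly.html", "secwiki.db" and this helper's name are
# illustrative assumptions; secwiki_detail and the per-source tables are
# expected to exist already. The generator must be exhausted for the SQL
# side effects to run, which the for-loop below does.
def demo_parse_weekly(page="weekly.html", db_path="secwiki.db"):
    import sqlite3
    conn = sqlite3.connect(db_path)
    try:
        with open(page, mode="rb") as fh:
            for row in parse_item(fh, so=conn.cursor()):
                print("\t".join(row))
        conn.commit()
    finally:
        conn.close()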