Example #1
from bs4 import BeautifulSoup

# check_content, complement_url, save_info_feed, save_html_content, crawl,
# diff_file, get_website and the log helpers are project-level functions
# imported from the surrounding module.


def parseAndSave(content, currentWebsite):
    # save_html_content(currentWebsite.id, websiteContents)

    soup = BeautifulSoup(content, 'lxml')

    items = soup.find_all('a')

    print("A Items", len(items))

    COUNT = 0

    if items:
        for a in items:
            if a.string:
                url, text = a.get('href'), a.string.encode('utf-8').strip()

                check_pass = check_content(url, text)

                if check_pass:
                    url = complement_url(url, currentWebsite.url)
                    if url:
                        result = save_info_feed(url, text, currentWebsite.id, currentWebsite.company.id)
                        if result:
                            COUNT += 1

    if COUNT == 0:
        log(NOTICE, "#{id} {name} {site} 没抓到更新 {count} 条".format(
            id=currentWebsite.company.id,
            name=currentWebsite.company.name_cn.encode('utf-8').strip(),
            site=currentWebsite.url,
            count=COUNT))
    else:
        log(RECORD, "#{id} {name} {site} 抓到更新 {count} 条".format(
            id=currentWebsite.company.id,
            name=currentWebsite.company.name_cn.encode('utf-8').strip(),
            site=currentWebsite.url,
            count=COUNT))


def extract(w_id):
    try:
        w = get_website(w_id)
        # log(NOTICE, "开始 #{id} {name} {site} ".format(id=w.id, name=w.company.name_cn, site=w.url))

        new_html_content = crawl(w.url)
        if not new_html_content:
            log(NOTICE, "#{id} {name} {site} 抓到更新 0 条".format(id=w.company.id, name=w.company.name_cn, site=w.url))
            return

        if w.html_content:
            old_html_content = w.html_content.content
        else:
            save_html_content(w.id, new_html_content)
            log(NOTICE, "#{id} {name} {site} 抓到更新 0 条".format(id=w.company.id, name=w.company.name_cn, site=w.url))
            return

        diff_text = diff_file(old_html_content, new_html_content)
        if not diff_text:
            log(NOTICE, "#{id} {name} {site} 抓到更新 0 条".format(id=w.company.id, name=w.company.name_cn, site=w.url))
            return

        save_html_content(w.id, new_html_content)

        soup = BeautifulSoup(diff_text, 'lxml')
        items = soup.find_all('a')
        COUNT = 0
        if items:
            for a in items:
                if a.string:
                    url, text = a.get('href'), a.string
                    check_pass = check_content(url, text)
                    if check_pass:
                        url = complement_url(url, w.url)
                        if url:
                            result = save_info_feed(url, text, w.id, w.company.id)
                            if result:
                                COUNT += 1
                            # log(RECORD, "[name] [+] [{url}  {text}]".format(name=w.company.name_cn, url=url, text=text.strip()))
        if COUNT == 0:
            log(NOTICE, "#{id} {name} {site} 抓到更新 {count} 条".format(id=w.company.id, name=w.company.name_cn, site=w.url, count=COUNT))
        else:
            log(RECORD, "#{id} {name} {site} 抓到更新 {count} 条".format(id=w.company.id, name=w.company.name_cn, site=w.url, count=COUNT))

    except Exception as e:
        try:
            w = get_website(w_id)
            log(ERROR, "#{id} {name} {site} {err}".format(id=w.id, name=w.company.name_cn, site=w.url, err=str(e)))
        except Exception as e:
            log(ERROR, str(e))
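Both listings rely on project helpers that are not shown here: check_content decides whether a link is worth saving, complement_url turns a relative href into an absolute URL, and save_info_feed / save_html_content persist the results. Their real implementations are not part of this example; the following is only a minimal, hypothetical sketch of the two filtering helpers, assuming complement_url resolves hrefs against the site's base URL with urllib.parse.urljoin and check_content rejects empty or non-navigational anchors.

# Hypothetical sketches -- not the project's actual implementations.
from urllib.parse import urljoin, urlparse


def complement_url(href, base_url):
    """Resolve a possibly relative href against the website's base URL."""
    if not href:
        return None
    absolute = urljoin(base_url, href)
    # Keep only http(s) links; drop javascript:, mailto:, bare fragments, etc.
    if urlparse(absolute).scheme not in ("http", "https"):
        return None
    return absolute


def check_content(href, text):
    """Reject anchors that are obviously not content links."""
    if not href or not text:
        return False
    if href.startswith("#") or href.lower().startswith(("javascript:", "mailto:")):
        return False
    return len(text.strip()) > 1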
Example #4
from bs4 import BeautifulSoup


def extract(w_id):
    """Crawl the website identified by ``w_id``, diff the new HTML against
    the stored copy, and save every newly added link as an info feed entry.

    :param w_id: id of the website to process
    :return: None
    """
    try:
        # Every case where no new update can be fetched is handled below and recorded in the log.

        w = get_website(w_id)
        # log(NOTICE, "开始 #{id} {name} {site} ".format(id=w.id, name=w.company.name_cn, site=w.url))
        # TODO: try calling Scrapy here instead.
        new_html_content = crawl(w.url)
        if not new_html_content:
            log(
                NOTICE,
                "#{id} {name} {site} 抓到更新 0 条".format(id=w.company.id,
                                                      name=w.company.name_cn,
                                                      site=w.url))
            return

        # If website 'w' already has stored html_content, compare it with the
        # new content; otherwise save the new content as the baseline and stop.
        if w.html_content:
            old_html_content = w.html_content.content
        else:
            save_html_content(w.id, new_html_content)
            log(
                NOTICE,
                "#{id} {name} {site} 抓到更新 0 条".format(id=w.company.id,
                                                      name=w.company.name_cn,
                                                      site=w.url))
            return
        diff_text = diff_file(old_html_content, new_html_content)
        if not diff_text:
            log(
                NOTICE,
                "#{id} {name} {site} 抓到更新 0 条".format(id=w.company.id,
                                                      name=w.company.name_cn,
                                                      site=w.url))
            return
        save_html_content(w.id, new_html_content)

        # lxml is an HTML parser; html5lib and others can be used instead.
        soup = BeautifulSoup(diff_text, 'lxml')
        items = soup.find_all('a')
        COUNT = 0

        # Basic logic: collect every <a href> tag, check whether its content
        # passes the filters, and if so complete its URL and store it in the
        # info_feed table.
        if items:
            for a in items:
                if a.string:
                    url, text = a.get('href'), a.string
                    check_pass = check_content(url, text)
                    if check_pass:
                        url = complement_url(url, w.url)
                        if url:
                            result = save_info_feed(url, text, w.id,
                                                    w.company.id)
                            if result:
                                COUNT += 1
                            # log(RECORD, "[name] [+] [{url}  {text}]".format(name=w.company.name_cn, url=url, text=text.strip()))
        if COUNT == 0:
            log(
                NOTICE, "#{id} {name} {site} 抓到更新 {count} 条".format(
                    id=w.company.id,
                    name=w.company.name_cn,
                    site=w.url,
                    count=COUNT))
        else:
            log(
                RECORD, "#{id} {name} {site} 抓到更新 {count} 条".format(
                    id=w.company.id,
                    name=w.company.name_cn,
                    site=w.url,
                    count=COUNT))

    except Exception as e:
        try:
            w = get_website(w_id)
            log(
                ERROR,
                "#{id} {name} {site} {err}".format(id=w.id,
                                                   name=w.company.name_cn,
                                                   site=w.url,
                                                   err=str(e)))
        except Exception as e:
            log(ERROR, str(e))
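The update detection in extract hinges on diff_file, which compares the previously stored HTML with the freshly crawled page; the code only assumes it returns something falsy when nothing changed and otherwise a fragment containing the newly added markup. Its actual implementation is not shown; a minimal sketch under that assumption, built on difflib from the standard library, could look like this:

# Hypothetical sketch -- assumes diff_file should return only the lines that
# were added in the new HTML, or an empty string when nothing changed.
import difflib


def diff_file(old_html, new_html):
    diff = difflib.unified_diff(
        old_html.splitlines(), new_html.splitlines(), lineterm="")
    # Keep only added lines, dropping the '+++' file header.
    added = [line[1:] for line in diff
             if line.startswith("+") and not line.startswith("+++")]
    return "\n".join(added)

Parsing only this added fragment with BeautifulSoup, as extract does, keeps previously seen links from being saved again.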