コード例 #1
0
def crawl_blog_post(blog_id, log_no, tags, written_time=None, verbose=True):
    """Fetch one blog post through its mobile URL and build a post structure.

    Args:
        blog_id: Blog identifier substituted into the ``mobileurl`` template.
        log_no: Post number substituted into the ``mobileurl`` template.
        tags: Container indexed by ``(blog_id, log_no)``; the entry for this
            post is handed to ``btc.make_structure``.
        written_time: Passed through unchanged to ``btc.make_structure``.
        verbose: When false, the ``directorySeq`` and ``sympathyCount`` keys
            are removed from the result.

    Returns:
        The value built by ``btc.make_structure``, or ``None`` when the page
        could not be fetched or contains no ``_postView`` container.
    """

    def get_title(root):
        # Try the known title markups in order; the first non-empty text wins.
        # An explicit emptiness check replaces a blanket ``except Exception``.
        for class_name in ('tit_h3', 'se_textarea'):
            texts = root.xpath('//h3[@class="%s"]/text()' % class_name)
            if texts:
                title = texts[0].strip()
                if title != '':
                    return title
        return ''

    def get_page_html(url):
        # Returns (parsed post body, title), or (None, None) on failure.
        try:
            page = requests.get(url, headers=headers)
            root = html.fromstring(page.content)
            containers = root.xpath('//div[@class="_postView"]')
            if not containers:
                # No post container: report "no document" instead of letting
                # an IndexError escape from the [0] lookup.
                return (None, None)
            html_ = etree.tostring(containers[0])
            return (BeautifulSoup(html_, 'lxml'), get_title(root))
        except IOError:
            # The original emits a blank line here; kept as-is.
            print('')
            return (None, None)

    url = mobileurl % (blog_id, log_no)

    (doc, title) = get_page_html(url)

    if not doc:
        # Report the URL that was actually requested, not the raw template.
        print('No doc in %s' % url)
        return None

    crawled_time = utils.get_today_str()
    crawler_version = utils.get_version()
    post_tags = tags[(blog_id, log_no)]
    directory_seq = None  # NOTE: No directory sequence given for query crawler

    post = btc.make_structure(blog_id, log_no, None, doc, crawled_time,
                              crawler_version, title, written_time, url,
                              post_tags, directory_seq)
    if not verbose:
        del post['directorySeq']
        del post['sympathyCount']
    return post
コード例 #2
0
def crawl_blog_post(blog_id, log_no, tags, written_time=None, verbose=True):
    """Download a single blog post (mobile view) and wrap it in a structure.

    Args:
        blog_id: Blog identifier; first value formatted into ``mobileurl``.
        log_no: Post number; second value formatted into ``mobileurl``.
        tags: Container looked up with the key ``(blog_id, log_no)``; the
            result is forwarded to ``btc.make_structure``.
        written_time: Forwarded as-is to ``btc.make_structure``.
        verbose: If false, ``directorySeq`` and ``sympathyCount`` are deleted
            from the returned structure.

    Returns:
        Whatever ``btc.make_structure`` returns, or ``None`` if the request
        failed or the page has no ``_postView`` element.
    """

    def get_title(root):
        # Two title layouts are checked in turn; a blank or missing first
        # candidate falls through to the second. No broad except is needed
        # because an empty XPath result is tested for directly.
        for css_class in ('tit_h3', 'se_textarea'):
            found = root.xpath('//h3[@class="%s"]/text()' % css_class)
            if not found:
                continue
            candidate = found[0].strip()
            if candidate != '':
                return candidate
        return ''

    def get_page_html(url):
        # Yields (BeautifulSoup of the post element, title) or (None, None).
        try:
            page = requests.get(url, headers=headers)
            root = html.fromstring(page.content)
            matches = root.xpath('//div[@class="_postView"]')
            if not matches:
                # Page lacks the post element; treat it as "no document"
                # rather than raising IndexError on matches[0].
                return (None, None)
            html_ = etree.tostring(matches[0])
            return (BeautifulSoup(html_, 'lxml'), get_title(root))
        except IOError:
            # Blank line output preserved from the original implementation.
            print('')
            return (None, None)

    url = mobileurl % (blog_id, log_no)

    (doc, title) = get_page_html(url)

    if not doc:
        # Name the requested URL in the message; the bare ``posturl``
        # template carries no information about which post failed.
        print('No doc in %s' % url)
        return None

    crawled_time = utils.get_today_str()
    crawler_version = utils.get_version()
    post_tags = tags[(blog_id, log_no)]
    directory_seq = None  # NOTE: No directory sequence given for query crawler

    post = btc.make_structure(blog_id, log_no, None, doc, crawled_time,
                              crawler_version, title, written_time, url,
                              post_tags, directory_seq)
    if not verbose:
        del post['directorySeq']
        del post['sympathyCount']
    return post
コード例 #3
0
def crawl_blog_post(blog_id, log_no, tags, written_time=None, verbose=True):
    """Fetch one blog post and build a post structure from it.

    Args:
        blog_id: Blog identifier, or a full URL when it starts with ``http``
            (in which case it is fetched directly).
        log_no: Post number; combined with ``blog_id`` in the URL templates.
        tags: Container indexed by ``(blog_id, log_no)``; the entry for this
            post is handed to ``btc.make_structure``.
        written_time: Passed through unchanged to ``btc.make_structure``.
        verbose: When false, the ``directorySeq`` and ``sympathyCount`` keys
            are removed from the result.

    Returns:
        The value built by ``btc.make_structure``, or ``None`` when the page
        could not be loaded or contains no ``_postView`` container.
    """

    def get_title(root):
        # A page without the title element yields an empty title instead of
        # an IndexError that would abort the whole crawl of this post.
        texts = root.xpath('//h3[@class="tit_h3"]/text()')
        return texts[0].strip() if texts else ''

    def get_page_html(url):
        # Returns (parsed post body, title), or (None, None) on failure.
        try:
            root = html.parse(url)
            containers = root.xpath('//div[@class="_postView"]')
            if not containers:
                # No post container: report "no document" instead of letting
                # an IndexError escape from the [0] lookup.
                return (None, None)
            html_ = etree.tostring(containers[0])
            return (BeautifulSoup(html_), get_title(root))
        except IOError:
            # The original emits a blank line here; kept as-is.
            print('')
            return (None, None)

    if blog_id.startswith('http'):
        fetch_url = blog_id
    else:
        fetch_url = mobileurl % (blog_id, log_no)

    (doc, title) = get_page_html(fetch_url)

    if not doc:
        # Report the URL that was actually requested, not the raw template.
        print('No doc in %s' % fetch_url)
        return None

    crawled_time = utils.get_today_str()
    crawler_version = utils.get_version()
    # The stored URL is built from the ``posturl`` template, not the
    # address that was fetched.
    url = posturl % (blog_id, log_no)
    post_tags = tags[(blog_id, log_no)]
    directory_seq = None  # NOTE: No directory sequence given for query crawler

    post = btc.make_structure(blog_id, log_no, None, doc, crawled_time,
                              crawler_version, title, written_time, url,
                              post_tags, directory_seq)
    if not verbose:
        del post['directorySeq']
        del post['sympathyCount']
    return post
コード例 #4
0
def crawl_blog_post(blog_id, log_no, tags, written_time=None, verbose=True):
    """Load a single blog post page and wrap its content in a structure.

    Args:
        blog_id: Blog identifier; if it begins with ``http`` it is used
            directly as the address to load.
        log_no: Post number; formatted together with ``blog_id`` into the
            URL templates.
        tags: Container looked up with the key ``(blog_id, log_no)``; the
            result is forwarded to ``btc.make_structure``.
        written_time: Forwarded as-is to ``btc.make_structure``.
        verbose: If false, ``directorySeq`` and ``sympathyCount`` are deleted
            from the returned structure.

    Returns:
        Whatever ``btc.make_structure`` returns, or ``None`` if loading
        failed or the page has no ``_postView`` element.
    """

    def get_title(root):
        # Fall back to an empty title when the heading is absent so a
        # layout change does not surface as an uncaught IndexError.
        headings = root.xpath('//h3[@class="tit_h3"]/text()')
        if not headings:
            return ''
        return headings[0].strip()

    def get_page_html(url):
        # Yields (BeautifulSoup of the post element, title) or (None, None).
        try:
            root = html.parse(url)
            matches = root.xpath('//div[@class="_postView"]')
            if not matches:
                # Page lacks the post element; treat it as "no document"
                # rather than raising IndexError on matches[0].
                return (None, None)
            html_ = etree.tostring(matches[0])
            return (BeautifulSoup(html_), get_title(root))
        except IOError:
            # Blank line output preserved from the original implementation.
            print('')
            return (None, None)

    if blog_id.startswith('http'):
        source_url = blog_id
    else:
        source_url = mobileurl % (blog_id, log_no)

    (doc, title) = get_page_html(source_url)

    if not doc:
        # Name the requested address in the message; the bare ``posturl``
        # template carries no information about which post failed.
        print('No doc in %s' % source_url)
        return None

    crawled_time = utils.get_today_str()
    crawler_version = utils.get_version()
    # The recorded URL comes from the ``posturl`` template rather than the
    # address that was loaded.
    url = posturl % (blog_id, log_no)
    post_tags = tags[(blog_id, log_no)]
    directory_seq = None  # NOTE: No directory sequence given for query crawler

    post = btc.make_structure(blog_id, log_no, None, doc, crawled_time,
                              crawler_version, title, written_time, url,
                              post_tags, directory_seq)
    if not verbose:
        del post['directorySeq']
        del post['sympathyCount']
    return post