def crawl_blog_post(blog_id, log_no, tags, written_time=None, verbose=True):
    """Crawl a single Naver blog post identified by (blog_id, log_no) and
    return it as a structured dict, or None if the page could not be fetched."""

    def get_title(root):
        result = ''
        # Title element used by the older editor layout.
        try:
            result = root.xpath('//h3[@class="tit_h3"]/text()')[0].strip()
        except Exception:
            pass
        if result != '':
            return result

        # Fall back to the title element used by SmartEditor posts.
        try:
            result = root.xpath('//h3[@class="se_textarea"]/text()')[0].strip()
        except Exception:
            pass
        return result

    def get_page_html(url):
        try:
            page = requests.get(url, headers=headers)
            root = html.fromstring(page.content)
            elem = root.xpath('//div[@class="_postView"]')[0]
            html_ = etree.tostring(elem)
            return (BeautifulSoup(html_, 'lxml'), get_title(root))
        except IOError:
            print 'Failed to fetch %s' % url
            return (None, None)

    # mobileurl is a module-level URL template for the mobile post view.
    url = mobileurl % (blog_id, log_no)

    (doc, title)    = get_page_html(url)

    if doc:
        crawled_time    = utils.get_today_str()
        crawler_version = utils.get_version()
        #url             = posturl % (blog_id, log_no)
        post_tags       = tags[(blog_id, log_no)]
        directory_seq   = None  # NOTE: No directory sequence given for query crawler

        post = btc.make_structure(blog_id, log_no, None, doc, crawled_time,
                crawler_version, title, written_time, url, post_tags, directory_seq)
        if not verbose:
            del post['directorySeq']
            del post['sympathyCount']
        return post

    else:
        print 'No doc in %s' % url
        return None
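crawl_blog_post relies on several module-level names defined elsewhere in the crawler: requests, lxml's html and etree, BeautifulSoup, the headers dict, the mobileurl and posturl templates, and the utils and btc helper modules. A minimal sketch of that setup follows; the URL patterns, User-Agent string, and module names are illustrative assumptions, not the project's actual values.

# Minimal module-level setup assumed by crawl_blog_post (a sketch; the URL
# templates, User-Agent, and module names below are assumptions).
import requests
from lxml import html, etree
from bs4 import BeautifulSoup

import utils                      # project helpers: get_today_str(), get_version()
import blog_text_crawler as btc   # hypothetical module providing make_structure()

# Assumed Naver post URL patterns; the mobile view is what gets fetched and parsed.
mobileurl = 'http://m.blog.naver.com/%s/%s'
posturl = 'http://blog.naver.com/%s/%s'

# Browser-like User-Agent header passed to requests.get.
headers = {'User-Agent': 'Mozilla/5.0'}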
def make_structure(blog_id, log_no, raw, doc, crawled_time, crawler_version,
                   title, written_time, url, tags, directory_seq,
                   encoding='utf-8'):
    """Assemble the crawled post into the dict stored by the crawler."""

    def extract_category(doc):
        # The category link may carry the marker as a class or as an id attribute.
        doc_node = None
        try:
            doc_node = doc.find("a", {"class": "_categoryName"})
        except Exception:
            pass
        if doc_node is None:
            doc_node = doc.find("a", {"id": "_categoryName"})
        if doc_node is None:
            return ''
        return doc_node.get_text().encode(encoding)
    extract_content_html = lambda doc: doc.find("div", {"id": "viewTypeSelector"})

    def extract_sympathycount(doc):
        node = doc.find("em", {"id": "sympathyCount"})
        if node is None:
            return None
        return node.get_text()

    def extract_images(htmls=extract_content_html(doc)):
        # Collect image URLs from each thumbnail attribute (with the 'w2' size suffix).
        image_urls = []
        images = htmls.find_all("span", {"class": "_img _inl fx"})
        for image in images:
            image_urls.append(image['thumburl'] + 'w2')
        return image_urls

    return {u"blogId": blog_id,
            u"logNo": log_no,
            u"content": extract_content_html(doc).get_text().encode(encoding),
            u"contentHtml": str(extract_content_html(doc)),
            u"crawledTime": crawled_time,
            u"crawlerVersion": crawler_version,
            u"directorySeq": directory_seq,
            u"title": title,
            u"writtenTime": written_time,
            u"url": url,
            u"tags": tags,
            u"categoryName": extract_category(doc),
            u"sympathyCount": extract_sympathycount(doc),
            u"images": extract_images()}
Example #4
def crawl_blog_post(blog_id, log_no, tags, written_time=None, verbose=True):
    # Variant that lets lxml fetch the page itself and records the desktop post URL.

    def get_title(root):
        return root.xpath('//h3[@class="tit_h3"]/text()')[0].strip()

    def get_page_html(url):
        try:
            root = html.parse(url)
            elem = root.xpath('//div[@class="_postView"]')[0]
            html_ = etree.tostring(elem)
            return (BeautifulSoup(html_, 'lxml'), get_title(root))
        except IOError:
            print 'Failed to fetch %s' % url
            return (None, None)

    if blog_id.startswith('http'):
        url = blog_id
    else:
        url = mobileurl % (blog_id, log_no)

    (doc, title)    = get_page_html(url)

    if doc:
        crawled_time    = utils.get_today_str()
        crawler_version = utils.get_version()
        url             = posturl % (blog_id, log_no)
        post_tags       = tags[(blog_id, log_no)]
        directory_seq   = None  # NOTE: No directory sequence given for query crawler

        post = btc.make_structure(blog_id, log_no, None, doc, crawled_time,
                crawler_version, title, written_time, url, post_tags, directory_seq)
        if not verbose:
            del post['directorySeq']
            del post['sympathyCount']
        return post

    else:
        print 'No doc in %s' % url
        return None
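Since tags is keyed by (blog_id, log_no) pairs, either variant can be driven from such a map in a simple loop; a sketch follows, where the crawl_all helper name is not part of the original code.

# Hypothetical batch driver over a {(blog_id, log_no): [tag, ...]} mapping.
def crawl_all(tags):
    posts = []
    for blog_id, log_no in tags:
        post = crawl_blog_post(blog_id, log_no, tags)
        if post is not None:
            posts.append(post)
    return posts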