Example #1
def extract_hot_weibo(doc):
    try:
        scripts = util.extract_script(doc)
        print(len(scripts))
        script = util.select_script(scripts,
                                    r'"domid":"Pl_Core_NewMixFeed__3"')
        html = util.extract_html_from_script(script.text.strip())
        html = etree.HTML(html)
        divs = html.xpath('//div[@action-type="feed_list_item"]')
        weibos = []
        for div in divs:
            try:
                weibo = Weibo()
                weibo["mid"] = div.attrib.get("mid")
                _div = div.xpath('.//a[@class="W_texta W_fb"]')[0]
                usercard = _div.attrib.get("usercard")
                end = usercard.index("&")
                weibo["uid"] = usercard[len("id="):end]
                link = div.xpath('.//*[@class="feed_from W_textb"]/a')[0]
                weibo["url"] = link.attrib.get("href")
                extract_content(div, weibo)
                extract_date_source(div, weibo)
                weibos.append(weibo)
            except:
                traceback.print_exc()

        return weibos
    except:
        traceback.print_exc()
        return None
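
All of the examples on this page lean on the same three util helpers. Their implementation is not shown here; the sketch below is an assumption about how they behave, based on how the call sites use them and on the fact that weibo.com pages of this era embedded each page fragment as a JSON "html" payload inside an FM.view(...) script call. Only the helper names come from the examples; the bodies are illustrative.

import json
import re

from lxml import etree


def extract_script(doc):
    # Assumed: parse the raw page and return every <script> element.
    return etree.HTML(doc).xpath('//script')


def select_script(scripts, pattern):
    # Assumed: return the first script whose text matches the regular
    # expression, or None when nothing matches.
    for script in scripts:
        if script.text and re.search(pattern, script.text):
            return script
    return None


def extract_html_from_script(text):
    # Assumed: pull the JSON argument out of an FM.view({...}) call and
    # return its "html" field, i.e. the embedded page fragment.
    match = re.search(r'FM\.view\((.*)\)', text)
    return json.loads(match.group(1))["html"]
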
Example #2
def extract_user_info(doc):
    try:
        scripts = util.extract_script(doc)
        script = util.select_script(scripts,
                                    r'"domid":"Pl_Official_PersonalInfo__63"')
        if script is None:
            script = util.select_script(
                scripts, r'"domid":"Pl_Official_PersonalInfo__62"')
        if script is None:
            script = util.select_script(
                scripts, r'"domid":"Pl_Official_PersonalInfo__61"')
        if script is None:
            script = util.select_script(
                scripts, r'"domid":"Pl_Official_PersonalInfo__59"')
        html = util.extract_html_from_script(script.text.strip())
        html = etree.HTML(html)

        lis = html.xpath(r'//ul/li')
        info = []
        for li in lis:
            text = li.xpath("string()")
            info.append(util.clean_text(text))
        level_info = extract_level_info(doc)
        if level_info:
            info.append(level_info)
        return info
    except:
        traceback.print_exc()
        return None
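
The four-step fallback chain above (and repeated in the next example) retries the same query with each known domid suffix. A loop over the candidates keeps that flat; a minimal sketch, assuming util.select_script as used throughout this page (select_personal_info_script is a hypothetical helper name):

def select_personal_info_script(scripts):
    # Try each known Pl_Official_PersonalInfo domid suffix in order and
    # return the first matching script, or None.
    for suffix in ("63", "62", "61", "59"):
        pattern = r'"domid":"Pl_Official_PersonalInfo__%s"' % suffix
        script = util.select_script(scripts, pattern)
        if script is not None:
            return script
    return None
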
Example #3
def extract_user_info(doc):
    try:
        scripts = util.extract_script(doc)
        script = util.select_script(
            scripts, r'"domid":"Pl_Official_PersonalInfo__63"'
            )
        if script is None:
            script = util.select_script(
                scripts, r'"domid":"Pl_Official_PersonalInfo__62"'
            )
        if script is None:
            script = util.select_script(
                scripts, r'"domid":"Pl_Official_PersonalInfo__61"'
            )
        if script is None:
            script = util.select_script(
                scripts, r'"domid":"Pl_Official_PersonalInfo__59"'
            )
        html = util.extract_html_from_script(script.text.strip())
        html = etree.HTML(html)

        lis = html.xpath(r'//ul/li')
        info = []
        for li in lis:
            text = li.xpath("string()")
            info.append(util.clean_text(text))
        level_info = extract_level_info(doc)
        if level_info:
            info.append(level_info)
        return info
    except:
        traceback.print_exc()
        return None
Example #4
def extract_searched_weibo(doc, page_num=None):
    try:
        scripts = util.extract_script(doc)
        script = util.select_script(scripts, r'"pid":"pl_weibo_direct"')
        html = util.extract_html_from_script(script.text.strip())
        html = etree.HTML(html)
        divs = html.xpath('//div[@action-type="feed_list_item"]')
        weibos = []
        for div in divs:
            try:
                weibo = Weibo()
                weibo["mid"] = div.attrib.get("mid")
                _div = div.xpath('.//a[@class="W_texta W_fb"]')[0]
                usercard = _div.attrib.get("usercard")
                end = usercard.index("&")
                weibo["uid"] = usercard[len("id="):end]
                weibos.append(weibo)
            except:
                traceback.print_exc()

        if page_num:
            try:
                lis = html.xpath(r'//span[@class="list"]/div/ul/li')
                text = lis[-1].xpath(".//text()")[0]
                total = int(text[1:-1])
                return weibos, total
            except:
                return weibos, 1
        else:
            return weibos
    except:
        if page_num:
            return None, None
        else:
            return None
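
Note the asymmetric contract: with page_num set, the function returns a (weibos, total) tuple, where total is the page count read from the pager, and (None, None) on failure; without it, a bare list or None. A usage sketch for paging through results (fetch and search_url are hypothetical stand-ins for the caller's HTTP layer):

doc = fetch(search_url)  # hypothetical HTTP fetch
weibos, total = extract_searched_weibo(doc, page_num=1)
if weibos is not None:
    for page in range(2, total + 1):
        more, _ = extract_searched_weibo(
            fetch(search_url + "&page=%d" % page), page_num=page)
        weibos.extend(more or [])
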
Example #5
def extract_content_html(self, html):
    """
    Extract the HTML fragment that contains the weibo content.
    """
    scripts = util.extract_script(html)
    script = util.select_script(scripts, r'pl.content.homefeed.index')
    text = script.text.strip()
    return util.extract_html_from_script(text)
Example #6
def extract_content_html(self, html):
    """
    Extract the HTML fragment that contains the weibo content.
    """
    scripts = util.extract_script(html)
    script = util.select_script(
        scripts, r'pl.content.homefeed.index'
    )
    text = script.text.strip()
    return util.extract_html_from_script(text)
Example #7
def extract_topic(doc):
    scripts = util.extract_script(doc)
    script = util.select_script(scripts, '"domid":"v6_pl_rightmod_recominfo"')
    text = script.text.strip()
    doc = util.extract_html_from_script(text)
    html = etree.HTML(doc)
    links = html.xpath('//ul[@class="hot_topic"]/li//a')
    topics = []
    for link in links:
        topics.append((link.attrib["href"], link.text.strip()))
    return topics
Example #8
def extract_level_info(doc):
    try:
        scripts = util.extract_script(doc)
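        # The pattern below is deliberately left unterminated: the domid
        # carries a numeric suffix that varies between pages, so a prefix
        # match catches every variant.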
        script = util.select_script(scripts,
                                    r'"domid":"Pl_Official_RightGrowNew')
        html = util.extract_html_from_script(script.text.strip())
        html = etree.HTML(html)
        p = html.xpath(r'//p[@class="level_info"]')
        info = None
        if p:
            text = p[0].xpath("string()")
            info = util.clean_text(text)
        return info
    except:
        traceback.print_exc()
        return None
Example #9
def extract_level_info(doc):
    try:
        scripts = util.extract_script(doc)
        script = util.select_script(
            scripts, r'"domid":"Pl_Official_RightGrowNew'
        )
        html = util.extract_html_from_script(script.text.strip())
        html = etree.HTML(html)
        p = html.xpath(r'//p[@class="level_info"]')
        info = None
        if p:
            text = p[0].xpath("string()")
            info = util.clean_text(text)
        return info
    except:
        traceback.print_exc()
        return None
Example #10
def extract_relation(doc):
    scripts = util.extract_script(doc)
    script = util.select_script(scripts, r'pl.content.followTab.index')
    html = util.extract_html_from_script(script.text.strip())
    html = etree.HTML(html)
    datas = html.xpath(r'.//ul[@class="follow_list"]/li/@action-data')
    for data in datas:
        try:
            followee = {}
            splits = data.split("&")
            for split in splits:
                _splits = split.split("=")
                followee[_splits[0]] = _splits[1]
            yield followee
        except:
            traceback.print_exc()
            continue
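
Splitting the action-data string on "&" and "=" by hand breaks as soon as a value itself contains "=" (padding in a base64-encoded fid, for example). The standard library's query-string parser avoids that; a hedged alternative for the inner loop, splitting each pair only on its first "=":

from urllib.parse import parse_qsl  # urlparse.parse_qsl on Python 2


def parse_action_data(data):
    # parse_qsl keeps values containing "=" intact and unescapes
    # percent-encoding as a bonus.
    return dict(parse_qsl(data, keep_blank_values=True))
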
Example #11
def extract_content_html(self, html, single=False):
    """
    Extract the HTML fragment that contains the weibo content.
    """
    scripts = util.extract_script(html)
    if not single:
        script = util.select_script(
            scripts, r'"domid":"Pl_Official_MyProfileFeed'
        )
        if not script:
            script = util.select_script(
                scripts, r'"domid":"v6_pl_content_homefeed"'
            )
    else:
        script = util.select_script(
            scripts, r'pl.content.weiboDetail.index'
        )
    text = script.text.strip()
    return util.extract_html_from_script(text)
Example #12
def extract_content_html(self, html, single=False, hot=False):
    """
    Extract the HTML fragment that contains the weibo content.
    """
    scripts = util.extract_script(html)
    if hot:
        script = util.select_script(
            scripts, r'"domid":"Pl_Core_NewMixFeed__3"'
        )
    elif not single:
        script = util.select_script(
            scripts, r'"domid":"Pl_Official_MyProfileFeed'
        )
        if not script:
            script = util.select_script(
                scripts, r'"domid":"v6_pl_content_homefeed"'
            )
    else:
        script = util.select_script(
            scripts, r'pl.content.weiboDetail.index'
        )
    text = script.text.strip()
    return util.extract_html_from_script(text)
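
The two variants above show the same method growing keyword flags as feed types were added; callers pick the feed kind at the call site. A usage sketch (parser and page are hypothetical):

html = parser.extract_content_html(page)               # profile/home feed
html = parser.extract_content_html(page, single=True)  # one weibo's detail page
html = parser.extract_content_html(page, hot=True)     # the hot/mixed feed
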
Example #13
def extract_inbox_comment(data):
    comments = []
    try:
        scripts = util.extract_script(data)
        script = util.select_script(scripts,
                                    r'"domid":"v6_pl_content_commentlist"')
        text = script.text.strip()
        doc = util.extract_html_from_script(text)
        html = etree.HTML(doc)
        divs = html.xpath('//div[@node-type="feed_commentList_comment"]')
    except:
        return comments

    for div in divs:
        try:
            weibo_url, comment = extract_individual_comment(div)
            comments.append((weibo_url, comment))
        except:
            pass
    return comments
Example #14
def extract_inbox_comment(data):
    comments = []
    try:
        scripts = util.extract_script(data)
        script = util.select_script(
            scripts, r'"domid":"v6_pl_content_commentlist"'
        )
        text = script.text.strip()
        doc = util.extract_html_from_script(text)
        html = etree.HTML(doc)
        divs = html.xpath('//div[@node-type="feed_commentList_comment"]')
    except:
        return comments

    for div in divs:
        try:
            weibo_url, comment = extract_individual_comment(div)
            comments.append((weibo_url, comment))
        except:
            pass
    return comments
Example #15
def extract_searched_weibo(doc, page_num=None):
    try:
        scripts = util.extract_script(doc)
        script = util.select_script(scripts, r'"pid":"pl_weibo_direct"')
        html = util.extract_html_from_script(script.text.strip())
        html = etree.HTML(html)
        divs = html.xpath('//div[@action-type="feed_list_item"]')
        weibos = []
        for div in divs:
            try:
                weibo = Weibo()
                weibo["mid"] = div.attrib.get("mid")
                _div = div.xpath('.//a[@class="W_texta W_fb"]')[0]
                usercard = _div.attrib.get("usercard")
                end = usercard.index("&")
                weibo["uid"] = usercard[len("id="):end]
                link = div.xpath('.//*[@class="feed_from W_textb"]/a')[0]
                weibo["url"] = link.attrib.get("href")
                extract_content(div, weibo)
                extract_date_source(div, weibo)
                weibos.append(weibo)
            except:
                traceback.print_exc()

        if page_num:
            try:
                lis = html.xpath(r'//span[@class="list"]/div/ul/li')
                text = lis[-1].xpath(".//text()")[0]
                total = int(text[1:-1])
                return weibos, total
            except:
                return weibos, 1
        else:
            return weibos
    except:
        if page_num:
            return None, None
        else:
            return None