Example #1
def extract_user_info(doc):
    try:
        scripts = util.extract_script(doc)
        # The personal-info pane's domid suffix varies across page
        # versions, so try each known variant until one matches.
        script = None
        for suffix in ("63", "62", "61", "59"):
            script = util.select_script(
                scripts, r'"domid":"Pl_Official_PersonalInfo__%s"' % suffix
            )
            if script is not None:
                break
        html = util.extract_html_from_script(script.text.strip())
        html = etree.HTML(html)

        lis = html.xpath(r'//ul/li')
        info = []
        for li in lis:
            text = li.xpath("string()")
            info.append(util.clean_text(text))
        level_info = extract_level_info(doc)
        if level_info:
            info.append(level_info)
        return info
    except:
        traceback.print_exc()
        return None
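
Every example in this listing leans on three project-local helpers (util.extract_script, util.select_script, util.extract_html_from_script) plus lxml's etree and the standard re, json, and traceback modules, none of which the listing shows. The following is a minimal sketch of what those helpers plausibly do, assuming the usual Weibo FM.view({...}) script payloads that Example #12 below parses by hand; the source project's real util module may differ.

import json
import re

from lxml import etree


def extract_script(doc):
    # Collect every <script> element from the raw page source.
    return etree.HTML(doc).xpath("//script")


def select_script(scripts, pattern):
    # Return the first script whose body matches the given regex.
    regex = re.compile(pattern)
    for script in scripts:
        if script.text and regex.search(script.text):
            return script
    return None


def extract_html_from_script(text):
    # FM.view(...) wraps a JSON object whose "html" field holds the
    # rendered page fragment; pull that object out and return the field.
    payload = re.search(r"\((\{.*\})\)", text, re.S).group(1)
    return json.loads(payload)["html"]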
Example #2
def extract_hot_weibo(doc):
    try:
        scripts = util.extract_script(doc)
        print(len(scripts))
        script = util.select_script(scripts,
                                    r'"domid":"Pl_Core_NewMixFeed__3"')
        html = util.extract_html_from_script(script.text.strip())
        html = etree.HTML(html)
        divs = html.xpath('//div[@action-type="feed_list_item"]')
        weibos = []
        for div in divs:
            try:
                weibo = Weibo()
                weibo["mid"] = div.attrib.get("mid")
                _div = div.xpath('.//a[@class="W_texta W_fb"]')[0]
                usercard = _div.attrib.get("usercard")
                end = usercard.index("&")
                weibo["uid"] = usercard[len("id="):end]
                link = div.xpath('.//*[@class="feed_from W_textb"]/a')[0]
                weibo["url"] = link.attrib.get("href")
                extract_content(div, weibo)
                extract_date_source(div, weibo)
                weibos.append(weibo)
            except:
                traceback.print_exc()

        return weibos
    except:
        traceback.print_exc()
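
The Weibo object is assigned to like a mapping (weibo["mid"] = ...), which matches a Scrapy-style Item. A hypothetical declaration covering just the fields these examples fill directly; the source project's real class may define more, e.g. whatever extract_content and extract_date_source write.

import scrapy


class Weibo(scrapy.Item):
    # Hypothetical fields inferred from the keys the examples assign.
    mid = scrapy.Field()
    uid = scrapy.Field()
    url = scrapy.Field()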
Example #3
def extract_searched_weibo(doc, page_num=None):
    try:
        scripts = util.extract_script(doc)
        script = util.select_script(scripts, r'"pid":"pl_weibo_direct"')
        html = util.extract_html_from_script(script.text.strip())
        html = etree.HTML(html)
        divs = html.xpath('//div[@action-type="feed_list_item"]')
        weibos = []
        for div in divs:
            try:
                weibo = Weibo()
                weibo["mid"] = div.attrib.get("mid")
                _div = div.xpath('.//a[@class="W_texta W_fb"]')[0]
                usercard = _div.attrib.get("usercard")
                end = usercard.index("&")
                weibo["uid"] = usercard[len("id="):end]
                weibos.append(weibo)
            except:
                traceback.print_exc()

        if page_num:
            try:
                lis = html.xpath(r'//span[@class="list"]/div/ul/li')
                text = lis[-1].xpath(".//text()")[0]
                total = int(text[1:-1])
                return weibos, total
            except:
                return weibos, 1
        else:
            return weibos
    except:
        if page_num:
            return None, None
        else:
            return None
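
Because the return shape depends on page_num (a bare list, or a (list, total) pair, with None or (None, None) on failure), callers have to unpack accordingly. A hypothetical driver, assuming fetch(url) returns the raw HTML of one search page and that the search URL takes a page parameter:

def crawl_search(fetch, base_url):
    # Ask the first page for the total page count, then walk the
    # remaining pages using the single-value return form.
    weibos, total = extract_searched_weibo(fetch(base_url + "&page=1"),
                                           page_num=True)
    if weibos is None:
        return []
    results = list(weibos)
    for page in range(2, total + 1):
        more = extract_searched_weibo(fetch(base_url + "&page=%d" % page))
        if more:
            results.extend(more)
    return results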
Example #4
def extract_search_result_count(doc):
    scripts = util.extract_script(doc)
    script = util.select_script(scripts, r'"pid":"pl_weibo_direct"')
    text = script.text.strip()
    # "找到(\d+)条结果" matches the "found N results" banner on the
    # search page; return N as an int, or None if the banner is absent.
    match = re.search(r"找到(\d+)条结果", text)
    if match:
        return int(match.group(1))
    return None
Example #5
    def extract_content_html(self, html):
        """
        Extract the html fragment that contains the weibo content.
        """
        scripts = util.extract_script(html)
        script = util.select_script(scripts, r'pl.content.homefeed.index')
        text = script.text.strip()
        return util.extract_html_from_script(text)
Example #6
def extract_topic(doc):
    scripts = util.extract_script(doc)
    script = util.select_script(scripts, '"domid":"v6_pl_rightmod_recominfo"')
    text = script.text.strip()
    doc = util.extract_html_from_script(text)
    html = etree.HTML(doc)
    links = html.xpath('//ul[@class="hot_topic"]/li//a')
    topics = []
    for link in links:
        topics.append((link.attrib["href"], link.text.strip()))
    return topics
Example #7
def extract_level_info(doc):
    try:
        scripts = util.extract_script(doc)
        script = util.select_script(scripts,
                                    r'"domid":"Pl_Official_RightGrowNew')
        html = util.extract_html_from_script(script.text.strip())
        html = etree.HTML(html)
        p = html.xpath(r'//p[@class="level_info"]')
        # Default to None: without it, "info" would be unbound (and the
        # bare except would mask the UnboundLocalError) whenever the
        # xpath matches nothing.
        info = None
        if p:
            text = p[0].xpath("string()")
            info = util.clean_text(text)
        return info
    except:
        traceback.print_exc()
        return None
Example #8
def extract_relation(doc):
    scripts = util.extract_script(doc)
    script = util.select_script(scripts, r'pl.content.followTab.index')
    html = util.extract_html_from_script(script.text.strip())
    html = etree.HTML(html)
    datas = html.xpath(r'.//ul[@class="follow_list"]/li/@action-data')
    for data in datas:
        try:
            followee = {}
            splits = data.split("&")
            for split in splits:
                _splits = split.split("=")
                followee[_splits[0]] = _splits[1]
            yield followee
        except:
            traceback.print_exc()
            continue
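
The manual split("&")/split("=") loop above breaks on values that contain "=" and ignores percent-encoding. The standard library's parse_qsl handles both, assuming the action-data attribute really is a query string:

from urllib.parse import parse_qsl  # Python 2: from urlparse import parse_qsl


def parse_action_data(data):
    # Drop-in alternative to the manual loop: same key/value mapping,
    # but percent-escapes are decoded as well.
    return dict(parse_qsl(data))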
Example #9
    def extract_content_html(self, html, single=False):
        """
        Extract the html fragment that contains the weibo content.
        """
        scripts = util.extract_script(html)
        if not single:
            script = util.select_script(
                scripts, r'"domid":"Pl_Official_MyProfileFeed'
            )
            if not script:
                script = util.select_script(
                    scripts, r'"domid":"v6_pl_content_homefeed"'
                )
        else:
            script = util.select_script(
                scripts, r'pl.content.weiboDetail.index'
            )
        text = script.text.strip()
        return util.extract_html_from_script(text)
Example #10
    def extract_content_html(self, html, single=False, hot=False):
        """
        Extract the html fragment that contains the weibo content.
        """
        scripts = util.extract_script(html)
        if hot:
            script = util.select_script(scripts,
                                        r'"domid":"Pl_Core_NewMixFeed__3"')
        elif not single:
            script = util.select_script(scripts,
                                        r'"domid":"Pl_Official_MyProfileFeed')
            if not script:
                script = util.select_script(
                    scripts, r'"domid":"v6_pl_content_homefeed"')
        else:
            script = util.select_script(scripts,
                                        r'pl.content.weiboDetail.index')
        text = script.text.strip()
        return util.extract_html_from_script(text)
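
A minimal sketch of how such a method is typically chained with lxml, assuming page_source holds the raw HTML of a fetched page (the feed_list_item xpath is the same one the other examples select on):

from lxml import etree


def parse_feed_items(self, page_source):
    # Hypothetical caller: rebuild a DOM from the embedded fragment
    # and select the individual feed entries.
    fragment = self.extract_content_html(page_source, hot=True)
    dom = etree.HTML(fragment)
    return dom.xpath('//div[@action-type="feed_list_item"]')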
Example #11
def extract_inbox_comment(data):
    comments = []
    try:
        scripts = util.extract_script(data)
        script = util.select_script(scripts,
                                    r'"domid":"v6_pl_content_commentlist"')
        text = script.text.strip()
        doc = util.extract_html_from_script(text)
        html = etree.HTML(doc)
        divs = html.xpath('//div[@node-type="feed_commentList_comment"]')
    except:
        return comments

    for div in divs:
        try:
            weibo_url, comment = extract_individual_comment(div)
            comments.append((weibo_url, comment))
        except:
            pass
    return comments
Example #12
def extract_user(doc, page_num=None):
    try:
        scripts = util.extract_script(doc)
        script = util.select_script(scripts, r'"pid":"pl_user_feedList"')
        json_data = re.findall(r"\(({.*})\)", script.text)[0]
        json_data = json.loads(json_data)
        html = etree.HTML(json_data["html"])
        divs = html.xpath(r'//div[@class="list_person clearfix"]')
        users = []
        for div in divs:
            try:
                user = {}
                detail = div.xpath(r'.//div[@class="person_detail"]')[0]
                _as = detail.xpath(r'.//p[@class="person_name"]/a')
                if len(_as) >= 1:
                    user["uid"] = _as[0].attrib.get("uid")
                    user["nick"] = _as[0].attrib.get("title")
                    user["home_url"] = _as[0].attrib.get("href")
                    if len(_as) > 1:
                        if _as[1].attrib.get("alt") is not None:
                            user["verify"] = _as[1].attrib.get("alt")

                users.append(user)
            except:
                traceback.print_exc()
                continue
        if page_num:
            try:
                lis = html.xpath(r'//span[@class="list"]/div/ul/li')
                text = lis[-1].xpath(".//text()")[0]
                total = int(text[1:-1])
                return users, total
            except:
                return users, 1
        else:
            return users
    except:
        if page_num:
            return None, None
        else:
            return None
Example #13
def extract_searched_weibo(doc, page_num=None):
    try:
        scripts = util.extract_script(doc)
        script = util.select_script(scripts, r'"pid":"pl_weibo_direct"')
        html = util.extract_html_from_script(script.text.strip())
        html = etree.HTML(html)
        divs = html.xpath('//div[@action-type="feed_list_item"]')
        weibos = []
        for div in divs:
            try:
                weibo = Weibo()
                weibo["mid"] = div.attrib.get("mid")
                _div = div.xpath('.//a[@class="W_texta W_fb"]')[0]
                usercard = _div.attrib.get("usercard")
                end = usercard.index("&")
                weibo["uid"] = usercard[len("id="):end]
                link = div.xpath('.//*[@class="feed_from W_textb"]/a')[0]
                weibo["url"] = link.attrib.get("href")
                extract_content(div, weibo)
                extract_date_source(div, weibo)
                weibos.append(weibo)
            except:
                traceback.print_exc()

        if page_num:
            try:
                lis = html.xpath(r'//span[@class="list"]/div/ul/li')
                text = lis[-1].xpath(".//text()")[0]
                total = int(text[1:-1])
                return weibos, total
            except:
                return weibos, 1
        else:
            return weibos
    except:
        if page_num:
            return None, None
        else:
            return None