コード例 #1
0
ファイル: bgm_tv.py プロジェクト: pleiadesian/bgm-ip-viewer
def get_score_details(response: TypeResponse):
    """Collect the rating histogram shown in the ``ChartWarpper`` element.

    Returns a dict whose ``'total'`` entry holds the text of the summary
    ``<span>`` and whose remaining entries map each bar's label text to
    that bar's count text with its first and last characters removed.
    """
    total_text = response.xpath(
        '//*[@id="ChartWarpper"]/div/small/span/text()').extract_first()
    detail = {'total': total_text}

    bars = response.xpath(
        '//*[@id="ChartWarpper"]/ul[@class="horizontalChart"]/li')
    for bar in bars:
        label = bar.xpath('.//span[@class="label"]/text()').extract_first()
        count_text = bar.xpath(
            './/span[@class="count"]/text()').extract_first()
        # Drop the first and last characters of the count text
        # (presumably surrounding brackets -- confirm against the page).
        detail[label] = count_text[1:-1]
    return detail
コード例 #2
0
 def parse(self, response: TypeResponse):
     """Schedule one hard-coded group topic for ``parse_topic``.

     NOTE(review): the bare ``return`` right after the first request ends
     the generator, so the loop over ``eden_tpc_list`` below it never
     runs. This looks like a debugging short-circuit -- confirm whether
     the listing crawl is meant to be re-enabled.
     """
     pinned_topic = 'https://mirror.bgm.rin.cat/group/topic/350626'
     yield Request(pinned_topic, callback=self.parse_topic)
     return
     # Unreachable while the ``return`` above is in place: would follow
     # every topic in the listing, rewriting the link path first.
     for entry in response.xpath('//*[@id="eden_tpc_list"]/ul/li'):
         href = entry.xpath('./a/@href').extract_first()
         topic_path = href.replace('/rakuen/topic/group/', '/group/topic/')
         yield Request(response.urljoin(topic_path),
                       callback=self.parse_topic)
コード例 #3
0
    def parse(self, response: TypeResponse):
        """Request every subject page linked from the two entry tabs.

        Hrefs are gathered from the ``wikiEntryMainTab`` and
        ``latestEntryMainTab`` lists and de-duplicated. Each href that
        contains ``/subject/`` is requested with ``dont_cache`` set in the
        request meta and handed to ``parse_page``.
        """
        tab_queries = (
            '//*[@id="wikiEntryMainTab"]//li/a/@href',
            '//*[@id="latestEntryMainTab"]//li/a/@href',
        )
        hrefs = set()
        for query in tab_queries:
            hrefs.update(response.xpath(query).extract())

        for href in hrefs:
            if '/subject/' not in href:
                continue
            yield Request(
                response.urljoin(href),
                callback=self.parse_page,
                meta={'dont_cache': True},
            )
コード例 #4
0
ファイル: bgm_tv.py プロジェクト: pleiadesian/bgm-ip-viewer
def get_image(response: TypeResponse):
    """Return the subject's cover image location, without a URL scheme.

    When a cover ``<img>`` is found under ``bangumiInfo``, its ``src`` is
    returned with the ``//lain.bgm.tv/pic/cover/c/`` prefix rewritten to
    ``lain.bgm.tv/pic/cover/g/``. Otherwise the placeholder icon path is
    returned.
    """
    cover_src = response.xpath('//*[@id="bangumiInfo"]/div/div/a/img/@src')
    if not cover_src:
        # No cover element matched; fall back to the generic icon.
        return 'lain.bgm.tv/img/no_icon_subject.png'

    src = cover_src.extract_first()
    return src.replace('//lain.bgm.tv/pic/cover/c/',
                       'lain.bgm.tv/pic/cover/g/')
コード例 #5
0
ファイル: bgm_tv.py プロジェクト: pleiadesian/bgm-ip-viewer
def get_info(response: TypeResponse):
    """Parse the ``infobox`` list into a ``{label: [values]}`` dict.

    For every ``<li>`` the label is the text of its ``<span>`` child with
    every ``': '`` removed. The values are the texts of the item's direct
    ``<a>`` children or, when there are none, the item's own text nodes.

    Rows without a label ``<span>`` text are skipped rather than raising
    ``AttributeError``. When a label occurs more than once, the last
    occurrence wins.

    Returns:
        A plain ``dict`` mapping label strings to lists of strings.
    """
    info = {}
    for info_el in response.xpath('//*[@id="infobox"]/li'):
        label = info_el.xpath('span/text()').extract_first()
        if label is None:
            # Nothing to key this row on.
            continue
        values = (info_el.xpath('a/text()').extract()
                  or info_el.xpath('text()').extract())
        info[label.replace(': ', '')] = values
    return info
コード例 #6
0
ファイル: bgm_tv.py プロジェクト: pleiadesian/bgm-ip-viewer
def get_tag_from_response(response: TypeResponse, subject_id):
    """Yield one ``TagItem`` per named link in the subject's tag section.

    Links whose ``<span>`` has no text (or empty text) are skipped. The
    count is the integer value of the link's ``<small>`` text.
    """
    tag_links = response.xpath(
        '//*[@id="subject_detail"]//div[@class="subject_tag_section"]/div[@class="inner"]/a'
    )
    for link in tag_links:
        name = link.xpath('span/text()').extract_first()
        if name:
            count_text = link.xpath('small/text()').extract_first()
            yield TagItem(subject_id=subject_id,
                          text=name,
                          count=int(count_text))
コード例 #7
0
    def parse_topic(self, response: TypeResponse):
        """Parse a group topic page into reply items and a ``TopicItem``.

        Every item produced by ``parse_row_reply`` for the ``row_reply``
        rows of ``comment_list`` is yielded first. The topic is yielded
        last so that its ``last_reply`` holds the newest ``create_time``
        among those items, or the topic's own ``create_time`` when none
        is newer.

        Raises:
            KeyError: if no author link is found in the opening post.
        """
        topic = TopicItem()

        content_el = response.xpath('//*[contains(@class, "topic_content")]')
        post_topic = response.xpath('//*[contains(@class, "postTopic")]')
        topic['id'] = response.url.split('/')[-1]
        topic['content'] = parse_content(content_el)
        topic['group'] = response.xpath(
            '//*[@id="pageHeader"]/h1/span/a[1]/@href').extract_first().split(
                '/')[-1]
        topic['title'] = response.xpath(
            '//*[@id="pageHeader"]/h1/text()').extract_first()

        author_href = post_topic.xpath(
            './div[contains(@class, "inner")]//a/@href').extract_first()
        if not author_href:
            raise KeyError('no author')
        # The author identifier is the last path segment of the link.
        topic['author'] = author_href.split('/')[-1]

        create_time = post_topic.xpath(
            './div[contains(@class, "re_info")]/small/text()').extract_first()
        topic['create_time'] = parse_datetime(create_time)

        # ``last_reply`` is derived from the parsed replies below instead
        # of a separate scan over every timestamp on the page.
        last_reply = topic['create_time']
        comments = response.xpath('//*[@id="comment_list"]')
        for row in comments.xpath('./div[contains(@class, "row_reply")]'):
            for item in parse_row_reply(response, row, topic['id']):
                if item['create_time'] > last_reply:
                    last_reply = item['create_time']
                yield item
        topic['last_reply'] = last_reply
        yield topic
コード例 #8
0
ファイル: bgm_tv.py プロジェクト: pleiadesian/bgm-ip-viewer
    def parse(self, response: TypeResponse):
        """Parse a subject page into tag, relation and subject items.

        Nothing is yielded when the page text contains ``出错了``.
        Otherwise the ``TagItem`` objects are yielded first, then one
        ``RelationItem`` per edge from ``get_relation``, and finally the
        populated ``SubjectItem``.
        """
        subject_id = int(response.url.split('/')[-1])
        if '出错了' in response.text:
            return

        subject_item = SubjectItem()
        if '条目已锁定' in response.text:
            subject_item['id'] = subject_id
            subject_item['locked'] = True

        type_text = response.xpath(
            '//*[@id="panelInterestWrapper"]//div[contains(@class, '
            '"global_score")]'
            '/div/small[contains(@class, "grey")]/text()').extract_first()
        # The subject type is the second whitespace-separated token.
        subject_item['subject_type'] = type_text.split()[1]
        subject_item['id'] = subject_id

        subject_item['info'] = get_info(response)
        # Placeholder value; the tags themselves are emitted as separate
        # items right below.
        subject_item['tags'] = 'tags'
        yield from get_tag_from_response(response, subject_id)
        subject_item['image'] = get_image(response)
        subject_item['score'] = get_score(response)
        subject_item['score_details'] = get_score_details(response)

        title_link = response.xpath('//*[@id="headerSubject"]/h1/a')[0]
        subject_item['name_cn'] = title_link.attrib['title']
        subject_item['name'] = title_link.xpath('text()').extract_first()

        # this will set 'wishes', 'done', 'doings', 'on_hold', 'dropped'
        subject_item.update(get_collector_count(response))

        for edge in get_relation(response, source=subject_id):
            yield RelationItem(**edge)
        yield subject_item
コード例 #9
0
ファイル: bgm_tv.py プロジェクト: pleiadesian/bgm-ip-viewer
def get_collector_count(response: TypeResponse):
    """Read the collection counters from the ``subjectPanelCollect`` panel.

    For every ``key -> href suffix`` pair in the module-level ``collector``
    mapping, the link whose ``href`` ends with that suffix is looked up
    and the part of its text before ``人`` is converted to ``int``. A
    missing or empty text counts as ``0``.
    """
    counts = {}
    for key, suffix in collector.items():
        query = (
            '//*[@id="subjectPanelCollect"]/span[@class="tip_i"]'
            '/a[re:test(@href, "{}$")]/text()'.format(suffix)
        )
        text = response.xpath(
            query, namespaces={'re': regexpNS}).extract_first()
        counts[key] = int(text.split('人')[0]) if text else 0
    return counts
コード例 #10
0
ファイル: bgm_tv.py プロジェクト: pleiadesian/bgm-ip-viewer
def get_relation(response: TypeResponse, source):
    """Collect the related-subject edges listed on a subject page.

    The ``<li>`` items of the related-entries list are walked in page
    order. An item whose ``class`` contains ``sep`` starts a new group,
    and the text of its ``<span>`` labels every item up to the next
    separator. The first item always starts a group, so a list that does
    not begin with a separator no longer raises ``IndexError``. Items
    without a link are skipped instead of raising ``AttributeError``.

    Args:
        response: the subject page response.
        source: identifier stored as ``'source'`` on every edge.

    Returns:
        A list of ``{'source', 'target', 'relation'}`` dicts, where
        ``target`` is the last path segment of the item's link as ``int``.
    """
    # NOTE(review): the inner ``//h2`` is an absolute path, so the
    # predicate holds for every ``subject_section`` as long as such a
    # heading exists anywhere in the document -- confirm whether ``.//h2``
    # was intended.
    items = response.xpath(
        '//div[@class="subject_section"][//h2[@class="subtitle" and contains('
        'text(), "关联条目")]]'
        '/div[@class="content_inner"]/ul/li')

    relation = []
    rel = None
    for index, li in enumerate(items):
        if index == 0 or 'sep' in li.attrib.get('class', ''):
            rel = li.xpath('span/text()').extract_first()
        target = li.xpath('a/@href').extract_first()
        if target is None:
            continue
        relation.append({
            'source': source,
            'target': int(target.split('/')[-1]),
            'relation': rel,
        })
    return relation
コード例 #11
0
ファイル: bgm_tv.py プロジェクト: pleiadesian/bgm-ip-viewer
def get_score(response: TypeResponse):
    """Return the text of the first ``<span>`` in the ``global_score`` box.

    The value is returned as extracted, without numeric conversion.
    """
    score_nodes = response.xpath(
        '//*[@id="panelInterestWrapper"]//div[@class="global_score"]/span['
        '1]/text()')
    return score_nodes.extract_first()