Example #1
def __init__(self):
    # Spider setup: simhash helper plus the first 13 /history/ listing pages.
    self.hashs = HashTool()
    self.start_urls = ['https://www.qiushibaike.com/history/page/1/']
    for i in range(2, 14):
        self.start_urls.append(
            'https://www.qiushibaike.com/history/page/{}/'.format(i))
    print(self.start_urls)
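For reference, the same start_urls list can be built in a single comprehension; this is only an equivalent sketch and it still assumes the project's HashTool helper is importable:

def __init__(self):
    self.hashs = HashTool()  # project helper, assumed available
    # pages 1-13 of the /history/ listing
    self.start_urls = [
        'https://www.qiushibaike.com/history/page/{}/'.format(i)
        for i in range(1, 14)
    ]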
Example #2
import json

import requests


def get_page(i):
    # Fetch page i of the Qiushibaike mobile text-list API and store every item.
    # HashTool (simhash helper) and save_sql (DB writer) are project-level helpers.
    hashs = HashTool()
    headers = {
        # 'GET' :'/article/list/text?count=30&page=1 HTTP/1.1',
        'Host': 'm2.qiushibaike.com',
        # 'Source':'ios_11.8.0',
        # 'Accept':'*/*',
        # 'app':'1',
        'Uuid': 'ios_1e111a1d65d34295b21321e18671b97e',
        # 'screen':'414,736',
        # 'qbaid':'D5B0B037-0DF9-410C-B3E9-4C1AB8A55C6B',
        # 'User-Agent':'QiuBai/11.8.0 rv:31 (iPhone; iOS 11.1; zh_CN) PLHttpClient/1_WIFI',
        # 'Accept-Language':'zh-Hans-CN;q=1',
        # 'Accept-Encoding':'br, gzip, deflate',
        # 'Connection':'keep-alive'
    }
    url = 'https://119.29.47.97/article/list/text?count=30&page=' + str(i)
    # The request goes to a bare IP (the Host header above names the real vhost),
    # so the TLS certificate will not match; verify=False skips the check.
    a = requests.get(url=url, headers=headers, verify=False)
    b = json.loads(a.text)
    items = b['items']

    for item in items:
        item_dic = {}
        formats = item['format']
        content = item['content']
        comments = item['comments_count']
        id = item['id']
        down = item['votes']['down']
        up = item['votes']['up']
        # user = item['user']['login']
        # astrology = item['user']['astrology']
        # user_id = item['user']['uid']
        # gender = item['user']['gender']
        # age = item['user']['age']
        share_count = item['share_count']
        item_dic['tag'] = formats
        item_dic['content'] = content
        item_dic['comments'] = comments
        # item_dic['types'] = types
        item_dic['oid'] = id
        item_dic['unlikes'] = -down  # raw down-vote value appears to be negative, so negate it
        item_dic['likes'] = up
        item_dic['title'] = ''
        item_dic['shares'] = share_count
        item_dic['url'] = 'https://www.qiushibaike.com/article/' + str(id)
        item_dic['simhash'] = hashs.get_hash(content)
        item_dic['platform'] = 1
        item_dic['weight'] = 0
        print(content)
        # item_dic['astrology'] = astrology
        # item_dic['user_id'] = user_id
        # item_dic['gender'] = gender
        # item_dic['age'] = age
        save_sql(item_dic)
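A minimal driver for get_page might look like the sketch below; the page range of 1-13 is an assumption, and HashTool/save_sql are still expected to come from the surrounding project:

import urllib3

# requests.get(..., verify=False) otherwise emits an InsecureRequestWarning per call
urllib3.disable_warnings()

if __name__ == '__main__':
    for page in range(1, 14):  # assumed page range; the API is paged with count=30&page=N
        get_page(page)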
Example #3
class QiuBaiFreshSpider(scrapy.Spider):
    name = 'qbfreshspider'

    def __init__(self):
        self.hashs = HashTool()
        self.start_urls = ['https://www.qiushibaike.com/textnew/page/1/']
        for i in range(2, 32):
            self.start_urls.append(
                'https://www.qiushibaike.com/textnew/page/{}/'.format(i))
        print(self.start_urls)

    def parse(self, response):
        flag = 1
        print('reached parse()')  # debug trace
        print(response.body)
        bodys = response.xpath(
            '//div[@class="article block untagged mb15"]').extract()
        print(bodys)
        i = 0
        for body in bodys:
            i = i + 1
            content = re.findall('<div class="content">.*?<span>(.*?)</span>',
                                 body, re.S)
            actor = re.findall('<h2>(.*?)</h2>', body, re.S)
            id = re.findall('<a href="/article/(.*?)" target="_blank" ', body,
                            re.S)
            item = ShortnewsInfoItem()
            item['oid'] = id[0]
            item['title'] = actor[0].replace('\n', '')
            try:
                item['content'] = content[0].replace('\n', '')
            except IndexError:
                # '暂无' ("none yet") is the placeholder used when no text content is found
                item['content'] = '暂无'
            item['platform'] = flag
            item['tag'] = 'Fresh'
            item['url'] = 'https://www.qiushibaike.com/article/' + item['oid']
            item['simhash'] = self.hashs.get_hash(item['content'])
            print(item)
            yield item
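One way to run this spider programmatically is Scrapy's CrawlerProcess; the sketch below assumes the class is importable and that no project-level settings are required:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
process.crawl(QiuBaiFreshSpider)
process.start()  # blocks until the crawl finishes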
Example #4
class QiuBai24HourSpider(scrapy.Spider):
    name = 'qb24hourspider'

    def __init__(self):
        self.hashs = HashTool()
        self.start_urls = ['https://www.qiushibaike.com/hot/page/1/']
        for i in range(2, 14):
            self.start_urls.append(
                'https://www.qiushibaike.com/hot/page/{}/'.format(i))
        print(self.start_urls)

    def parse(self, response):
        bodys = response.xpath(
            '//div[re:test(@id,"qiushi_tag_.*?")]').extract()
        flag = 1
        i = 0
        for body in bodys:
            i = i + 1
            content = re.findall('<div class="content">\n<span>(.*?)</span>',
                                 body, re.S)

            actor = re.findall('<h2>(.*?)</h2>', body, re.S)
            id = re.findall('<a href="/article/(.*?)" target="_blank" ', body,
                            re.S)
            item = ShortnewsInfoItem()
            item['oid'] = id[0]
            item['title'] = actor[0].replace('\n', '')
            try:
                item['content'] = content[0].replace('\n', '')
            except IndexError:
                # '暂无' ("none yet") is the placeholder used when no text content is found
                item['content'] = '暂无'
            item['platform'] = flag
            item['tag'] = '24hours'
            item['url'] = 'https://www.qiushibaike.com/article/' + item['oid']
            item['simhash'] = self.hashs.get_hash(item['content'])
            print(item)
            yield item
        print(i)
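All four examples delegate deduplication fingerprints to HashTool.get_hash(), whose source is not shown here. A minimal stand-in built on the third-party simhash package might look like this; only the class name and method signature are taken from the calls above, the rest is an assumption:

from simhash import Simhash  # third-party package: pip install simhash


class HashTool:
    # Assumed interface: get_hash(text) returns a 64-bit SimHash fingerprint
    # used for near-duplicate detection of scraped content.
    def get_hash(self, text):
        return Simhash(text).value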