Example #1
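A truncated __init__ method from a Spider subclass: it builds a SimpyderConfig (user agent, parse-thread count, log level, download interval) and applies it with set_config.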
    def __init__(self, name, thread=5, interval=1):
        super().__init__()
        self.name = name
        self.db = db  # db is a shared MongoDB handle (cf. "from db import db" in Example #7)
        sc = SimpyderConfig()
        sc.USER_AGENT = FAKE_UA
        sc.PARSE_THREAD_NUMER = thread  # sic: this is how simpyder spells the attribute
        sc.LOG_LEVEL = "INFO"
        sc.DOWNLOAD_INTERVAL = interval
        self.set_config(sc)
Example #2
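The tail of an author spider's save method: it upserts per-author data documents into MongoDB, fires an optional user-record callback, then configures and runs the spider.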
                    '$position': 0
                }
            }
        }, True)
        item['data']['mid'] = item['mid']
        db.author_data.replace_one(
            {
                'mid': item['data']['mid'],
                'datetime': item['data']['datetime']
            },
            item['data'],
            upsert=True)
        if 'object_id' in item:
            self.sentCallBack(item['object_id'], db['user_record'])
        return item


s = BiliobAuthorSpider("旧作者爬虫")  # "旧作者爬虫" = "legacy author spider"
sc = SimpyderConfig()
sc.PARSE_THREAD_NUMER = 1
sc.LOG_LEVEL = "INFO"
sc.USER_AGENT = FAKE_UA
sc.DOWNLOAD_INTERVAL = 0.15
s.set_config(sc)

coll = db['author']
if __name__ == "__main__":
    s.config.LOG_LEVEL = 'DEBUG'  # raise verbosity when run directly

    s.run()
Example #3
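The tail of a tag spider's save method: it stores a video's tag list, matching on bvid when a document with that bvid already exists and on aid otherwise.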
            item['tag_list'] = [None]
        # If the video document already exists under its bvid, refresh its aid
        # and tags; otherwise match on aid and backfill the bvid.
        if db.video.find_one({'bvid': item['bvid']},
                             {'bvid': 1}) is not None:
            db.video.update_one(
                {'bvid': item['bvid']},
                {'$set': {
                    'aid': item['aid'],
                    'tag': item['tag_list']
                }},
                upsert=True)
        else:
            db.video.update_one(
                {'aid': item['aid']},
                {'$set': {
                    'bvid': item['bvid'],
                    'tag': item['tag_list']
                }},
                upsert=True)
        return item


s = BiliobTagSpider("标签爬虫")  # "标签爬虫" = "tag spider"

sc = SimpyderConfig()
sc.PARSE_THREAD_NUMER = 8
sc.LOG_LEVEL = "INFO"
sc.USER_AGENT = FAKE_UA
s.set_config(sc)
if __name__ == "__main__":
    s.run()
Example #4
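A Zhihu hot-search spider: parse extracts titles and heat values from the hot list, and save writes one MongoDB document per entry.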
    date = datetime.utcnow()
    items = []
    for d in data:
      try:
        title = d.xpath('div[@class="HotList-itemTitle"]/text()')[0]
        value = int(d.xpath(
            'div[@class="HotList-itemMetrics"]/text()')[0].rstrip('万热度'))  # strip the "万热度" ("x10k heat") suffix
        items.append([title, value, date])
      except Exception as e:
        self.logger.exception(e)
    return items

  def save(self, item):
    # item is the list of [title, value, date] rows built in parse().
    for e in item:
      db.zhihu.hot.insert_one({
          'title': e[0],
          'value': e[1],
          'date': e[2]
      })
    return item


if __name__ == "__main__":
  s = HotSearchSpider("知乎热搜")  # "知乎热搜" = "Zhihu hot search"
  sc = SimpyderConfig()
  sc.USER_AGENT = FAKE_UA
  sc.DOWNLOAD_INTERVAL = 600  # poll every 10 minutes
  sc.PARSE_THREAD_NUMER = 1
  s.set_config(sc)
  s.run()
Example #5
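The tail of a video spider's save method: it $sets the video's current stats and $pushes the newest data point onto the front of its data array.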
            'cReply': item['current_reply'],
            'cJannchie': item['current_jannchie'],
            'cDatetime': item['current_datetime'],
            'author': item['author'],
            'subChannel': item['subChannel'],
            'channel': item['channel'],
            'mid': item['mid'],
            'pic': item['pic'],
            'title': item['title'],
            'datetime': datetime.datetime.fromtimestamp(
                item['datetime'])
        },
        '$push': {
            'data': {
                '$each': [item['data']],
                '$position': 0  # prepend the newest data point to the front of the array
            }
        }
    }, True)
    if 'object_id' in item:
      self.sentCallBack(item['object_id'], db['user_record'])
    return item


if __name__ == "__main__":
  s = BiliobVideoSpider("biliob-video-spider")
  sc = SimpyderConfig()
  sc.USER_AGENT = FAKE_UA
  s.set_config(sc)
  s.run()
Example #6
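A functional-style Zhihu spider assembled from get_url/parse/save callbacks, writing rows to a local CSV file.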
      data.append({
          'date': datetime.datetime.now().strftime("%Y-%m-%d %H:%M"),
          'title': title,
          'value': int(point)
      })
    except Exception as e:
      print('[ERROR] {}: {}'.format(title, e))
  return data


# Write rows straight to a CSV file; note that no header row is emitted.
f = csv.DictWriter(open('./zhihu.csv', 'w', encoding='utf-8-sig', newline=''),
                   fieldnames=['date', 'title', 'value'])


def save(items):
  for item in items:
    f.writerow(item)


s = Spider()
s.assemble(get_url, parse, save)

sc = SimpyderConfig()
sc.PARSE_THREAD_NUMER = 1
sc.COOKIE = cookie
sc.USER_AGENT = FAKE_UA
s.set_config(sc)
s.run()
Example #7
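A complete minimal spider: it polls Bilibili's online-count API every 10 seconds and stores each JSON payload in MongoDB with a UTC+8 timestamp.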
import datetime
from db import db
from simpyder import Spider, FAKE_UA, SimpyderConfig


class SiteInfoSpider(Spider):
    def gen_url(self):
        yield 'https://api.bilibili.com/x/web-interface/online'

    def parse(self, res):
        return res.json()['data']

    def save(self, item):
        # Timestamp in China Standard Time (UTC+8) before persisting.
        item['datetime'] = datetime.datetime.utcnow() + datetime.timedelta(
            hours=8)
        db.site_info.insert_one(item)
        return item


if __name__ == "__main__":
    s = SiteInfoSpider("site-info")
    sc = SimpyderConfig()
    sc.PARSE_THREAD_NUMER = 1
    sc.DOWNLOAD_INTERVAL = 10
    sc.LOG_LEVEL = "DEBUG"
    sc.USER_AGENT = FAKE_UA
    sc.COOKIE = ''
    s.set_config(sc)
    s.run()
Example #8
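The library's demo spider: it crawls the titles of the first 100 Bilibili AV pages using the functional Spider API.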
'''
A demo: crawl the page titles of Bilibili videos with AV numbers below 100.
'''

from simpyder import Spider
from simpyder import SimpyderConfig


def gen_url():
    for each_id in range(100):
        yield "https://www.bilibili.com/video/av{}".format(each_id)


def parse(response):
    # simpyder hands parse() an lxml-parsed page, so xpath() is available.
    return response.xpath('//meta[@name="title"]/@content')[0]


def save(item):
    print(item)


if __name__ == "__main__":
    s1 = Spider("BILIBILI TITLE SPIDER", gen_url, parse, save)
    sc = SimpyderConfig()
    sc.COOKIE = "example:value;"
    sc.USER_AGENT = "my user agent"
    s1.set_config(sc)
    s1.run()