Ejemplo n.º 1
0
 def start_requests(self) -> scrapy.Request:
     """Yield one request per keyword found in ``self.todo_list``.

     ``self.init_index()`` is called once before the first request is
     built.  Every request targets ``self.enytry_url`` formatted with the
     URL-quoted keyword, is routed to ``self.parse`` and carries the raw
     keyword under ``meta['key']``.
     """
     # Init saved index and next index sequence
     self.init_index()
     for keyword in self.todo_list:
         # Quote the keyword so it can be embedded in the URL template
         target_url = self.enytry_url.format(key=quote(keyword))
         yield scrapy.Request(target_url,
                              callback=self.parse,
                              meta={'key': keyword})
     # Runs once the generator has been fully consumed
     logger.info('All weibo data has been collected.')
Ejemplo n.º 2
0
 def process_weibo(item: WeiboItem) -> None:
     """Write a Weibo item to ``<WEIBO_DIR>/<name>.txt`` as indented JSON.

     The entries of ``item['weibo_list']`` are converted to plain ``dict``
     objects in place (the passed item is mutated) before the whole item
     is serialised.

     Args:
         item: Item exposing at least the ``'name'`` and ``'weibo_list'``
             keys.

     Raises:
         OSError: If the output directory or file cannot be created.
     """
     name = item['name']
     item['weibo_list'] = [dict(weibo) for weibo in item['weibo_list']]
     # exist_ok removes the check-then-create race and also creates any
     # missing parent directories
     os.makedirs(WEIBO_DIR, exist_ok=True)
     filename = os.path.join(WEIBO_DIR, '{}.txt'.format(name))
     # ensure_ascii=False emits non-ASCII characters verbatim, so the file
     # encoding is pinned rather than left to the platform default
     with open(filename, 'w', encoding='utf-8') as f:
         json.dump(dict(item),
                   f,
                   ensure_ascii=False,
                   separators=(',', ':'),
                   indent=4)
     logger.info('Weibo items about {} has been fetched.'.format(name))
Ejemplo n.º 3
0
 def process_bilibili(item: BilibiliItem) -> None:
     """Write a Bilibili item to ``<BILIBILI_DIR>/<name>.txt`` as indented JSON.

     The entries of ``item['video_list']`` are converted to plain ``dict``
     objects in place (the passed item is mutated) before the whole item
     is serialised.

     Args:
         item: Item exposing at least the ``'name'`` and ``'video_list'``
             keys.

     Raises:
         OSError: If the output directory or file cannot be created.
     """
     name = item['name']
     item['video_list'] = [dict(video) for video in item['video_list']]
     # exist_ok removes the check-then-create race and also creates any
     # missing parent directories
     os.makedirs(BILIBILI_DIR, exist_ok=True)
     filename = os.path.join(BILIBILI_DIR, '{}.txt'.format(name))
     # ensure_ascii=False emits non-ASCII characters verbatim, so the file
     # encoding is pinned rather than left to the platform default
     with open(filename, 'w', encoding='utf-8') as f:
         json.dump(dict(item),
                   f,
                   ensure_ascii=False,
                   separators=(',', ':'),
                   indent=4)
     logger.info('Bilibili videos about {} has been fetched.'.format(name))
Ejemplo n.º 4
0
 def process_jiki(item: JikiItem) -> None:
     """Write a Jiki item to ``<JIKI_DIR>/<index>_<name>.txt`` as indented JSON.

     Args:
         item: Item exposing at least the ``'name'`` and ``'index'`` keys.

     Raises:
         OSError: If the output directory or file cannot be created.
     """
     name = item['name']
     index = item['index']
     # exist_ok removes the check-then-create race and also creates any
     # missing parent directories
     os.makedirs(JIKI_DIR, exist_ok=True)
     filename = os.path.join(JIKI_DIR, '{}_{}.txt'.format(index, name))
     # ensure_ascii=False emits non-ASCII characters verbatim, so the file
     # encoding is pinned rather than left to the platform default
     with open(filename, 'w', encoding='utf-8') as f:
         json.dump(dict(item),
                   f,
                   ensure_ascii=False,
                   separators=(',', ': '),
                   indent=4)
     logger.info('Jiki entry {} has been fetched.'.format(name))
Ejemplo n.º 5
0
 def __init__(self):
     """Create the Chrome driver and the wait helper used by this object.

     The browser is started headless unless ``SHOW_WINDOW`` is truthy.
     The driver is stored on ``self.driver``, a ``WebDriverWait`` bound to
     it on ``self.wait``, and ``self.like_rate`` is set to 0.4.
     """
     chrome_opts = Options()
     if not SHOW_WINDOW:
         # Set window invisible
         chrome_opts.add_argument('--headless')
     browser = webdriver.Chrome(executable_path=DRIVER_PATH,
                                chrome_options=chrome_opts)

     # Set window position (each coordinate scaled by random()) and size
     pos_x = 100 * random()
     pos_y = 100 * random()
     browser.set_window_position(pos_x, pos_y)
     browser.set_window_size(250, 150)

     # Set timeout parameters
     browser.set_page_load_timeout(DOWNLOAD_TIMEOUT)

     self.driver = browser
     self.wait = WebDriverWait(
         browser,
         timeout=DOWNLOAD_TIMEOUT,
         poll_frequency=POLL_FREQUENCY,
     )
     # Possibility of clicking `like` button
     self.like_rate = 0.4
     logger.info('Selenium driver is starting...')
Ejemplo n.º 6
0
 def close(self, spider, reason):
     """Log ``reason`` and pickle ``self.saved_dict`` to ``JIKI_INDEX_FILE``.

     ``spider`` is accepted but not used by this method.
     """
     logger.info(reason)
     # Dump save record before close spider; pickle needs a binary handle
     with open(JIKI_INDEX_FILE, mode='wb') as index_file:
         pickle.dump(self.saved_dict, index_file)
Ejemplo n.º 7
0
 def close(self, spider, reason):
     """Log ``reason`` and pickle ``self.saved_dict`` to ``BILIBILI_INDEX_FILE``.

     ``spider`` is accepted but not used by this method.
     """
     logger.info(reason)
     # Persist the save record; pickle needs a binary handle
     with open(BILIBILI_INDEX_FILE, mode='wb') as index_file:
         pickle.dump(self.saved_dict, index_file)