def test_importcheck_thread_safety(self, datapath):
    """Concurrent read_html calls must not race on the lazy parser-import
    check in pandas.io.html (regression test, see gh-16928).

    Parameters
    ----------
    datapath : callable
        pytest fixture resolving paths to test data files.
    """
    # see gh-16928

    class ErrorThread(threading.Thread):
        # Thread subclass that captures any exception raised by the
        # target on ``self.err`` instead of letting it die silently,
        # so the main thread can assert on it afterwards.
        def run(self):
            try:
                super(ErrorThread, self).run()
            except Exception as e:
                self.err = e
            else:
                self.err = None

    # force import check by reinitalising global vars in html.py
    reload(pandas.io.html)

    filename = datapath('io', 'data', 'valid_markup.html')
    helper_thread1 = ErrorThread(target=self.read_html, args=(filename,))
    helper_thread2 = ErrorThread(target=self.read_html, args=(filename,))

    helper_thread1.start()
    helper_thread2.start()
    # Fix: join() blocks until each thread finishes; the original
    # ``while t1.is_alive() or t2.is_alive(): pass`` busy-waited,
    # burning a CPU core for the whole duration of both reads.
    helper_thread1.join()
    helper_thread2.join()

    # Neither worker thread may have raised.
    assert None is helper_thread1.err is helper_thread2.err
import sys import time import urllib import numpy as np from bs4 import BeautifulSoup from openpyxl import Workbook from pandas.compat import reload reload(sys) # Some User Agents hds = [{'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}, \ { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'}, \ {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'}] def book_spider(book_tag): page_num = 0 book_list = [] try_times = 0 while (1): url = 'http://www.douban.com/tag/' + urllib.request.quote( book_tag) + '/book?start=' + str(page_num * 15) #time.sleep(np.random.rand() * 5) # Last Version try: req = urllib.request.Request(url, headers=hds[page_num % len(hds)]) source_code = urllib.request.urlopen(req).read()