def install_ampl(filename, **kwargs):
    if installed('ampl'):
        return
    dir = filename.replace('.tgz', '')
    url = 'http://ampl.com/demo/' + filename
    install_dir = kwargs.get('install_dir', opt_dir)
    with Downloader(kwargs.get('download_dir', '.')).download(url) as f:
        with closing(tarfile.open(f, 'r:gz')) as archive:
            archive.extractall(install_dir)
    add_to_path(os.path.join(install_dir, dir, 'ampl'))
    add_to_path(os.path.join(install_dir, dir, 'ampl.lic'))
def search(keyword):
    D = Downloader()
    url = 'https://www.google.com/search?q=' + urllib.quote_plus(keyword)
    html = D(url)
    tree = lxml.html.fromstring(html)
    links = []
    for result in tree.cssselect('h3.r a'):
        link = result.get('href')
        qs = urlparse.urlparse(link).query
        links.extend(urlparse.parse_qs(qs).get('q', []))
    return links
def install_maven(**kwargs):
    if installed('mvn'):
        return
    # 3.2.5 is the most recent version of Maven compatible with Java 6.
    dir = 'apache-maven-3.2.5'
    url = 'http://mirrors.sonic.net/apache/maven/maven-3/3.2.5/binaries/{0}-bin.tar.gz'.format(dir)
    install_dir = kwargs.get('install_dir', opt_dir)
    with Downloader(kwargs.get('download_dir', '.')).download(url) as f:
        with closing(tarfile.open(f, 'r:gz')) as archive:
            archive.extractall(install_dir)
    add_to_path(os.path.join(install_dir, dir, 'bin', 'mvn'))
def threaded_crawler(seed_url, delay=5, user_agent='wswp', proxies=None, num_retries=1,
                     max_threads=10, timeout=60, scrape_callback=None, cache=None):
    """Crawl this website in multiple threads
    """
    #crawl_queue = Queue.deque([seed_url])
    crawl_queue = [seed_url]
    seen = set([seed_url])
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent,
                   proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            print link
                            if link not in seen:
                                seen.add(link)
                                crawl_queue.append(link)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1,
                 headers=None, user_agent='wswp', proxies=None, num_retries=1,
                 scrape_callback=None, cache=None):
    crawl_queue = Queue.deque([seed_url])
    seen = {seed_url: 0}
    num_urls = 0
    # rp = get_robots(seed_url)
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies,
                   num_retries=num_retries, cache=cache)
    thrtl = throttle.Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent

    while crawl_queue:
        url = crawl_queue.pop()
        if True:  # rp.can_fetch(user_agent, url):
            print url
            thrtl.wait(url)
            html = D(url)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            depth = seen[url]
            if depth != max_depth:
                if link_regex:
                    links.extend(link for link in get_links(html)
                                 if re.match(link_regex, link))
                for link in links:
                    link = normalize(seed_url, link)
                    if link not in seen:
                        seen[link] = depth + 1
                        # if same_domain(seed_url, link):
                        crawl_queue.append(link)
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url
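# Usage sketch (not part of the original code): crawls a hypothetical site with
# the link_crawler above, following only links whose paths match the regex.
# The seed URL and regex are illustrative; Downloader, throttle, get_links and
# normalize are assumed to be provided by the surrounding module.
if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', link_regex='/(index|view)',
                 delay=3, max_depth=2, user_agent='wswp')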
def test_rule(url, selector=''):
    download = Downloader()
    html1 = download.get(url)
    #print html1
    text1 = process_selector(selector, html1.text)
    md51 = md5(text1.encode('utf-8'))
    html2 = download.get(url)
    text2 = process_selector(selector, html2.text)
    md52 = md5(text2.encode('utf-8'))
    if md51 == md52:
        print 'md5 is same'
    else:
        print md51, md52
def download(url='', title='', artist='', gender='', album=''):
    cleanMp3s()
    url = request.form['url']
    title = request.form['title']
    artist = request.form['artist']
    gender = request.form['gender']
    album = request.form['album']
    downloader = Downloader(url, title, artist, gender, album)
    try:
        path = downloader.download()
    except IOError as e:
        return str(e)
    return send_from_directory(os.path.abspath('.'), path, as_attachment=True)
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp',
                     proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl this website in multiple threads
    """
    # the queue of URL's that still need to be crawled
    #crawl_queue = Queue.deque([seed_url])
    crawl_queue = [seed_url]
    # the URL's that have been seen
    seen = set([seed_url])
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent,
                   proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                # crawl queue is empty
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            # check whether already crawled this link
                            if link not in seen:
                                seen.add(link)
                                # add this new link to queue
                                crawl_queue.append(link)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        # the crawl is still active
        for thread in threads:
            if not thread.is_alive():
                # remove the stopped threads
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so main thread can exit when receives ctrl-c
            thread.start()
            threads.append(thread)
        # all threads have been processed
        # sleep temporarily so CPU can focus execution on other threads
        time.sleep(SLEEP_TIME)
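# Usage sketch (not part of the original code): runs the threaded crawler above
# with a hypothetical callback that counts downloaded pages. The seed URL is
# illustrative, and SLEEP_TIME and normalize are assumed to be module globals.
if __name__ == '__main__':
    def count_pages(url, html):
        count_pages.total += 1
        return []  # return no extra links to crawl
    count_pages.total = 0

    threaded_crawler('http://example.webscraping.com', delay=3, max_threads=5,
                     scrape_callback=count_pages)
    print 'Downloaded {} pages'.format(count_pages.total)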
def get_data(url):
    print(url)
    down = Downloader(headers=headers_home)
    path = 'cache/hz.meituan.com/index.html'
    if os.path.exists(path):
        os.remove(path)
    uuid = get_uuid('http://hz.meituan.com/', down)
    if not uuid:
        return
    data = {}
    type_ = 'c' + url.split('/c')[-1][:-1]
    print(type_)
    cateId = type_[1:]
    areaId = '-1'
    # print(cateId, areaId)
    data['FIRST_LEVEL_DIRECTORY'] = '生活服务'
    data['SECOND_LEVEL_DIRECTORY'] = class_[type_]
    down.headers = headers_get
    index = 0
    while True:
        index = index + 1
        down.headers['Referer'] = url + '/' + 'pn' + str(index) + '/'
        url_get = ('http://apimobile.meituan.com/group/v4/poi/pcsearch/50?uuid=' + uuid +
                   '&userid=-1&limit=32&offset=' + str((index - 1) * 32) +
                   '&cateId=' + cateId + '&areaId=' + areaId)
        html = down(url_get)
        try:
            search_result = json.loads(html)['data']['searchResult']
        except Exception as e:
            print('in get_data error ', e)
            break  # stop paging if the response cannot be parsed
        if search_result == []:
            print('search_result is None')
            break
        # print(search_result)
        for one_item in search_result:
            data['SHOP_ID'] = one_item['id']
            data['SHOP_PHOTOS'] = one_item['imageUrl']
            data['SHOP_NAME'] = one_item['title']
            data['ADDRESS'] = one_item['address']
            data['RANK_STARS'] = one_item['avgscore']
            data['AVG_PRICE_TITLE'] = one_item['avgprice']
            tuangou = one_item['deals']
            if not tuangou:
                data['GROUP_BUYING_NUMBER'] = 0
                data['GROUP_BUYING'] = None
            else:
                data['GROUP_BUYING_NUMBER'] = len(tuangou)
                taocan = ''
                for one in tuangou:
                    taocan = taocan + '价格' + str(one['price']) + ' 门市价' + str(one['value']) + ' 出售' + str(one['sales'])
                data['GROUP_BUYING'] = taocan
            db.insert_into(data)
def search(keyword):
    """ Google search for a keyword. """
    D = Downloader()
    url = "https://www.google.com/search?q=" + ul.quote_plus(keyword)
    html = D(url)
    tree = lxml.html.fromstring(html)
    links = []
    for result in tree.cssselect("h3.r a"):
        link = result.get("href")
        qs = ulp.urlparse(link).query
        links.extend(ulp.parse_qs(qs).get("q", []))
    return links
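# Usage sketch (not part of the original code): prints result links for a
# hypothetical query. ul and ulp above are assumed to be this module's
# aliases for urllib.parse.
if __name__ == '__main__':
    for link in search("python downloader"):
        print(link)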
def downloadlink(url='', title='', artist='', gender='', album=''):
    cleanMp3s()
    url = request.form['url']
    title = request.form['title']
    artist = request.form['artist']
    gender = request.form['gender']
    album = request.form['album']
    downloader = Downloader(url, title, artist, gender, album)
    path = downloader.download()
    dir = 'files/'
    if not os.path.exists(dir):
        os.makedirs(dir)
    newpath = dir + path
    os.rename(path, newpath)
    return '<a href="/' + newpath + '">' + newpath + '</a>'
def down_info_by_id(one_id=None):
    if not one_id:
        return None
    data = {}
    down = Downloader(headers=headers_home)
    id = one_id['SHOP_ID']
    sql = 'update crawler.mt_meishi set LABEL_IS_CCRAWLED = 2 where SHOP_ID = ' + id
    db.update_data(sql)
    url = HOMEURL + id + '/'
    uuid, data['TELEPHONE'], data['BUSINESS_TIME'] = get_uuid_phone_openTime_wifi(url, down)
    if uuid:
        data['REVIEW_COUNT'], data['NETIZEN_EVALUTION'] = get_review(uuid, id, url, down)
        if data['NETIZEN_EVALUTION'] == None:
            return
        limit = ''' '''
        for key, value in data.items():
            if data[key] != None:
                if type(data[key]) == int:
                    limit = limit + str(key) + "=" + str(data[key]) + ","
                else:
                    limit = limit + str(key) + "=" + "'" + data[key] + "'" + ","
        limit = limit[:-1]
        sql = 'update crawler.mt_meishi set ' + limit + ' where SHOP_ID = ' + id
        db.update_data(sql)
    else:
        print('uuid is None')
        return
    limit = ''
    sql = ''
    now_time = datetime.now()
    now_time = str(now_time)
    now_time = now_time.split('.')[0]
    data['UPDATE_TIME'] = now_time
    data['LABEL_IS_CCRAWLED'] = 1
    try:
        for key, value in data.items():
            if data[key] != None:
                if type(data[key]) == int:
                    limit = limit + str(key) + "=" + str(data[key]) + ","
                else:
                    limit = limit + str(key) + "=" + "'" + data[key] + "'" + ","
        limit = limit[:-1]
        sql = 'update crawler.mt_meishi set ' + limit + ' where SHOP_ID = ' + id
        db.update_data(sql)
    except Exception as e:
        print(e)
        pass
def main(reparse=False):
    """Main entry point for this ETL process.

    Downloads, updates db, stores the nightly data.
    This is the binary to run from a cron job.
    """
    os.chdir(os.path.dirname(__file__))
    logger = log.logger()
    logger.info('Starting ETL of FBO Nightly data.')

    # Figure out where we put data
    datadir = get_datadir()
    dbdir = get_dbdir()
    if not os.path.exists(os.path.join(dbdir, "sqlite3")):
        os.makedirs(os.path.join(dbdir, "sqlite3"))

    # Get a database connection, create db if needed
    db = model.FBO("development", db_conf_file=os.path.join(dbdir, "dbconf.yml"))

    # Make sure the db schema is up to date, create tables, etc.
    db.migrate()

    assert os.path.exists(datadir)

    # Download raw data files
    dloader = Downloader(datadir, db, 'nightly')
    dloader.download(fname_urls, True)

    # Do our ETL
    nights = Nightlies(db)
    nights.etl_from_dir(reparse=reparse)

    # Close the db connection
    db.close()
    logger.info('Finished ETL of FBO data.')
def install_cmake(package, **kwargs):
    if kwargs.get('check_installed', True) and installed('cmake'):
        return
    dir, version, minor = re.match(
        r'(cmake-(\d+\.\d+)\.(\d+).*-[^\.]+)\..*', package).groups()
    # extractall overwrites existing files, so no need to prepare the
    # destination.
    url = 'https://cmake.org/files/v{0}/{1}'.format(version, package)
    install_dir = kwargs.get('install_dir', opt_dir)
    with Downloader(kwargs.get('download_dir', '.')).download(url) as f:
        iszip = package.endswith('zip')
        with zipfile.ZipFile(f) if iszip \
                else closing(tarfile.open(f, 'r:gz')) as archive:
            archive.extractall(install_dir)
    dir = os.path.join(install_dir, dir)
    if platform.system() == 'Darwin':
        dir = glob.glob(os.path.join(dir, 'CMake*.app', 'Contents'))[0]
    cmake_path = os.path.join(dir, 'bin', 'cmake')
    if install_dir != '.':
        add_to_path(cmake_path)
    return cmake_path
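# Usage sketch (hypothetical, not from the original script): installs a CMake
# binary package under /opt and returns the path to the cmake executable. The
# package name must match the regex used above; installed(), opt_dir and
# add_to_path() are assumed to be defined elsewhere in this module.
cmake_path = install_cmake('cmake-3.5.2-Linux-x86_64.tar.gz', install_dir='/opt')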
def __init__(self, src_name, inst_name, index_file=None, t_start=None, t_end=None,
             wavl_min=None, wavl_max=None, data_dir=''):
    # Set instrument properties
    self.source = src_name
    self.instrument = inst_name

    # Check the keyword parameters
    if index_file != None:  # Check if the file_list parameter is specified
        # Read index file into a list with each line as a separate element/filename
        with open(index_file) as f:
            file_list = f.readlines()
        file_list = [x.strip() for x in file_list]  # Strip off newlines, and trailing and leading whitespace
        self.import_data(file_list)  # If so, we are free to import data
    elif ((t_start != None) and (t_end != None) and (wavl_min != None) and (wavl_max != None)):
        # If not, we need to grab the data from the VSO
        # Find the available files for download using the Virtual Solar Observatory
        c = vso.VSOClient()  # Initialize Sunpy VSO client
        #qr = c.query(vso.vso.attrs.Time(t_start, t_end), vso.vso.attrs.Instrument(self.instrument), vso.vso.attrs.Wave(wavl_min * u.AA, wavl_max * u.AA))
        #qr = c.query(vso.vso.attrs.Time(t_start, t_end), vso.vso.attrs.Instrument(self.instrument))
        qr = c.query_legacy(tstart=t_start, tend=t_end, instrument=self.instrument,
                            min_wave=wavl_min, max_wave=wavl_max,
                            unit_wave='Angstrom')  # Query the VSO for files
        print(qr)  # Print the query

        # Download the files returned by the query
        dw = Downloader()  # Initialize custom downloader class
        r = c.get(qr, path=data_dir + '/{source}/{instrument}/{file}').wait()
        print(r)

        # Import the data
        # self.import_data(file_list)
    else:
        # Invalid keyword combination
        print('Incorrect keyword specification')
def process(rules):
    for rule in rules:
        download = Downloader()
        html = download.get(rule.url)
        if html == None:
            logger.error('%s is unreachable' % rule.corp)
            continue
        elif rule.selector:
            text = process_selector(rule, html.text)
        elif rule.types == 'github':
            rule.selector = "div.commit-group-title"
            text = process_selector(rule, html.text)
        else:
            text = html.text
        if text == None:
            continue
        hash_list = dataConfig.hash_list()
        html_md5 = md5(text.encode('utf-8'))  # text is unicode, so encode before hashing
        if debug:
            print 'html:', text[:20]
            print 'hash_list:', hash_list
            print 'html_md5', html_md5
        if len(hash_list) > 0:
            if rule.corp in hash_list.keys():
                if html_md5 == hash_list[rule.corp]:
                    logger.info('%s no change' % rule.corp)
                else:
                    # The hash changed, so there is an update: send an email notification
                    logger.warning('%s has update' % rule.corp)
                    dataConfig.update_hash(rule.corp, html_md5)
                    context = '<a href={0}>{0}</a>'.format(rule.url)
                    Notification(rule.message).notification(context)
            else:
                # This corp is not tracked yet, so add its hash
                logger.info('Adding new monitored app: %s' % rule.corp)
                dataConfig.add_hash(rule.corp, html_md5)
        else:
            # The hash list is empty, so initialize it first
            logger.info('wam init ....')
            dataConfig.add_hash(rule.corp, html_md5)
def init_connection(self):
    try:
        self.vk_session = vk_api.VkApi(login=os.getenv("LOGIN"), password=os.getenv("PASSW"))
        try:
            self.vk_session.auth(token_only=True)
        except vk_api.AuthError as e:
            print(e)
            sys.exit(0)
        except vk_api.exceptions.Captcha as e:
            print("CAPTCHA")
            print(e.get_url())
            code = input()
            e.try_again(key=code)
        print("ID:", os.getpid())
        print("Got VK API Session")
        self.group_session = vk_api.VkApi(token=os.getenv("KEY"))
        print("Got Group Session")
        self.longpoll = VkBotLongPoll(self.group_session, os.getenv("GROUP_ID"))
        print("Got Longpoll Object")
        self.api = self.vk_session.get_api()
        print("Got API Object")
        self.group_api = self.group_session.get_api()
        print("Got Group API Object")
        self.upload = vk_api.VkUpload(self.vk_session)
        print("Got Upload Object")
        self.loader = Downloader()
        print("Got Downloader Object")
    except (requests.exceptions.ConnectionError) as e:
        print("Reinitializing session data")
        print(e)
        print("Timeout:", self.timeout)
        time.sleep(self.timeout)
        self.timeout += 1
        self.init_connection()
# 'ci':'50',
# ' _lxsdk_s': '1634d0b9358-04-0ab-c9c%7C%7C30'
# }
# user_agent_list = [
#     'Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50',
#     'Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50',
#     'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0',
#     'Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.0;Trident/4.0)',
#     'Mozilla/5.0(Macintosh;IntelMacOSX10.6;rv:2.0.1)Gecko/20100101Firefox/4.0.1',
#     'Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1',
#     'Opera/9.80(Macintosh;IntelMacOSX10.6.8;U;en)Presto/2.8.131Version/11.11'
# ]

db = Database()
down = Downloader(headers=headers_shop, cache=None)
sql = "SELECT SHOP_ID, SECOND_LEVEL_DIRECTORY FROM crawler.mt_meishi where LABEL_IS_CCRAWLED = 0 and FIRST_LEVEL_DIRECTORY = '休闲娱乐' limit 50;"


def update_shop_by_id(data):
    sql_hand = 'update crawler.mt_meishi set '
    sql_end = ' where SHOP_ID = ' + str(data['SHOP_ID']) + ';'
    sql_body = ''
    for key, value in data.items():
        if data[key] == None:
            continue
        elif type(data[key]) == int:
            sql_body = sql_body + key + ' = ' + str(data[key]) + ', '
        else:
            sql_body = sql_body + key + ' = ' + "'" + str(data[key]) + "'" + ', '
import os
from download import Downloader
from uncompress import Uncompresser
from makergb import MakeRGB
from makepreview import MakePreview

if __name__ == '__main__':
    with open("creds.txt", "r") as f:
        lines = f.readlines()
        username = lines[0].strip()
        password = lines[1].strip()

    # create tool instances
    dler = Downloader(username=username, password=password, DEBUG=True)
    uncomp = Uncompresser(DEBUG=True)
    rgb = MakeRGB(DEBUG=True)
    prev = MakePreview(DEBUG=True)

    # create list of known images
    #LC80130312013273LGN00
    prefix = 'LC8012031'
    #postfix = 'LGN01'
    images = [
        '2014077LGN00',
        '2014061LGN01',
        '2014045LGN00',
        '2014029LGN00',
        '2014013LGN00',
        '2013362LGN00',
# -*- coding: utf-8 -*-
# __author__ = 'admin'
import urlparse
import itertools
from download import Downloader
import re

D = Downloader()


class Crawler(object):
    def ID_crawler(self, start_url, max_depth=15, max_errors=3):
        global html
        depth = 1
        num_errors = 0
        for page in itertools.count(1):  # iterate over page numbers indefinitely
            page_url = start_url + '/%d.html' % page
            if depth != max_depth:
                depth = depth + 1
                html = D(page_url).text
                if html is None:
                    num_errors = num_errors + 1
                    if num_errors == max_errors:
                        print 'It is full of Errors.'
                        break
                else:
                    num_errors = 0
            else:
def setUp(self):
    self.downloader = Downloader()
def __init__(self, key):
    self.dl = Downloader(key)
    self.currSymbol = ""
    self.currData = []
import sys
from PyQt5 import QtGui, QtQuick, QtWidgets, QtCore
from download import Downloader

downloader = Downloader('http://cdimage.debian.org/debian-cd/8.4.0/amd64/iso-cd/debian-8.4.0-amd64-netinst.iso')

app = QtWidgets.QApplication(sys.argv)
view = QtQuick.QQuickView()
view.rootContext().setContextProperty('downloader', downloader)
view.setSource(QtCore.QUrl("download.qml"))
view.show()
app.exec_()
import os
from subprocess import check_call

build = os.environ['BUILD']
cmake_command = [
    'cmake', '-DFMT_EXTRA_TESTS=ON', '-DCMAKE_BUILD_TYPE=' + os.environ['CONFIG']
]
build_command = [
    'msbuild', '/m:4', '/p:Config=' + os.environ['CONFIG'], 'FORMAT.sln'
]
test_command = ['msbuild', 'RUN_TESTS.vcxproj']
if build == 'mingw':
    # Install MinGW.
    mingw_url = 'http://ufpr.dl.sourceforge.net/project/mingw-w64/' + \
        'Toolchains%20targetting%20Win64/Personal%20Builds/mingw-builds/' + \
        '4.9.2/threads-win32/seh/x86_64-4.9.2-release-win32-seh-rt_v3-rev1.7z'
    with Downloader().download(mingw_url) as f:
        check_call(['7z', 'x', '-oC:\\', f])
    # Remove path to Git bin directory from $PATH because it breaks MinGW config.
    path = os.environ['PATH'].replace(r'C:\Program Files (x86)\Git\bin', '')
    os.environ['PATH'] = r'C:\Program Files (x86)\MSBUILD\12.0\bin\;' + path + r';C:\mingw64\bin'
    cmake_command.append('-GMinGW Makefiles')
    build_command = ['mingw32-make', '-j4']
    test_command = ['mingw32-make', 'test']

check_call(cmake_command)
check_call(build_command)
def test_download_to_temp_dir(self):
    d = Downloader()
    with util.CaptureStdout():
        with d.download('file://' + __file__) as f:
            filename = f
    self.assertEqual(tempfile.gettempdir(), os.path.dirname(filename))
        ReleaseDate=g['releasetime'],
        WorkExperience=g['workexperience'],
        RecruitingNumbers=g['recruitnumbers'],
        WorkPlace=g['workplace'],
        EducationalRequirements=g['educationalrequirements'],
        JobCategory=g['jobcategory'],
        JobDescription=g['jobdescription'])


if __name__ == '__main__':
    page_max = 90
    pages_url = [
        "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=全国&kw=嵌入式&p=%s" % str(i)
        for i in range(page_max)
    ]
    D = Downloader()  # download HTML
    import requests
    sqlcache = SQLCache()
    sqlqueue = SQLQueue()
    jobscache = JobsCache()
    sqlseen = Seen()
    drop(sqlqueue, sqlseen)
    jobscache.delete_all()
    jobscache = JobsCache()
    urlsx = [extract_urls(D(url)) for url in pages_url]
    for urls in urlsx:
        for url in urls:
            sqlqueue.append(url)
def download(url, cookie=None):
    return Downloader('.').download(url, cookie)
#!/usr/bin/env python
from download import Downloader
from fee import Fee
from database import Fund
import database

if __name__ == '__main__':
    dl = Downloader()
    data = dl.get_fee('0P0000YXTA')
    fee = Fee(data)
    fund = Fund(management=fee.management,
                custodial=fee.custodial,
                distribution=fee.distribution)
    database.session.add(fund)
    database.session.commit()
from __future__ import print_function
import os, re, shutil, tarfile, tempfile
from bootstrap import bootstrap
from contextlib import closing
from download import Downloader
from subprocess import call, check_call, check_output, Popen, PIPE, STDOUT

build = os.environ['BUILD']
if build == 'doc':
    returncode = 1
    travis = 'TRAVIS' in os.environ
    workdir = tempfile.mkdtemp()
    try:
        doxygen_url = 'http://ftp.stack.nl/pub/users/dimitri/doxygen-1.8.10.linux.bin.tar.gz'
        dir = os.path.dirname(os.path.realpath(__file__))
        with Downloader().download(doxygen_url) as f:
            with closing(tarfile.open(f, 'r:gz')) as archive:
                archive.extractall(dir)
        doxygen = os.path.join(dir, 'doxygen-1.8.10/bin/doxygen')
        returncode, repo_dir = __import__('build-docs').build_docs(workdir, doxygen)
        if returncode == 0 and os.environ['TRAVIS_BRANCH'] == 'master':
            # Push docs to GitHub pages if this is a master branch.
            if travis:
                check_call(['git', 'config', '--global', 'user.name', 'amplbot'])
                check_call(['git', 'config', '--global', 'user.email', '*****@*****.**'])
            check_call(['git', 'add', '--all'], cwd=repo_dir)
            if call(['git', 'diff-index', '--quiet', 'HEAD'], cwd=repo_dir):
def down_shop_name(one_class=None):
    down = Downloader(headers=request_headers)
    # http://hz.meituan.com/meishi/api/poi/getPoiList?cityName=%E6%9D%AD%E5%B7%9E&cateId=20004&page=2
    page = 0

    def get_uuid(url):
        text_uuid = down(url)
        # print(text_uuid)
        re_uuid = re.compile(r'"uuid":"(.*?)",', re.IGNORECASE)
        try:
            uuid = re_uuid.findall(text_uuid)[0]
        except Exception as e:
            print('get uuid error: ', e)
            return None
        return uuid

    while True:
        html = ''
        page += 1
        print('*****in*****down_shop_name*****')
        # print(one_class)
        # print('*****one class*****')
        type_code = one_class['DIRECTORY_CODE']
        url_uuid = 'http://hz.meituan.com/meishi/%s/pn%d/' % (type_code, page)
        print(url_uuid)
        down.headers['Referer'] = url_uuid
        down.headers['Accept'] = home_request_headers_accept
        uuid = get_uuid(url_uuid)
        if not uuid:
            break
        type_code = type_code.replace('c', '')
        url = (r'http://hz.meituan.com/meishi/api/poi/getPoiList?uuid=' + uuid +
               r'&platform=1&partner=126&originUrl=' + url_uuid +
               r'&riskLevel=1&optimusCode=1&cityName=%E6%9D%AD%E5%B7%9E&' +
               'cateId=%s&areaId=0&sort=&dinnerCountAttrId=&page=%d&userId=0' % (type_code, page))
        # print(url)
        # url = r'http://hz.meituan.com/meishi/c17/pn2/'
        down.headers['Accept'] = request_headers_accept
        html = down(url)
        # json_html = json.loads(html)
        # print(json_html['status'])
        # print(type(html))
        if len(html) < 100:
            break
        # print(html[:100])
        shop_info_list = extrace(html, type_code)
        threads = []

        def write_data_to_db():
            data = shop_info_list.pop()
            db = Database()
            db.insert_into(data)

        while shop_info_list or threads:
            for thread in threads:
                if not thread.is_alive():
                    threads.remove(thread)
            while len(threads) < 5 and shop_info_list:
                thread = threading.Thread(target=write_data_to_db)
                thread.setDaemon(True)
                thread.start()
                threads.append(thread)
            time.sleep(random.uniform(30, 60))