def download_clicked(self, _=None):
    limit_input = self.limit_entry.get()
    date_input = self.date_entry.get()
    type_input = self.type_dropdown.get()
    content_input = self.content_dropdown.get()
    limit_type = self.limit_text_entry.get()
    folder = self.download_folder_entry.get()
    try:
        if folder:
            folder = str(folder)
            if folder != clean_filename(folder):
                raise ValueError(texts.get('RANK_INVALID_FOLDER_ERROR'))
        else:
            folder = None
        rank_params = self.check_input(limit=limit_input, date=date_input,
                                       rank_type=type_input, content=content_input,
                                       limit_type=limit_type)
        rank_params['folder'] = folder
        download(target=self.pikax_handler.rank, kwargs=rank_params)
    except ValueError as e:
        import sys
        sys.stdout.write(
            texts.get('RANK_ERROR_MESSAGE').format(error_message=str(e)))
def download():
    if downloadtuble.index(downloadcb.get()) == 1:
        section = common.downloadmode1()
    elif downloadtuble.index(downloadcb.get()) == 2:
        section = common.downloadmode2(downloadmode2en.get())
    elif downloadtuble.index(downloadcb.get()) == 3:
        section = common.downloadmode3(downloadmode3en1.get(), downloadmode3en2.get())
    else:
        return
    if section == -1:
        tkinter.messagebox.showerror('错误', '输入的不是数字')  # 'Error', 'Input is not a number'
        return
    elif section == 999:
        tkinter.messagebox.showerror('错误', '数字范围有误')  # 'Error', 'Number out of range'
        return
    if not os.path.isdir(downloadpathstr.get()):
        tkinter.messagebox.showerror('错误', '路径有误')  # 'Error', 'Invalid path'
        return
    downloadlogstr.set('开始下载')  # 'Starting download'
    for i in range(section[0], section[1] + 1):
        downloadlogstr.set('正在下载第' + str(i) + '张CG图片')  # 'Downloading CG image #i'
        if i == 386 or i == 387:
            downloadlogstr.set('正在下载被和谐的图片,下载速度较慢')  # 'Downloading a censored image; this is slower'
        window.update()
        common.download(i, downloadpathstr.get())
        time.sleep(0.1)
    downloadlogstr.set('下载完成')  # 'Download finished'
    tkinter.messagebox.showinfo('提示', '下载完成')  # 'Notice', 'Download finished'
def get_srtm_tile(srtm_tile, out_dir):
    """
    Downloads and extracts an SRTM tile from the internet.

    Args:
        srtm_tile: string following the pattern 'srtm_%02d_%02d',
            identifying the desired SRTM tile
        out_dir: directory where to store and extract the srtm tiles
    """
    # check if the tile is already there
    mkdir_p(out_dir)
    if os.path.exists(os.path.join(out_dir, '%s.tif' % srtm_tile)):
        return

    # download the zip file
    srtm_tile_url = '%s/%s.zip' % (cfg['srtm_url'], srtm_tile)
    zip_path = os.path.join(out_dir, '%s.zip' % srtm_tile)
    common.download(zip_path, srtm_tile_url)

    # extract the tif file
    if zipfile.is_zipfile(zip_path):
        z = zipfile.ZipFile(zip_path, 'r')
        z.extract('%s.tif' % srtm_tile, out_dir)
    else:
        print("%s not available" % srtm_tile)

    # remove the zip file
    os.remove(zip_path)
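# The snippet above relies on a mkdir_p() helper that is not shown in this
# section. A minimal sketch, assuming it mirrors `mkdir -p` semantics (create
# the directory, silently ignoring the case where it already exists):
import errno
import os


def mkdir_p(path):
    """Create `path` like `mkdir -p`: no error if it already exists."""
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise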
def crawl_sitemap():
    """Download the sitemap, extract links by using a regex, then download
    all links.
    """
    sitemap = download(URL)
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    for link in links:
        html = download(link)
def main():
    sys.stdout.write(u'正在努力请求节目单...')  # 'Requesting the programme list...'
    sys.stdout.flush()
    data = common.open_url(list_url)
    if not data:
        return
    menu_list = json.loads(data)['list']
    sys.stdout.write('\r')
    list_format = u'[{title}] by {author} | {player} {min:02}:{sec:02}'
    print u'{0:*^60}'.format(u'悦读FM.倾听文字的声音')  # site banner: 'Yuedu FM. Listen to the voice of words'
    print u'总共%d期.最新10期:' % len(menu_list)  # '%d episodes in total. Latest 10:'
    for i in range(0, 10):
        print i, list_format.format(**menu_list[i])
    print u"\n输入序号下载,以','分开.'q'退出"  # "Enter numbers to download, separated by ','. 'q' to quit"
    while 1:
        usr_input = raw_input('Select(0-%d):' % (len(menu_list) - 1))
        if usr_input == 'q':
            print 'bye!'
            break
        try:
            li = map(int, usr_input.split(','))
        except ValueError:
            print 'Input Error!'
            continue
        for i in li:
            if 0 <= i < len(menu_list):
                common.download(menu_list[i]['mp3'], _TARGET,
                                menu_list[i]['title'], 'mp3',
                                Referer='http://yuedu.fm/')
                article2Html(i, menu_list[i]['title'])
def test(mapper=test_mapper, buffered_size=1024, use_xmap=True, cycle=False):
    '''
    Create flowers test set reader.
    It returns a reader, each sample in the reader is
    image pixels in [0, 1] and label in [1, 102]
    translated from original color image by steps:
    1. resize to 256*256
    2. random crop to 224*224
    3. flatten
    :param mapper: a function to map sample.
    :type mapper: callable
    :param buffered_size: the size of buffer used to process images
    :type buffered_size: int
    :param cycle: whether to cycle through the dataset
    :type cycle: bool
    :return: test data reader
    :rtype: callable
    '''
    return reader_creator(download(DATA_URL, 'flowers', DATA_MD5),
                          download(LABEL_URL, 'flowers', LABEL_MD5),
                          download(SETID_URL, 'flowers', SETID_MD5),
                          TEST_FLAG, mapper, buffered_size, use_xmap,
                          cycle=cycle)
def download_clicked(self, _=None):
    try:
        params = self._get_params()
        download(target=self.pikax_handler.download_by_artist_id, kwargs=params)
    except ValueError as e:
        sys.stdout.write(str(e))
def down(dl_list, minmax):
    for image in dl_list:
        filename = image.split("/")
        filename = filename[-1].split("?")
        stdout.write("\rdownloading {}".format(filename[0]))
        stdout.flush()
        common.download(image, filename[0], minmax)
def main():
    parser = argparse.ArgumentParser(description='download bilibili danmaku')
    parser.add_argument(
        '--id', metavar='ID',
        help='use a BV number, av number or bangumi media id to specify target video')
    parser.add_argument(
        '-t', '--type', metavar='TYPE',
        help='TYPE=BV/av/md')
    parser.add_argument(
        '-o', '--output', metavar='OUTPUT_PATH', default='.',
        help='files will be saved in OUTPUT_PATH, or a new subdirectory in '
             'OUTPUT_PATH, depending on --mkdir argument. OUTPUT_PATH defaults '
             'to the working directory')
    parser.add_argument(
        '--mkdir', action='store_true', default=False,
        help='make new subdirectory in OUTPUT_PATH')
    parser.add_argument(
        '--use-name', action='store_true', default=False,
        help='use video title as the name of subdirectory, video parts and episodes')
    parser.add_argument(
        '--save-info', action='store_true', default=False,
        help='create a json file containing video information')
    args = parser.parse_args()
    if not check_arguments(args):
        sys.exit()

    print('Collecting information')
    info = asyncio.run(getinfo.get(args.type, args.id))
    print('Title:', info.title)
    print('Video number:', info.n)

    if args.use_name:
        subdir = common.escape_filename(info.title)
    else:
        subdir = args.type + args.id
    if args.mkdir:
        target_path = os.path.join(args.output, subdir)
        if not os.path.isdir(target_path):
            os.mkdir(target_path)
    else:
        target_path = args.output

    if args.use_name:
        xml_list = [os.path.join(target_path, common.escape_filename(f'{i + 1}.{title}.xml'))
                    for i, title in enumerate(info.title_list)]
    else:
        xml_list = [os.path.join(target_path, str(_ + 1) + '.xml') for _ in range(info.n)]
    url_list = [common.comments_url(cid) for cid in info.cid_list]

    print('Downloading')
    common.download(url_list, xml_list)

    if args.save_info:
        if args.use_name:
            info_json_filename = common.escape_filename(info.title + '.json')
        else:
            info_json_filename = args.type + args.id + '.json'
        with open(os.path.join(target_path, info_json_filename), 'w', encoding='utf-8') as info_f:
            json.dump(info.save_info, info_f, ensure_ascii=False, indent=4)
    print('Complete.')
def get_srtm_tile(srtm_tile, out_dir):
    """
    Downloads and extracts an SRTM tile from the internet.

    Args:
        srtm_tile: string following the pattern 'srtm_%02d_%02d',
            identifying the desired SRTM tile
        out_dir: directory where to store and extract the srtm tiles
    """
    # check if the tile is already there
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    if os.path.exists(os.path.join(out_dir, '%s.tif' % srtm_tile)):
        return

    # download the zip file
    srtm_tile_url = '%s/%s.zip' % (cfg['srtm_url'], srtm_tile)
    zip_path = os.path.join(out_dir, '%s.zip' % srtm_tile)
    common.download(zip_path, srtm_tile_url)

    # extract the tif file
    if zipfile.is_zipfile(zip_path):
        z = zipfile.ZipFile(zip_path, 'r')
        z.extract('%s.tif' % srtm_tile, out_dir)
    else:
        print "%s not available" % srtm_tile

    # remove the zip file
    os.remove(zip_path)
def search_and_download_clicked(self, _=None):
    try:
        keyword = str(self.keyword_entry.get())
        if not keyword:
            raise ValueError(texts.get('SEARCH_EMPTY_KEYWORD_ERROR'))
        folder_input = str(self.download_folder_entry.get())
        if folder_input != clean_filename(folder_input):
            raise ValueError(texts.get('SEARCH_INVALID_FOLDER_ERROR'))
        folder = folder_input or None
        try:
            limit_input = int(self.limit_entry.get()) if self.limit_entry.get() else None
        except ValueError:
            raise ValueError(texts.get('SEARCH_LIMIT_ERROR'))
        match_input = str(self.match_dropdown.get())
        sort_input = str(self.sort_dropdown.get())
        popularity_input = str(self.popularity_dropdown.get())
        limit_type_input = str(self.limit_text_entry.get())
        params = self.check_inputs(limit_input=limit_input,
                                   match_input=match_input,
                                   sort_input=sort_input,
                                   popularity_input=popularity_input,
                                   limit_type_input=limit_type_input)
    except (TypeError, ValueError) as e:
        sys.stdout.write(
            texts.get('SEARCH_ERROR_MESSAGE').format(error_message=str(e)))
        return
    params['keyword'] = keyword
    params['folder'] = folder
    download(target=self.pikax_handler.search, kwargs=params)
def down(dl_list, minmax):
    for image in dl_list:
        print(image)
        filename = image.split("/")
        filename = filename[-1]
        common.download(image, filename, minmax)
    print("Download complete!")
def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
def download(self, url, filepath):
    # get the video name (original comment: 获取名称)
    name = self.getName(url)
    html = common.getHtml(url)
    m3u8 = self.getM3u8(html)
    common.download(urllib.unquote(m3u8), filepath, name + '.m3u8')
    url = self.URL_PIRFIX + self.getSinavideoUrl(filepath + name + '.m3u8')
    common.download(url, filepath, name + '.mp4')
def get_dict():
    """
    Get the word, verb and label dictionary of Wikipedia corpus.
    """
    word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
    verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
    label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
    return word_dict, verb_dict, label_dict
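# get_dict() above depends on a load_dict() helper that is not included in
# this section. A minimal sketch, assuming the downloaded dictionary file
# holds one token per line and the dictionary maps each token to its line index:
def load_dict(filename):
    """Load a token-to-index dictionary from a one-token-per-line file."""
    d = dict()
    with open(filename, 'r') as f:
        for i, line in enumerate(f):
            d[line.strip()] = i
    return d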
def download_clicked(self):
    self.canvas.itemconfigure(self.output_id, text='')
    user_input = self.id_or_url_input.get(0.0, tk.END)
    search_ids = re.findall(r'(?<!\d)\d{8}(?!\d)', user_input, re.S)
    if search_ids:
        params = {'illust_ids': search_ids}
        download(target=self.pikax_handler.download_by_illust_ids, kwargs=params)
    else:
        sys.stdout.write(texts.get('ILLUSTRATION_NO_ID_FOUND'))
def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    links = re.findall("<loc>(.*?)</loc>", sitemap)
    # download each link
    for link in links:
        html = download(link)
        # scrape html here
        # ...
        print(html)
def download_pic(url):
    print url
    html = common.open_url(url)
    find_re = re.compile(r'<li id.+?<img src="(.+?)"', re.DOTALL)
    img_url = find_re.findall(html)
    print 'Start download %d pics' % len(img_url)
    for url in img_url:
        if url:
            filename, ext = os.path.splitext(os.path.split(url)[-1])
            if not ext:
                ext = '.jpg'
            common.download(url, TARGET, filename, ext[1:], Referer=url)
def download_show(li):
    for num in li:
        if num > 296:
            url = xml_url_1 % num
        else:
            url = xml_url_2 % num
        xml_data = common.open_url(url)
        if xml_data:
            songlist = extract(xml_data)
            target_dir = TARGET % num
            for title, location in songlist:
                ext = location.split('.')[-1]
                common.download(location, target_dir, title, ext, Referer=referer % num)
def main():
    D = download(is_cookie=True)
    if not login(D, userid, passwd, dynamic_passwd):
        # login failed, return
        return
    hr_url = 'https://hr.guosen.com.cn/sso/SsoHrssServlet'
    html = D.get(hr_url)
    login_data = {}
    login_data['IASID'] = common_re(r'"IASID"[^>]+value="([^"]+)"', html)
    login_data['TimeStamp'] = common_re(r'"TimeStamp"[^>]+value="([^"]+)"', html)
    login_data['Authenticator'] = common_re(r'"Authenticator"[^>]+value="([^"]+)"', html)
    login_data['ReturnURL'] = 'https://hr.guosen.com.cn/sso/SsoHrssServlet'
    html = D.post('https://sso.guosen.com.cn/login.aspx', login_data)
    login_data = {}
    login_data['IASID'] = common_re(r'"IASID"[^>]+value="([^"]+)"', html)
    login_data['Result'] = '0'
    login_data['TimeStamp'] = common_re(r'"TimeStamp"[^>]+value="([^"]+)"', html)
    login_data['UserAccount'] = userid
    login_data['ErrorDescription'] = ''
    login_data['Authenticator'] = common_re(r'"Authenticator"[^>]+value="([^"]+)"', html)
    login_data['IASUserAccount'] = userid
    html = D.post(hr_url, login_data)
    end_url = 'https://hr.guosen.com.cn/hrss/ta/Clockin.jsp?_funcode=E0020902'
    D.get(end_url)
    post_url = 'https://hr.guosen.com.cn/hrss/dorado/smartweb2.RPC.d?__rpc=true'
    search_data = {}
    search_data['__type'] = 'loadData'
    search_data['__viewInstanceId'] = 'nc.bs.hrss.ta.Clockin~nc.bs.hrss.ta.ClockinViewModel'
    search_data['__xml'] = get_post_xml(month, year)
    html = D.post(post_url, search_data)
    if 'result succeed="true"' in html:
        print 'Hello world!'
    else:
        print html
from urllib import request, error, parse


def download(url, headers, proxy, num_retries, data=None):
    print('Downloading: %s' % url)
    # build the request without shadowing the urllib.request module
    req = request.Request(url, data, headers)
    opener = request.build_opener()
    if proxy:
        proxy_params = {parse.urlparse(url).scheme: proxy}
        opener.add_handler(request.ProxyHandler(proxy_params))
    try:
        response = opener.open(req)
        html = response.read()
        code = response.code
    except error.URLError as e:
        print('Download error: %s' % e.reason)
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                return download(url, headers, proxy, num_retries - 1, data)
        else:
            code = None
    return html
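# A minimal usage sketch of the download() helper above. The header value and
# example URL are illustrative assumptions (the URL reappears in other
# snippets in this section), not a prescribed configuration.
if __name__ == '__main__':
    headers = {'User-agent': 'wswp'}
    html = download('http://example.webscraping.com', headers,
                    proxy=None, num_retries=2)
    print(len(html) if html else 'download failed')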
def test():
    word_dict, verb_dict, label_dict = get_dict()
    reader = corpus_reader(
        download(DATA_URL, 'conll05st', DATA_MD5),
        words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
        props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
    return reader_creator(reader, word_dict, verb_dict, label_dict)
def get_dateil():
    content = parse_content(url)
    links = [article['link'] for article in content]
    for link in links:
        article_tree = lxml.html.fromstring(download(link))
        article_content = article_tree.cssselect('div#article_content > p')[0]
        print article_content
def extract_article_content(url):
    html = download(url)
    try:
        if html is not None:
            soup = BeautifulSoup(html, 'lxml')
            content_main = soup.select_one('#content-main')
            selectors_to_remove = [
                '.article-function-social-media',
                '.article-icon.spiegelplus',
                '.article-function-box',
                'script',
                'style',
                '#js-article-column > p',
                '#js-article-top-wide-asset',
                '.asset-box',
                '.article-copyright',
                '.article-function-box-wide',
                '.top-anchor',
                '.module-box',
                '.spiegel-asset-box',
                '#spRecommendations',
                '#js-video-slider',
                '.column-both-bottom',
                '#footer'
            ]
            for selector in selectors_to_remove:
                for node in content_main.select(selector):
                    node.decompose()
            for comment in soup.findAll(
                    text=lambda text: isinstance(text, Comment)):
                comment.extract()
            content = re.sub(
                '(\r\n|\n|\t|\s+)', ' ',
                reduce(lambda agg, cur: agg + ' ' + cur,
                       content_main.findAll(text=True)))
            return content
    except Exception:
        print('extraction of {} failed ({})'.format(url, format_exc()))
def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1,
                 headers=None, user_agent='wswp', proxy=None, num_retries=1):
    crawl_queue = deque([seed_url])
    seen = {seed_url: 0}
    num_urls = 0
    rp = get_robots(seed_url)
    throttle = Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent
    while crawl_queue:
        url = crawl_queue.pop()
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, headers, proxy=proxy,
                            num_retries=num_retries).decode('utf-8')
            links = []
            depth = seen[url]
            if depth != max_depth:
                if link_regex:
                    links.extend(link for link in get_links(html)
                                 if re.match(link_regex, link))
                for link in links:
                    link = normalize(seed_url, link)
                    if link not in seen:
                        seen[link] = depth + 1
                        if same_domain(seed_url, link):
                            crawl_queue.append(link)
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print('Blocked by robots.txt:', url)
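# The crawler above rate-limits requests through a Throttle object that is
# not defined in this section. A minimal sketch, assuming it simply enforces
# a per-domain delay between successive downloads:
import time
from urllib.parse import urlparse


class Throttle:
    """Sleep so requests to the same domain are at least `delay` seconds apart."""

    def __init__(self, delay):
        self.delay = delay
        self.domains = {}  # domain -> timestamp of the last request

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (time.time() - last_accessed)
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = time.time()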
def scrape():
    url = 'http://example.webscraping.com/places/default/view/United-Kindom-239'
    html = download(url)
    tree = lxml.html.fromstring(html)
    td = tree.cssselect('tr#places_area__row > td.w2p_fw')[0]
    area = td.text_content()
    return area
def main():
    common.setup()
    usage = 'USAGE: {0} hdfs_src gs_dst'.format(common.script_name())
    if len(sys.argv) != 3:
        print usage
        sys.exit(1)
    src = sys.argv[1]
    if sys.argv[2].startswith('gs://') or not sys.argv[2].startswith('/'):
        print usage
        print ('gs_dst should be of the form /path/to/object. gs://{0} will be '
               'prefixed for you.').format(cfg.gs_bucket)
        sys.exit(1)
    dst = 'gs://{0}{1}'.format(cfg.gs_bucket, sys.argv[2])
    common.download(src, dst)
def main():
    common.setup()
    usage = 'USAGE: {0} hdfs_src gs_dst'.format(common.script_name())
    if len(sys.argv) != 3:
        print usage
        sys.exit(1)
    src = sys.argv[1]
    if sys.argv[2].startswith('gs://') or not sys.argv[2].startswith('/'):
        print usage
        print(
            'gs_dst should be of the form /path/to/object. gs://{0} will be '
            'prefixed for you.').format(cfg.gs_bucket)
        sys.exit(1)
    dst = 'gs://{0}{1}'.format(cfg.gs_bucket, sys.argv[2])
    common.download(src, dst)
def link_crawler(seed_url, link_regex):
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url).decode('utf-8')
        for link in get_links(html):
            if re.match(link_regex, link):
                crawl_queue.append(link)
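# Several crawlers in this section call a get_links() helper that is not
# shown. A minimal sketch, assuming it pulls href values out of anchor tags
# with a regular expression:
import re


def get_links(html):
    """Return a list of links found in the given HTML string."""
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)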
def downloadQtPackge():
    qt_version_dotless = qt_version.replace('.', '')
    base_url = 'https://download.qt.io/online/qtsdkrepository/{}/{}/qt5_{}' \
        .format(os_url, target_platform, qt_version_dotless)
    updates_file = 'Updates-{}-{}.xml'.format(qt_version, os_name)
    c.download(base_url + '/Updates.xml', updates_file)

    updates = ET.parse(updates_file)
    updates_root = updates.getroot()
    all_modules = {}
    for i in updates_root.iter('PackageUpdate'):
        name = i.find('Name').text
        if 'debug' in name or kit_arch not in name:
            continue
        archives = i.find('DownloadableArchives')
        if archives.text is None:
            continue
        archives_parts = archives.text.split(',')
        version = i.find('Version').text
        for archive in archives_parts:
            archive = archive.strip()
            parts = archive.split('-')
            module_name = parts[0]
            all_modules[module_name] = {
                'package': name,
                'file': version + archive
            }

    if len(sys.argv) > 1:  # handle subcommand
        if sys.argv[1] == 'list':
            c.print('Available modules:')
            for k in iter(sorted(all_modules.keys())):
                c.print(k, '---', all_modules[k]['file'])
            exit(0)

    for module in qt_modules_list:
        if module not in all_modules:
            c.print('>> Required module {} not available'.format(module))
            continue
        file_name = all_modules[module]['file']
        package = all_modules[module]['package']
        c.download(base_url + '/' + package + '/' + file_name, file_name)
        c.extract(file_name, '.')
def downOrExplain(self, aurl):
    fileName = aurl.replace('http://', '').replace('/', '_').replace('?', '_') + ".html"
    # print fileName
    # fileName = "m1.html"
    if not os.path.exists(fileName):
        print "down file"
        html = download(aurl)
        self.saveHtml(fileName, html)
    self.html = self.readFile(fileName)
def test():
    global UCI_TEST_DATA
    load_data(download(URL, 'uci_housing', MD5))

    def reader():
        for d in UCI_TEST_DATA:
            yield d[:-1], d[-1:]

    return reader
def iteration():
    for page in itertools.count(1):
        # url = 'http://example.webscraping.com/view/-%d' % page
        url = 'http://example.webscraping.com/view/-{}'.format(page)
        html = download(url)
        if html is None:
            break
        else:
            pass
def download_mingw_tool():
    qt_version_dotless = qt_version.replace('.', '')
    base_url = 'https://download.qt.io/online/qtsdkrepository/windows_x86/desktop/tools_mingw/'
    updates_file = 'Updates-{}-{}-{}.xml'.format(qt_version, os_name, 'qttool')
    c.download(base_url + '/Updates.xml', updates_file)

    updates = ET.parse(updates_file)
    updates_root = updates.getroot()
    all_modules = {}
    for i in updates_root.iter('PackageUpdate'):
        name = i.find('Name').text
        if 'debug' in name or kit_arch not in name:
            continue
        archives = i.find('DownloadableArchives')
        if archives.text is None:
            continue
        c.print(' archives: {}'.format(archives))
        archives_parts = archives.text.split(',')
        version = i.find('Version').text
        c.print(' version: {}'.format(version))
        for archive in archives_parts:
            archive = archive.strip()
            parts = archive.split('-')
            module_name = parts[0]
            all_modules[module_name] = {
                'package': name,
                'file': version + archive
            }

    if len(sys.argv) > 1:  # handle subcommand
        if sys.argv[1] == 'list':
            c.print('Available modules:')
            for k in iter(sorted(all_modules.keys())):
                c.print(k, '---', all_modules[k]['file'])
            exit(0)

    file_name = all_modules[module_name]['file']
    package = all_modules[module_name]['package']
    c.print('download url: {}'.format(base_url + '/' + package + '/' + file_name))
    c.download(base_url + '/' + package + '/' + file_name, file_name)
    c.extract(file_name, '.')
def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex"""
    crawl_queue = [seed_url]  # the queue of URLs to download
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                # add this link to the crawl queue
                crawl_queue.append(link)
def train(mapper=train_mapper, buffered_size=1024, use_xmap=True):
    '''
    Create flowers training set reader.
    It returns a reader, each sample in the reader is
    image pixels in [0, 1] and label in [1, 102]
    translated from original color image by steps:
    1. resize to 256*256
    2. random crop to 224*224
    3. flatten
    :param mapper: a function to map sample.
    :type mapper: callable
    :param buffered_size: the size of buffer used to process images
    :type buffered_size: int
    :return: train data reader
    :rtype: callable
    '''
    return reader_creator(download(DATA_URL, 'flowers', DATA_MD5),
                          download(LABEL_URL, 'flowers', LABEL_MD5),
                          download(SETID_URL, 'flowers', SETID_MD5),
                          TRAIN_FLAG, mapper, buffered_size, use_xmap)
def test100():
    """
    CIFAR-100 test set creator.

    It returns a reader creator, each sample in the reader is
    image pixels in [0, 1] and label in [0, 99].

    :return: Test reader creator.
    :rtype: callable
    """
    return reader_creator(download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'test')
def main():
    if len(sys.argv) < 3 or (sys.argv[1] != '-t' and len(sys.argv) > 3):
        help_info()
        return
    if sys.argv[1] == '-a':
        url = _albumUrl % sys.argv[2]
    elif sys.argv[1] == '-c':
        url = _collectUrl % sys.argv[2]
    elif sys.argv[1] == '-t':
        url = _trackUrl % ','.join(sys.argv[2:])
    else:
        help_info()
        return
    content = common.open_url(url)
    if not content:
        return
    res = extract(content)
    for title, uri, lrc in res:
        common.download(uri, TARGET, title, 'mp3')
        if lrc:
def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url]  # the queue of URLs to download
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                # add this link to the crawl queue
                crawl_queue.append(link)
def parse_content(url):
    html = download(url)
    tree = lxml.html.fromstring(html)
    td = tree.cssselect('ul.detail_list > li')
    for lis in td:
        item = {}
        item['title'] = lis.cssselect('h4 > a')[0].text_content()
        item['time'] = lis.cssselect('div.detail_b > span')[0].text_content()
        item['views'] = lis.cssselect('div.detail_b > em')[0].text_content()
        item['abstract'] = lis.cssselect('p.detail_p')[0].text_content()
        item['link'] = lis.cssselect('h4 > a')[0].attrib['href']
        yield item
def train10():
    """
    CIFAR-10 training set creator.

    It returns a reader creator, each sample in the reader is
    image pixels in [0, 1] and label in [0, 9].

    :return: Training reader creator
    :rtype: callable
    """
    return reader_creator(
        download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'data_batch')
def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True):
    '''
    Create flowers validation set reader.
    It returns a reader, each sample in the reader is
    image pixels in [0, 1] and label in [1, 102]
    translated from original color image by steps:
    1. resize to 256*256
    2. random crop to 224*224
    3. flatten
    :param mapper: a function to map sample.
    :type mapper: callable
    :param buffered_size: the size of buffer used to process images
    :type buffered_size: int
    :return: validation data reader
    :rtype: callable
    '''
    return reader_creator(
        download(DATA_URL, 'flowers', DATA_MD5),
        download(LABEL_URL, 'flowers', LABEL_MD5),
        download(SETID_URL, 'flowers', SETID_MD5),
        VALID_FLAG, mapper, buffered_size, use_xmap)
def iteration():
    max_errors = 5  # maximum number of consecutive download errors allowed
    num_errors = 0  # current number of consecutive download errors
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/view/-{}'.format(page)
        html = download(url)
        if html is None:
            # received an error trying to download this webpage
            num_errors += 1
            if num_errors == max_errors:
                # reached the maximum number of errors in a row, so assume we
                # have reached the last country ID and can stop downloading
                break
        else:
            # success - can scrape the result
            # ...
            num_errors = 0
def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url]
    seen = set(crawl_queue)  # keep track of which URLs have been seen before
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            # check if link matches expected regex
            if re.match(link_regex, link):
                # form absolute link
                link = urlparse.urljoin(seed_url, link)
                # check if have already seen this link
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
def test():
    """
    Conll05 test set creator.

    Because the training dataset is not free, the test dataset is used for
    training. It returns a reader creator, each sample in the reader is nine
    features, including sentence sequence, predicate, predicate context,
    predicate context flag and tagged sequence.

    :return: Training reader creator
    :rtype: callable
    """
    word_dict, verb_dict, label_dict = get_dict()
    reader = corpus_reader(
        download(DATA_URL, 'conll05st', DATA_MD5),
        words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
        props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
    return reader_creator(reader, word_dict, verb_dict, label_dict)
def __initialize_meta_info__():
    fn = download(URL, "movielens", MD5)
    global MOVIE_INFO
    if MOVIE_INFO is None:
        pattern = re.compile(r'^(.*)\((\d+)\)$')
        with zipfile.ZipFile(file=fn) as package:
            for info in package.infolist():
                assert isinstance(info, zipfile.ZipInfo)
            MOVIE_INFO = dict()
            title_word_set = set()
            categories_set = set()
            with package.open('ml-1m/movies.dat') as movie_file:
                for i, line in enumerate(movie_file):
                    movie_id, title, categories = line.strip().split('::')
                    categories = categories.split('|')
                    for c in categories:
                        categories_set.add(c)
                    title = pattern.match(title).group(1)
                    MOVIE_INFO[int(movie_id)] = MovieInfo(
                        index=movie_id, categories=categories, title=title)
                    for w in title.split():
                        title_word_set.add(w.lower())

            global MOVIE_TITLE_DICT
            MOVIE_TITLE_DICT = dict()
            for i, w in enumerate(title_word_set):
                MOVIE_TITLE_DICT[w] = i

            global CATEGORIES_DICT
            CATEGORIES_DICT = dict()
            for i, c in enumerate(categories_set):
                CATEGORIES_DICT[c] = i

            global USER_INFO
            USER_INFO = dict()
            with package.open('ml-1m/users.dat') as user_file:
                for line in user_file:
                    uid, gender, age, job, _ = line.strip().split("::")
                    USER_INFO[int(uid)] = UserInfo(
                        index=uid, gender=gender, age=age, job_id=job)
    return fn
def get_image(self, image_object, try_web=True):
    image_path = self.get_image_path(image_object.get_small_basename())
    temp_path = self.get_temp_path(image_object.get_small_basename())
    if os.path.exists(image_path) and os.path.isfile(image_path):
        try:
            pixbuf = gtk.gdk.pixbuf_new_from_file(image_path)
        except gobject.GError:
            try:
                os.unlink(image_path)
            except:
                pass
        else:
            del pixbuf
            return image_path
    if try_web:
        small_image_url = image_object.small_url
        if small_image_url:
            ret = common.download(small_image_url, temp_path)
            if ret and self.cleanup_small(temp_path, image_path):
                return image_path
    return None
cwd = os.getcwd()
start_time = time.time()
if args.begin_phase <= 1:
    print('\n======= Phase I, downloading data =======')
    for name, url_tuple in baseline_data.items():
        if verbose:
            print('Downloading ' + str(name))
            sys.stdout.flush()
        path = os.path.join('datasets/', name)
        # if url_tuple[RES_LOCATION].startswith('http') or \
        #         url_tuple[RES_LOCATION].startswith('ftp'):
        loc = url_tuple[RES_LOCATION]
        if any([loc.startswith(x) for x in ['file', 'ftp', 'http']]):
            download(url_tuple[RES_LOCATION], path)
        print(loc)
    print('Phase 1 ran in %.3f minutes' % ((time.time() - start_time) / 60))
    if args.end_phase == 1:
        print('\nTerminating process after phase 1 as specified by user.')
        print('Total runtime: %.3f minutes' % ((time.time() - start_time) / 60))
        sys.exit()
else:
    print('\nSkipping phase 1.')
sys.stdout.flush()

if args.begin_phase <= 2:
    print('\n======= Phase II, parsing data =======')
    # For now, download and store the data in the parsed.py module. This module
if not os.path.exists(resource_dir):
    os.mkdir(resource_dir)

# change to resource directory
os.chdir(resource_dir)

# make dataset directory
if not os.path.exists(path_constants.dataset_dir):
    os.mkdir(path_constants.dataset_dir)

# create empty dictionary to hold all ns values and equivalence
gp_dict = {}

# parse reference dataset (entrez gene)
for path, url in gp_reference.file_to_url.items():
    download(url, path)
parser = gp_reference.parser_class(gp_reference.file_to_url)
print("Running " + str(parser))
(gene_dict, history_dict) = parser.parse()
gp_dict.update(gene_dict)

# parse dependent datasets
for d in gp_datasets:
    for path, url in d.file_to_url.items():
        download(url, path)
    parser = d.parser_class(gene_dict, history_dict, d.file_to_url)
    print("Running " + str(parser))
    gp_dict.update(parser.parse())

print("Completed gene protein resource generation.")
print("Number of namespace entries: %d" % (len(gp_dict)))
def fetch():
    download(DATA_URL, 'flowers', DATA_MD5)
    download(LABEL_URL, 'flowers', LABEL_MD5)
    download(SETID_URL, 'flowers', SETID_MD5)
def fetch():
    download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
    download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
    download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
    download(EMB_URL, 'conll05st', EMB_MD5)
    download(DATA_URL, 'conll05st', DATA_MD5)
def fetch():
    return download(URL, "MQ2007", MD5)
def get_embedding():
    """
    Get the trained word vector based on Wikipedia corpus.
    """
    return download(EMB_URL, 'conll05st', EMB_MD5)
change_log['MESHCL'] = {}
change_log['MESHD'] = {}
change_log['SCHEM'] = {}
change_log['SDIS'] = {}
change_log['DO'] = {}
change_log['DOID'] = {}

# download the data needed for resolving lost values
print('\nDownloading data needed for resolving changed/lost terms...')
if not os.path.exists('changelog_datasets/'):
    os.mkdir('changelog_datasets/')
for name, data_tuple in changelog_data.items():
    if verbose:
        print('Downloading ' + str(data_tuple[RES_LOCATION]))
    path = os.path.join('changelog_datasets/', name)
    if 'ftp' in data_tuple[RES_LOCATION] or 'http' in data_tuple[RES_LOCATION]:
        download(data_tuple[RES_LOCATION], path)

print('Resolving changed/lost terms...')
sp_accession_ids = []
for label, data_tuple in changelog_data.items():
    url = label
    parser = data_tuple[PARSER_TYPE]('changelog_datasets/' + url)
    if str(parser) == 'EntrezGeneHistory_Parser':
        log = change_log.get('EGID')
        if verbose:
            print('\nGathering Entrez update info...')
        for row in parser.parse():
            discontinued_id = row.get('Discontinued_GeneID')
            gid = row.get('GeneID')
            replacement_id = gid if gid != '-' else 'withdrawn'
def fetch():
    download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
    download(CIFAR100_URL, 'cifar', CIFAR100_MD5)