def __init__(self, config: dict):
    assert "url" in config, "vid2scene: missing video stream url"
    assert "id" in config, "vid2scene: missing scene id"
    # meta configs
    self.config = config
    self.debug = config.get("debug", False)
    self.display = config.get("display", False)
    self.gvr = config.get("gvr", None)
    # video capture and motion detect
    self.url = config["url"]
    self.cap_interval = config.get("cap_interval", 1)
    self.vcap = None
    self.md = None
    # ml model states
    self.net = None
    self.classes = list()
    self.output_layers = None
    self.colors = None
    self.conf_thresh = config.get("vid2scene_conf_thresh", 0.5)  # confidence threshold
    # download model weights if not exist
    if not os.path.isfile(_net_weight_file):
        print("vid2scene: missing model weights file, downloading..")
        download_url(_net_weight_url, _net_weight_file)
    # state store
    self.scene = SceneStatus(id_=config["id"])
    self.scene.status = defaultdict(dict)
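# Note: download_url(), _net_weight_file and _net_weight_url are defined at module
# level and are not shown in this snippet. As a rough illustration only, a
# (url, destination) style helper like the one called above could be written with
# just the standard library:
import shutil
import urllib.request

def download_url(url, dest_path):
    # stream the response straight to disk so large weight files
    # are never held fully in memory
    with urllib.request.urlopen(url) as response, open(dest_path, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)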
def download_glove():
    glove_file = 'data/glove.6B.zip'
    glove_url = 'http://nlp.stanford.edu/data/glove.6B.zip'
    if not os.path.isfile(glove_file):
        util.download_url(glove_url, 'data/glove.6B.zip')
    with zipfile.ZipFile(glove_file, 'r') as zip_ref:
        zip_ref.extractall('data/glove.6B')
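# A small usage sketch (not part of the original module): once download_glove()
# has run, the extracted 'glove.6B.50d.txt' file contains one token per line
# followed by its space-separated float components, and can be read like this:
def load_glove_vectors(path='data/glove.6B/glove.6B.50d.txt'):
    vectors = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            vectors[parts[0]] = [float(x) for x in parts[1:]]
    return vectors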
def download_espgame(root):
    espgame_path = os.path.join(root, 'ESP-ImageSet')
    # create directory
    if not os.path.exists(root):
        os.makedirs(root)
    if not os.path.exists(espgame_path):
        parts = urlparse(espgame_url)
        filename = os.path.basename(parts.path)
        tmp_path = os.path.join(root, 'tmp')
        cached_file = os.path.join(tmp_path, filename)
        if not os.path.exists(tmp_path):
            os.makedirs(tmp_path)
        if not os.path.exists(cached_file):
            print('Downloading: "{}" to {}\n'.format(espgame_url, cached_file))
            util.download_url(espgame_url, cached_file)
        # extract file
        print('[dataset] Extracting tar file {file} to {path}'.format(file=cached_file, path=root))
        cwd = os.getcwd()
        tar = tarfile.open(cached_file, "r:gz")
        os.chdir(root)
        tar.extractall()
        tar.close()
        os.chdir(cwd)
        print('[dataset] Done!')
def post(self):
    server = 'ftp.ebi.ac.uk'
    user = '******'
    ftp = ftplib.FTP(server)
    ftp.login(user)
    #go_dir = 'pub'
    #ftp.cwd(go_dir)
    version_pat = re.compile(r'!GO-version: .*/(\d{4}-\d\d-\d\d)/go\.owl$')
    revision_pat = re.compile(r'<a href="/viewvc/GO-SVN\?view=revision&revision=(\d+)"')
    # get latest version
    versions = self.get_current_versions()
    # naming scheme: species_version_date.gaf.gz
    for spec in self.species:
        name = self.species_names[spec]
        # locate the GAF file on the GOA server
        remote_dir = '/pub/databases/GO/goa/%s' % (name.upper())
        remote_file = 'gene_association.goa_%s.gz' % (name.lower())
        remote_path = '%s/%s' % (remote_dir, remote_file)
        url = 'ftp://%s%s' % (server, remote_path)
        file_name = '%s_%s_%s.gaf.gz' % (spec, versions[name][0], versions[name][1])
        # get file size
        remote_size = ftp.size(remote_path)
        logger.debug('Remote file size: %s', str(remote_size))
        # check if we need to download the file by comparing it to the local file (if it exists)
        gaf_file = self.data_dir + os.sep + file_name
        if os.path.isfile(gaf_file) and os.path.getsize(gaf_file) == remote_size:
            continue  # also skip downloading OBO file
        # download file
        logger.debug('Downloading file "%s"...', url)
        util.download_url(url, gaf_file)
        # make sure download was successful
        if (not os.path.isfile(gaf_file)) or (os.path.getsize(gaf_file) != remote_size):
            logger.debug('Download unsuccessful! Deleting file...')
            if os.path.isfile(gaf_file):  # race condition?
                os.remove(gaf_file)
        # get corresponding gene ontology version from the header of the GAF file
        version = self.get_gaf_ontology_version(gaf_file)
        # get the url of the corresponding "go-basic.obo" file on the GO SVN server
        url = self.get_obo_url(version)
        obo_file = self.data_dir + os.sep + '%s_%s_%s.obo' % (spec, versions[name][0], versions[name][1])
        # download the obo file
        util.download_url(url, obo_file)
    self.data['go_annotations'] = GOAnnotationData.find_go_annotations(self.data_dir)
def refresh_arch_repo(repo):
    """ Refresh all mirrors of an arch linux repo """
    fname = '{0!s}.db'.format(repo.repo_id)
    for mirror in repo.mirror_set.filter(refresh=True):
        res = find_mirror_url(mirror.url, [fname])
        mirror.last_access_ok = response_is_valid(res)
        if mirror.last_access_ok:
            mirror_url = res.url
            text = 'Found arch repo - {0!s}'.format(mirror_url)
            info_message.send(sender=None, text=text)
            data = download_url(res, 'Downloading repo info:')
            if data is None:
                mirror.fail()
                return
            sha1 = get_sha1(data)
            if mirror.file_checksum == sha1:
                text = 'Mirror checksum has not changed, '
                text += 'not refreshing package metadata'
                warning_message.send(sender=None, text=text)
            else:
                packages = extract_arch_packages(data)
                mirror.last_access_ok = True
                mirror.timestamp = datetime.now()
                update_mirror_packages(mirror, packages)
                mirror.file_checksum = sha1
                packages.clear()
        else:
            mirror.fail()
        mirror.save()
def customize_bert_vocab():
    vocab_filename = BERT_VOCAB_FILE.format(BERT_MODEL)
    vocab_url = PRETRAINED_VOCAB_ARCHIVE_MAP[BERT_MODEL]
    util.download_url(vocab_url, vocab_filename)
    vocab = list(load_bert_vocab(vocab_filename).keys())  # load_vocab gives an OrderedDict
    custom_tokens = ['[SPKR_A]', '[SPKR_B]', '<laughter>']  # TODO: add disfluencies
    # most of the first 1000 tokens are [unusedX], but [PAD], [CLS], etc are scattered in there too
    for new_token in custom_tokens:
        for i, existing_token in enumerate(vocab):
            if re.match(r"\[unused\d+\]", existing_token):
                vocab[i] = new_token
                log.info("Custom BERT vocab: {} -> {} (replaced {})".format(new_token, i, existing_token))
                break
            elif i > 1000:
                raise ValueError("Couldn't find any unused tokens to replace :(")
    with open(vocab_filename, 'w', encoding="utf-8") as f:
        for token in vocab:
            f.write(token + '\n')
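# load_bert_vocab() is defined elsewhere; a minimal sketch of what it is assumed
# to do here (read one token per line into an OrderedDict of token -> index,
# the inverse of the write-out at the end of customize_bert_vocab above):
from collections import OrderedDict

def load_bert_vocab(vocab_file):
    vocab = OrderedDict()
    with open(vocab_file, 'r', encoding='utf-8') as f:
        for index, line in enumerate(f):
            vocab[line.rstrip('\n')] = index
    return vocab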
def refresh_yum_repo(mirror, data, mirror_url, ts):
    """ Refresh package metadata for a yum-style rpm mirror
        and add the packages to the mirror
    """
    primary_url, checksum, checksum_type = get_primary_url(mirror_url, data)
    if not primary_url:
        mirror.fail()
        return
    res = get_url(primary_url)
    mirror.last_access_ok = response_is_valid(res)
    if not mirror.last_access_ok:
        mirror.fail()
        return
    data = download_url(res, 'Downloading repo info (2/2):')
    if data is None:
        mirror.fail()
        return
    sha = get_sha(checksum_type, data)
    if sha is None:
        mirror.fail()
        return
    if not checksum_is_valid(sha, checksum, mirror):
        mirror.fail()
        return
    if mirror.file_checksum == checksum:
        text = 'Mirror checksum has not changed, '
        text += 'not refreshing package metadata'
        warning_message.send(sender=None, text=text)
        return
    mirror.file_checksum = checksum
    if hasattr(settings, 'MAX_MIRRORS') and \
            isinstance(settings.MAX_MIRRORS, int):
        max_mirrors = settings.MAX_MIRRORS
    # only refresh X mirrors, where X = max_mirrors
    checksum_q = Q(mirrorlist=False, refresh=True, timestamp=ts,
                   file_checksum=checksum)
    have_checksum = mirror.repo.mirror_set.filter(checksum_q).count()
    if have_checksum >= max_mirrors:
        text = '{0!s} mirrors already have this '.format(max_mirrors)
        text += 'checksum, ignoring refresh to save time'
        info_message.send(sender=None, text=text)
    else:
        packages = extract_yum_packages(data, primary_url)
        if packages:
            update_mirror_packages(mirror, packages)
def refresh_rpm_repo(repo):
    """ Refresh an rpm repo.
        Checks if the repo url is a mirrorlist, and extracts mirrors if so.
        If not, checks a number of common rpm repo formats to determine
        which type of repo it is, and to determine the mirror urls.
    """
    formats = [
        'repodata/repomd.xml.bz2',
        'repodata/repomd.xml.gz',
        'repodata/repomd.xml',
        'suse/repodata/repomd.xml.bz2',
        'suse/repodata/repomd.xml.gz',
        'suse/repodata/repomd.xml',
        'content',
    ]
    if lzma is not None:
        formats.insert(0, 'repodata/repomd.xml.xz')
        formats.insert(4, 'suse/repodata/repomd.xml.xz')
    check_for_mirrorlists(repo)
    check_for_metalinks(repo)
    if hasattr(settings, 'MAX_MIRRORS') and \
            isinstance(settings.MAX_MIRRORS, int):
        max_mirrors = settings.MAX_MIRRORS
    ts = datetime.now().replace(microsecond=0)
    enabled_mirrors = repo.mirror_set.filter(mirrorlist=False, refresh=True)
    for i, mirror in enumerate(enabled_mirrors):
        res = find_mirror_url(mirror.url, formats)
        mirror.last_access_ok = response_is_valid(res)
        if mirror.last_access_ok:
            if i >= max_mirrors:
                text = '{0!s} mirrors already refreshed, '.format(max_mirrors)
                text += ' not refreshing {0!s}'.format(mirror.url)
                warning_message.send(sender=None, text=text)
                continue
            data = download_url(res, 'Downloading repo info (1/2):')
            if data is None:
                mirror.fail()
                return
            mirror_url = res.url
            if res.url.endswith('content'):
                text = 'Found yast rpm repo - {0!s}'.format(mirror_url)
                info_message.send(sender=None, text=text)
                refresh_yast_repo(mirror, data)
            else:
                text = 'Found yum rpm repo - {0!s}'.format(mirror_url)
                info_message.send(sender=None, text=text)
                refresh_yum_repo(mirror, data, mirror_url, ts)
            mirror.timestamp = ts
        else:
            mirror.fail()
        mirror.save()
def get_images(self):
    self.img_dir = os.path.join(self.config.img_dir, self.get_column("B"))
    if not os.path.exists(self.img_dir):
        os.makedirs(self.img_dir)
    self.img_paths = []
    index = 0
    for url in self.get_column("I").split(","):
        img_name = url.rsplit("/", 1)[-1]
        # path = os.path.join(self.record_path, str(index).zfill(3) + ".jpg")
        path = os.path.join(self.img_dir, img_name)
        if not self.config.cache_img or not os.path.exists(path):
            util.download_url(url, path)
            time.sleep(1)
        if os.stat(path).st_size > 0:
            self.img_paths.append(path)
        else:
            self.logger.warn("Removed file: " + path + " from url: " + url)
            os.remove(path)
        index += 1
def _download(self, filename: str) -> bool:
    try:
        filepath = path.join(self.root, filename)
        destination = f"{self.root}/{filename.replace('.deb', '')}"
        tmp_dir = path.join(self.root, '.tmp')
        if not path.exists(destination):
            download_url(
                f'http://ftp.de.debian.org/debian/pool/main/a/agda/{filename}',
                filepath)
            unzip(filepath)
            os.mkdir(tmp_dir)
            Archive(filepath).extractall(tmp_dir)
            data_tar = path.join(tmp_dir, 'data.tar')
            Archive(data_tar).extractall(tmp_dir)
            shutil.move(f"{tmp_dir}/usr/bin/agda", destination)
            shutil.rmtree(tmp_dir)
            os.remove(filepath)
        return True
    except Exception as e:
        log.error(f"Could not download and install: {e}")
        return False
def refresh_rpm_repo(repo):
    """ Refresh an rpm repo.
        Checks if the repo url is a mirrorlist, and extracts mirrors if so.
        If not, checks a number of common rpm repo formats to determine
        which type of repo it is, and to determine the mirror urls.
    """
    formats = [
        'repodata/repomd.xml.bz2',
        'repodata/repomd.xml.gz',
        'repodata/repomd.xml',
        'suse/repodata/repomd.xml.bz2',
        'suse/repodata/repomd.xml.gz',
        'suse/repodata/repomd.xml',
        'content',
    ]
    if lzma is not None:
        formats.insert(0, 'repodata/repomd.xml.xz')
        formats.insert(4, 'suse/repodata/repomd.xml.xz')
    check_for_mirrorlists(repo)
    check_for_metalinks(repo)
    ts = datetime.now().replace(microsecond=0)
    for mirror in repo.mirror_set.filter(mirrorlist=False, refresh=True):
        res = find_mirror_url(mirror.url, formats)
        mirror.last_access_ok = response_is_valid(res)
        if mirror.last_access_ok:
            data = download_url(res, 'Downloading repo info (1/2):')
            if data is None:
                mirror.fail()
                return
            mirror_url = res.url
            if res.url.endswith('content'):
                text = 'Found yast rpm repo - {0!s}'.format(mirror_url)
                info_message.send(sender=None, text=text)
                refresh_yast_repo(mirror, data)
            else:
                text = 'Found yum rpm repo - {0!s}'.format(mirror_url)
                info_message.send(sender=None, text=text)
                refresh_yum_repo(mirror, data, mirror_url, ts)
            mirror.timestamp = ts
        else:
            mirror.fail()
        mirror.save()
def get_mirrorlist_urls(url):
    """ Checks if a given url returns a mirrorlist by checking if it is of
        type text/plain and contains a list of urls.
        Returns a list of mirrors if it is a mirrorlist.
    """
    res = get_url(url)
    if response_is_valid(res):
        if 'content-type' in res.headers and \
                'text/plain' in res.headers['content-type']:
            data = download_url(res, 'Downloading repo info:')
            if data is None:
                return
            mirror_urls = re.findall(b'^http://.*$|^ftp://.*$',
                                     data, re.MULTILINE)
            if mirror_urls:
                return mirror_urls
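# A quick, self-contained illustration (not from the original module) of how the
# pattern above behaves: comment lines and https urls are ignored, plain http and
# ftp mirror urls are returned as bytes.
import re

sample = (b"# mirrorlist\n"
          b"http://mirror.example.org/centos/7/os/x86_64/\n"
          b"ftp://ftp.example.net/pub/centos/\n"
          b"https://secure.example.com/centos/\n")
print(re.findall(b'^http://.*$|^ftp://.*$', sample, re.MULTILINE))
# [b'http://mirror.example.org/centos/7/os/x86_64/', b'ftp://ftp.example.net/pub/centos/']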
def get_metalink_urls(url):
    """ Parses a metalink and returns a list of mirrors """
    res = get_url(url)
    if response_is_valid(res):
        if 'content-type' in res.headers and \
                res.headers['content-type'] == 'application/metalink+xml':
            data = download_url(res, 'Downloading repo info:')
            ns = 'http://www.metalinker.org/'
            try:
                context = etree.parse(BytesIO(data), etree.XMLParser())
            except etree.XMLSyntaxError:
                context = etree.parse(BytesIO(extract(data, 'gz')),
                                      etree.XMLParser())
            xpath = "//ns:files/ns:file[@name='repomd.xml']/ns:resources/ns:url[@protocol='https']"  # noqa
            metalink_urls = context.xpath(xpath, namespaces={'ns': ns})
            return [x.text for x in metalink_urls]
def crawl_reddit(self):
    current_page_url = self.start_url
    logging.getLogger('requests').setLevel(logging.WARNING)
    logging.debug('Starting to crawl page {}'.format(self.start_url))
    #headers = {'User-Agent': 'SearchingBot 0.1'}
    ok_url_count = 0
    error_url_count = 0
    while True:
        if (ok_url_count + error_url_count) % 100 == 0:
            logging.info("Crawled {} oks - {} errors".format(ok_url_count, error_url_count))
        current_page = download_reddit_url(current_page_url)  # requests.get(current_page_url, headers=headers)
        logging.debug('Current page: {}'.format(current_page_url))
        soup = BeautifulSoup(current_page)
        links = [Crawler._make_absolute_url(a['href'])
                 for a in soup.find_all('a', attrs={'class': 'title'})
                 if not (a['href'].startswith('http') or a['href'].startswith('javascript'))]
        try:
            for link in links:
                ok_url_count += 1
                html = download_url(link)
                stored_text_file_name = os.path.join(self.storage_dir, base64.b16encode(link))
                with open(stored_text_file_name, 'w') as storage_file:
                    storage_file.write(html.encode('utf-8'))
                time.sleep(2)
        except Exception as e:
            logging.error(u'Error occurred while crawling {}'.format(current_page_url))
            logging.exception(e)
            error_url_count += 1
        next_page_url = soup.find('a', attrs={'rel': 'next'})['href']
        logging.debug('First post is {}'.format(links[0]))
        current_page_url = next_page_url
        ok_url_count += 1
        time.sleep(2)
def refresh_yast_repo(mirror, data):
    """ Refresh package metadata for a yast-style rpm mirror
        and add the packages to the mirror
    """
    package_dir = re.findall('DESCRDIR *(.*)', data.decode('utf-8'))[0]
    package_url = '{0!s}/{1!s}/packages.gz'.format(mirror.url, package_dir)
    res = get_url(package_url)
    mirror.last_access_ok = response_is_valid(res)
    if mirror.last_access_ok:
        data = download_url(res, 'Downloading repo info (2/2):')
        if data is None:
            mirror.fail()
            return
        mirror.file_checksum = 'yast'
        packages = extract_yast_packages(data)
        if packages:
            update_mirror_packages(mirror, packages)
    else:
        mirror.fail()
def refresh_arch_repo(repo):
    """ Refresh all mirrors of an arch linux repo """
    if hasattr(settings, 'MAX_MIRRORS') and \
            isinstance(settings.MAX_MIRRORS, int):
        max_mirrors = settings.MAX_MIRRORS
    fname = '{0!s}/{1!s}.db'.format(repo.arch, repo.repo_id)
    ts = datetime.now().replace(microsecond=0)
    for i, mirror in enumerate(repo.mirror_set.filter(refresh=True)):
        res = find_mirror_url(mirror.url, [fname])
        mirror.last_access_ok = response_is_valid(res)
        if mirror.last_access_ok:
            if i >= max_mirrors:
                text = '{0!s} mirrors already refreshed, '.format(max_mirrors)
                text += ' not refreshing {0!s}'.format(mirror.url)
                warning_message.send(sender=None, text=text)
                continue
            mirror_url = res.url
            text = 'Found arch repo - {0!s}'.format(mirror_url)
            info_message.send(sender=None, text=text)
            data = download_url(res, 'Downloading repo info:')
            if data is None:
                mirror.fail()
                return
            computed_checksum = get_checksum(data, Checksum.sha1)
            if mirror.file_checksum == computed_checksum:
                text = 'Mirror checksum has not changed, '
                text += 'not refreshing package metadata'
                warning_message.send(sender=None, text=text)
            else:
                packages = extract_arch_packages(data)
                mirror.last_access_ok = True
                mirror.timestamp = ts
                update_mirror_packages(mirror, packages)
                mirror.file_checksum = computed_checksum
                packages.clear()
        else:
            mirror.fail()
        mirror.save()
def refresh_deb_repo(repo):
    """ Refresh a debian repo.
        Checks for the Packages* files to determine what the mirror urls
        are and then downloads and extracts packages from those files.
    """
    formats = ['Packages.bz2', 'Packages.gz', 'Packages']
    if lzma is not None:
        formats.insert(0, 'Packages.xz')
    ts = datetime.now().replace(microsecond=0)
    for mirror in repo.mirror_set.filter(refresh=True):
        res = find_mirror_url(mirror.url, formats)
        mirror.last_access_ok = response_is_valid(res)
        if mirror.last_access_ok:
            mirror_url = res.url
            text = 'Found deb repo - {0!s}'.format(mirror_url)
            info_message.send(sender=None, text=text)
            data = download_url(res, 'Downloading repo info:')
            if data is None:
                mirror.fail()
                return
            computed_checksum = get_checksum(data, Checksum.sha1)
            if mirror.file_checksum == computed_checksum:
                text = 'Mirror checksum has not changed, '
                text += 'not refreshing package metadata'
                warning_message.send(sender=None, text=text)
            else:
                packages = extract_deb_packages(data, mirror_url)
                mirror.last_access_ok = True
                mirror.timestamp = ts
                update_mirror_packages(mirror, packages)
                mirror.file_checksum = computed_checksum
                packages.clear()
        else:
            mirror.fail()
        mirror.save()
def download_errata():
    """ Download CentOS errata from https://cefs.steve-meier.de/ """
    res = get_url('https://cefs.steve-meier.de/errata.latest.xml.bz2')
    return download_url(res, 'Downloading CentOS Errata:')
    source, resume = sys.argv[1].strip(), sys.argv[2]
else:
    print_help()
    raise SystemExit(1)

# See if archive folder already exists
for out_dir in (OUTPUT_DIR, 'bookmarks', 'pocket', 'pinboard', 'html'):
    if os.path.exists(out_dir):
        break
else:
    out_dir = OUTPUT_DIR

# Step 0: Download url to local file (only happens if a URL is specified instead of local path)
if source and any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
    source = download_url(source)
elif stdin_raw_text:
    source = save_source(stdin_raw_text)

# Step 1: Parse the links and dedupe them with existing archive
links = merge_links(archive_path=out_dir, import_path=source, only_new=ONLY_NEW)

# Step 2: Write new index
write_links_index(out_dir=out_dir, links=links)

# Step 3: Verify folder structure is 1:1 with index
# cleanup_archive(out_dir, links)

# Step 4: Run the archive methods for each link
def crawl_wikipedia(self):
    def check_a_node(a):
        if a and a.get('href', None):
            url = a['href']
            ignore_urls_starts = ['/wiki/Wikipedia', '/wiki/Special',
                                  '/wiki/Category', '/wiki/Template_talk',
                                  '/wiki/Book', '/wiki/Template',
                                  '/wiki/Talk', '/wiki/BookSources',
                                  '/wiki/File']
            if url.startswith('/wiki') and not url.split(':')[0] in ignore_urls_starts:
                return True
        return False

    def make_absolute_wiki_url(url):
        return 'https://en.wikipedia.org' + url

    def prepare_url(url):
        # strip the fragment before making the url absolute
        return make_absolute_wiki_url(re.split(r'#', url)[0])

    start_time = time.time()
    current_page_url = self.start_url
    logging.getLogger('requests').setLevel(logging.WARNING)
    logging.debug('Starting to crawl page {}'.format(self.start_url))
    #headers = {'User-Agent': 'SearchingBot 0.1'}
    ok_url_count = 0
    error_url_count = 0
    url_number = 0
    links_to_crawl = deque()
    links_to_crawl.append(current_page_url)
    crawled_links = set()
    while True:
        url = links_to_crawl.popleft()
        if not url.startswith('https://en.wikipedia.org') or url in crawled_links:
            continue
        if (ok_url_count + error_url_count) % 100 == 0:
            logging.info("Crawled {} oks - {} errors".format(ok_url_count, error_url_count))
        try:
            current_page = download_url(url)
            logging.debug('{}. 200: {}'.format(url_number, url))
        except Exception as e:
            status_code = e.message
            logging.warning('{}. {}: {}'.format(url_number, status_code, url))
            continue
        url_number += 1
        soup = BeautifulSoup(current_page, 'html.parser')
        for tag in soup(['style', 'script']):
            tag.extract()
        links_to_crawl.extend([prepare_url(a['href'])
                               for a in soup.find_all('a') if check_a_node(a)])
        try:
            stored_text_file_name = os.path.join(self.storage_dir, base64.b16encode(url))
            with open(stored_text_file_name, 'w') as storage_file:
                storage_file.write(soup.get_text().encode('utf-8'))
            # time.sleep(2)
        except Exception as e:
            logging.error(u'Error occurred while crawling {}'.format(current_page_url))
            logging.exception(e)
            error_url_count += 1
        ok_url_count += 1
        crawled_links.add(url)
        if ok_url_count >= self.urls_to_crawl:
            break
    logging.debug('Total time: {}'.format(time.time() - start_time))
def download_voc2007(root):
    path_devkit = os.path.join(root, 'VOCdevkit')
    path_images = os.path.join(root, 'VOCdevkit', 'VOC2007', 'JPEGImages')
    tmpdir = os.path.join(root, 'tmp')
    # create directory
    if not os.path.exists(root):
        os.makedirs(root)
    if not os.path.exists(path_devkit):
        if not os.path.exists(tmpdir):
            os.makedirs(tmpdir)
        parts = urlparse(urls['devkit'])
        filename = os.path.basename(parts.path)
        cached_file = os.path.join(tmpdir, filename)
        if not os.path.exists(cached_file):
            print('Downloading: "{}" to {}\n'.format(urls['devkit'], cached_file))
            util.download_url(urls['devkit'], cached_file)
        # extract file
        print('[dataset] Extracting tar file {file} to {path}'.format(file=cached_file, path=root))
        cwd = os.getcwd()
        tar = tarfile.open(cached_file, "r")
        os.chdir(root)
        tar.extractall()
        tar.close()
        os.chdir(cwd)
        print('[dataset] Done!')
    # train/val images/annotations
    if not os.path.exists(path_images):
        # download train/val images/annotations
        parts = urlparse(urls['trainval_2007'])
        filename = os.path.basename(parts.path)
        cached_file = os.path.join(tmpdir, filename)
        if not os.path.exists(cached_file):
            print('Downloading: "{}" to {}\n'.format(urls['trainval_2007'], cached_file))
            util.download_url(urls['trainval_2007'], cached_file)
        # extract file
        print('[dataset] Extracting tar file {file} to {path}'.format(file=cached_file, path=root))
        cwd = os.getcwd()
        tar = tarfile.open(cached_file, "r")
        os.chdir(root)
        tar.extractall()
        tar.close()
        os.chdir(cwd)
        print('[dataset] Done!')
    # test annotations
    test_anno = os.path.join(path_devkit, 'VOC2007/ImageSets/Main/aeroplane_test.txt')
    if not os.path.exists(test_anno):
        # download test annotations
        parts = urlparse(urls['test_images_2007'])
        filename = os.path.basename(parts.path)
        cached_file = os.path.join(tmpdir, filename)
        if not os.path.exists(cached_file):
            print('Downloading: "{}" to {}\n'.format(urls['test_images_2007'], cached_file))
            util.download_url(urls['test_images_2007'], cached_file)
        # extract file
        print('[dataset] Extracting tar file {file} to {path}'.format(file=cached_file, path=root))
        cwd = os.getcwd()
        tar = tarfile.open(cached_file, "r")
        os.chdir(root)
        tar.extractall()
        tar.close()
        os.chdir(cwd)
        print('[dataset] Done!')
    # test images
    test_image = os.path.join(path_devkit, 'VOC2007/JPEGImages/000001.jpg')
    if not os.path.exists(test_image):
        # download test images
        parts = urlparse(urls['test_anno_2007'])
        filename = os.path.basename(parts.path)
        cached_file = os.path.join(tmpdir, filename)
        if not os.path.exists(cached_file):
            print('Downloading: "{}" to {}\n'.format(urls['test_anno_2007'], cached_file))
            util.download_url(urls['test_anno_2007'], cached_file)
        # extract file
        print('[dataset] Extracting tar file {file} to {path}'.format(file=cached_file, path=root))
        cwd = os.getcwd()
        tar = tarfile.open(cached_file, "r")
        os.chdir(root)
        tar.extractall()
        tar.close()
        os.chdir(cwd)
        print('[dataset] Done!')
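# The four blocks above repeat the same download-and-extract steps. A hypothetical
# helper (not part of the original module) that factors the pattern out, assuming
# the same util.download_url(url, path) signature and module-level imports
# (os, tarfile, urlparse, util):
def download_and_extract(url, root, tmpdir, mode="r"):
    parts = urlparse(url)
    cached_file = os.path.join(tmpdir, os.path.basename(parts.path))
    if not os.path.exists(cached_file):
        print('Downloading: "{}" to {}\n'.format(url, cached_file))
        util.download_url(url, cached_file)
    print('[dataset] Extracting tar file {file} to {path}'.format(file=cached_file, path=root))
    with tarfile.open(cached_file, mode) as tar:
        # extract into root directly instead of chdir-ing back and forth
        tar.extractall(path=root)
    print('[dataset] Done!')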
def download_errata_checksum():
    """ Download CentOS errata checksum from https://cefs.steve-meier.de/ """
    res = get_url('https://cefs.steve-meier.de/errata.latest.sha1')
    return download_url(res, 'Downloading Errata Checksum:')
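# How the two errata helpers above are tied together is not shown here. A
# hypothetical sketch that verifies the errata payload against the published sha1,
# assuming the checksum file's first whitespace-separated field is the hex digest
# and that the digest covers the downloaded (compressed) payload:
import hashlib

def fetch_verified_errata():
    data = download_errata()
    checksums = download_errata_checksum()
    if data is None or checksums is None:
        return None
    expected = checksums.decode().split()[0]
    actual = hashlib.sha1(data).hexdigest()
    return data if actual == expected else None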