Code example #1
    def __init__(self, config: dict):
        assert "url" in config, "vid2scene: missing video stream url"
        assert "id" in config, "vid2scene: missing scene id"

        # meta configs
        self.config = config
        self.debug = config.get("debug", False)
        self.display = config.get("display", False)
        self.gvr = config.get("gvr", None)

        # video capture and motion detect
        self.url = config["url"]
        self.cap_interval = config.get("cap_interval", 1)
        self.vcap = None
        self.md = None

        # ml model states
        self.net = None
        self.classes = list()
        self.output_layers = None
        self.colors = None
        self.conf_thresh = config.get("vid2scene_conf_thresh",
                                      0.5)  # confidence threshold

        # download model weights if they don't exist
        if not os.path.isfile(_net_weight_file):
            print("vid2scene: missing model weights file, downloading...")
            download_url(_net_weight_url, _net_weight_file)

        # state store
        self.scene = SceneStatus(id_=config["id"])
        self.scene.status = defaultdict(dict)
Code example #2
def download_glove():
    glove_file = 'data/glove.6B.zip'
    glove_url = 'http://nlp.stanford.edu/data/glove.6B.zip'
    if not os.path.isfile(glove_file):
        util.download_url(glove_url, glove_file)
    with zipfile.ZipFile(glove_file, 'r') as zip_ref:
        zip_ref.extractall('data/glove.6B')
Code example #3
def download_espgame(root):

    espgame_path = os.path.join(root, 'ESP-ImageSet')

    # create directory
    if not os.path.exists(root):
        os.makedirs(root)

    if not os.path.exists(espgame_path):
        parts = urlparse(espgame_url)
        filename = os.path.basename(parts.path)
        tmp_path = os.path.join(root, 'tmp')
        cached_file = os.path.join(tmp_path, filename)

        if not os.path.exists(tmp_path):
            os.makedirs(tmp_path)

        if not os.path.exists(cached_file):
            print('Downloading: "{}" to {}\n'.format(espgame_url, cached_file))
            util.download_url(espgame_url, cached_file)

        # extract file
        print('[dataset] Extracting tar file {file} to {path}'.format(file=cached_file, path=root))
        cwd = os.getcwd()
        tar = tarfile.open(cached_file, "r:gz")
        os.chdir(root)
        tar.extractall()
        tar.close()
        os.chdir(cwd)
        print('[dataset] Done!')
Code example #4
    def post(self):
        server = 'ftp.ebi.ac.uk'
        user = '******'
        ftp = ftplib.FTP(server)
        ftp.login(user)

        #go_dir = 'pub'
        #ftp.cwd(go_dir)
        version_pat = re.compile(r'!GO-version: .*/(\d{4}-\d\d-\d\d)/go\.owl$')
        revision_pat = re.compile(r'<a href="/viewvc/GO-SVN\?view=revision&amp;revision=(\d+)"')

        # get latest version
        versions = self.get_current_versions()

        # naming scheme: species_version_date.gaf.gz
        for spec in self.species:

            name = self.species_names[spec]

            # locate the GAF file on the GOA server
            remote_dir = '/pub/databases/GO/goa/%s' %(name.upper())
            remote_file = 'gene_association.goa_%s.gz' %(name.lower())
            remote_path = '%s/%s' %(remote_dir,remote_file)
            url = 'ftp://%s%s' %(server,remote_path)
            file_name = '%s_%s_%s.gaf.gz' %(spec,versions[name][0],versions[name][1])

            # get file size
            remote_size = ftp.size(remote_path)
            logger.debug('Remote file size: %s', str(remote_size))

            # check if we need to download the file by comparing it to the local file (if it exists)
            gaf_file = self.data_dir + os.sep + file_name
            if os.path.isfile(gaf_file) and os.path.getsize(gaf_file) == remote_size:
                continue # also skip downloading OBO file

            # download file
            logger.debug('Downloading file "%s"...', url)
            util.download_url(url, gaf_file)

            # make sure download was successful
            if (not os.path.isfile(gaf_file)) or (os.path.getsize(gaf_file) != remote_size):
                logger.debug('Download unsuccessful! Deleting file...')
                if os.path.isfile(gaf_file): # race condition?
                    os.remove(gaf_file)

            # get corresponding gene ontology version from the header of the GAF file
            version = self.get_gaf_ontology_version(gaf_file)
            # get the url of the corresponding "go-basic.obo" file on the GO SVN server
            url = self.get_obo_url(version)
            obo_file = self.data_dir + os.sep + '%s_%s_%s.obo' %(spec,versions[name][0],versions[name][1])
            # download the obo file
            util.download_url(url, obo_file)

        self.data['go_annotations'] = GOAnnotationData.find_go_annotations(self.data_dir)
Code example #5
File: utils.py Project: j4ckzh0u/patchman
def refresh_arch_repo(repo):
    """ Refresh all mirrors of an arch linux repo
    """
    fname = '{0!s}.db'.format(repo.repo_id)
    for mirror in repo.mirror_set.filter(refresh=True):
        res = find_mirror_url(mirror.url, [fname])
        mirror.last_access_ok = response_is_valid(res)

        if mirror.last_access_ok:
            mirror_url = res.url
            text = 'Found arch repo - {0!s}'.format(mirror_url)
            info_message.send(sender=None, text=text)
            data = download_url(res, 'Downloading repo info:')
            if data is None:
                mirror.fail()
                return
            sha1 = get_sha1(data)
            if mirror.file_checksum == sha1:
                text = 'Mirror checksum has not changed, '
                text += 'not refreshing package metadata'
                warning_message.send(sender=None, text=text)
            else:
                packages = extract_arch_packages(data)
                mirror.last_access_ok = True
                mirror.timestamp = datetime.now()
                update_mirror_packages(mirror, packages)
                mirror.file_checksum = sha1
                packages.clear()
        else:
            mirror.fail()
        mirror.save()
Code example #6
def customize_bert_vocab():
    vocab_filename = BERT_VOCAB_FILE.format(BERT_MODEL) 
    vocab_url = PRETRAINED_VOCAB_ARCHIVE_MAP[BERT_MODEL]
    util.download_url(vocab_url, vocab_filename)
    vocab = list(load_bert_vocab(vocab_filename).keys()) # load_vocab gives an OrderedDict 
    custom_tokens = ['[SPKR_A]', '[SPKR_B]', '<laughter>'] # TODO: add disfluencies
    # most of the first 1000 tokens are [unusedX], but [PAD], [CLS], etc are scattered in there too 
    for new_token in custom_tokens:
        for i, existing_token in enumerate(vocab):
            if re.match(r"\[unused\d+\]", existing_token):
                vocab[i] = new_token
                log.info("Custom BERT vocab: {} -> {} (replaced {})".format(new_token, i, existing_token))
                break
            elif i > 1000:
                raise ValueError("Couldn't find any unused tokens to replace :(")
    with open(vocab_filename, 'w', encoding="utf-8") as f:
        for token in vocab:
            f.write(token + '\n')
Code example #7
def refresh_yum_repo(mirror, data, mirror_url, ts):
    """ Refresh package metadata for a yum-style rpm mirror
        and add the packages to the mirror
    """

    primary_url, checksum, checksum_type = get_primary_url(mirror_url, data)

    if not primary_url:
        mirror.fail()
        return

    res = get_url(primary_url)
    mirror.last_access_ok = response_is_valid(res)

    if not mirror.last_access_ok:
        mirror.fail()
        return

    data = download_url(res, 'Downloading repo info (2/2):')
    if data is None:
        mirror.fail()
        return

    sha = get_sha(checksum_type, data)
    if sha is None:
        mirror.fail()
        return

    if not checksum_is_valid(sha, checksum, mirror):
        mirror.fail()
        return

    if mirror.file_checksum == checksum:
        text = 'Mirror checksum has not changed, '
        text += 'not refreshing package metadata'
        warning_message.send(sender=None, text=text)
        return

    mirror.file_checksum = checksum

    if hasattr(settings, 'MAX_MIRRORS') and \
            isinstance(settings.MAX_MIRRORS, int):
        max_mirrors = settings.MAX_MIRRORS
        # only refresh X mirrors, where X = max_mirrors
        checksum_q = Q(mirrorlist=False,
                       refresh=True,
                       timestamp=ts,
                       file_checksum=checksum)
        have_checksum = mirror.repo.mirror_set.filter(checksum_q).count()
        if have_checksum >= max_mirrors:
            text = '{0!s} mirrors already have this '.format(max_mirrors)
            text += 'checksum, ignoring refresh to save time'
            info_message.send(sender=None, text=text)
        else:
            packages = extract_yum_packages(data, primary_url)
            if packages:
                update_mirror_packages(mirror, packages)
Code example #8
File: utils.py Project: sdaru/patchman
def refresh_rpm_repo(repo):
    """ Refresh an rpm repo.
        Checks if the repo url is a mirrorlist, and extracts mirrors if so.
        If not, checks a number of common rpm repo formats to determine
        which type of repo it is, and to determine the mirror urls.
    """

    formats = [
        'repodata/repomd.xml.bz2',
        'repodata/repomd.xml.gz',
        'repodata/repomd.xml',
        'suse/repodata/repomd.xml.bz2',
        'suse/repodata/repomd.xml.gz',
        'suse/repodata/repomd.xml',
        'content',
    ]

    if lzma is not None:
        formats.insert(0, 'repodata/repomd.xml.xz')
        formats.insert(4, 'suse/repodata/repomd.xml.xz')

    check_for_mirrorlists(repo)
    check_for_metalinks(repo)

    if hasattr(settings, 'MAX_MIRRORS') and \
           isinstance(settings.MAX_MIRRORS, int):
        max_mirrors = settings.MAX_MIRRORS
    ts = datetime.now().replace(microsecond=0)
    enabled_mirrors = repo.mirror_set.filter(mirrorlist=False, refresh=True)
    for i, mirror in enumerate(enabled_mirrors):
        res = find_mirror_url(mirror.url, formats)
        mirror.last_access_ok = response_is_valid(res)
        if mirror.last_access_ok:
            if i >= max_mirrors:
                text = '{0!s} mirrors already refreshed, '.format(max_mirrors)
                text += ' not refreshing {0!s}'.format(mirror.url)
                warning_message.send(sender=None, text=text)
                continue
            data = download_url(res, 'Downloading repo info (1/2):')
            if data is None:
                mirror.fail()
                return
            mirror_url = res.url
            if res.url.endswith('content'):
                text = 'Found yast rpm repo - {0!s}'.format(mirror_url)
                info_message.send(sender=None, text=text)
                refresh_yast_repo(mirror, data)
            else:
                text = 'Found yum rpm repo - {0!s}'.format(mirror_url)
                info_message.send(sender=None, text=text)
                refresh_yum_repo(mirror, data, mirror_url, ts)
            mirror.timestamp = ts
        else:
            mirror.fail()
        mirror.save()
Code example #9
    def get_images(self):
        self.img_dir = os.path.join(self.config.img_dir, self.get_column("B"))
        if not os.path.exists(self.img_dir):
            os.makedirs(self.img_dir)
        self.img_paths = []
        index = 0
        for url in self.get_column("I").split(","):
            img_name = url.rsplit("/", 1)[-1]
            # path = os.path.join(self.record_path, str(index).zfill(3) + ".jpg")
            path = os.path.join(self.img_dir, img_name)

            if not self.config.cache_img or not os.path.exists(path):
                util.download_url(url, path)
                time.sleep(1)
            if os.stat(path).st_size > 0:
                self.img_paths.append(path)
            else:
                self.logger.warn("Removed file: " + path + " from url: " + url)
                os.remove(path)
            index += 1
Code example #10
File: switcher.py Project: bourbaki-network/refl
    def _download(self, filename: str) -> bool:
        try:
            filepath = path.join(self.root, filename)
            destination = f"{self.root}/{filename.replace('.deb', '')}"
            tmp_dir = path.join(self.root, '.tmp')

            if not path.exists(destination):
                download_url(
                    f'http://ftp.de.debian.org/debian/pool/main/a/agda/{filename}',
                    filepath)
                unzip(filepath)
                os.mkdir(tmp_dir)
                Archive(filepath).extractall(tmp_dir)
                data_tar = path.join(tmp_dir, 'data.tar')
                Archive(data_tar).extractall(tmp_dir)
                shutil.move(f"{tmp_dir}/usr/bin/agda", destination)
                shutil.rmtree(tmp_dir)
                os.remove(filepath)
            return True
        except Exception as e:
            log.error(f"Could not download and install: {e}")
            return False
Code example #11
def refresh_rpm_repo(repo):
    """ Refresh an rpm repo.
        Checks if the repo url is a mirrorlist, and extracts mirrors if so.
        If not, checks a number of common rpm repo formats to determine
        which type of repo it is, and to determine the mirror urls.
    """

    formats = [
        'repodata/repomd.xml.bz2',
        'repodata/repomd.xml.gz',
        'repodata/repomd.xml',
        'suse/repodata/repomd.xml.bz2',
        'suse/repodata/repomd.xml.gz',
        'suse/repodata/repomd.xml',
        'content',
    ]

    if lzma is not None:
        formats.insert(0, 'repodata/repomd.xml.xz')
        formats.insert(4, 'suse/repodata/repomd.xml.xz')

    check_for_mirrorlists(repo)
    check_for_metalinks(repo)

    ts = datetime.now().replace(microsecond=0)

    for mirror in repo.mirror_set.filter(mirrorlist=False, refresh=True):

        res = find_mirror_url(mirror.url, formats)
        mirror.last_access_ok = response_is_valid(res)

        if mirror.last_access_ok:
            data = download_url(res, 'Downloading repo info (1/2):')
            if data is None:
                mirror.fail()
                return
            mirror_url = res.url
            if res.url.endswith('content'):
                text = 'Found yast rpm repo - {0!s}'.format(mirror_url)
                info_message.send(sender=None, text=text)
                refresh_yast_repo(mirror, data)
            else:
                text = 'Found yum rpm repo - {0!s}'.format(mirror_url)
                info_message.send(sender=None, text=text)
                refresh_yum_repo(mirror, data, mirror_url, ts)
            mirror.timestamp = ts
        else:
            mirror.fail()
        mirror.save()
Code example #12
def get_mirrorlist_urls(url):
    """ Checks if a given url returns a mirrorlist by checking if it is of
        type text/plain and contains a list of urls. Returns a list of
        mirrors if it is a mirrorlist.
    """
    res = get_url(url)
    if response_is_valid(res):
        if 'content-type' in res.headers and \
           'text/plain' in res.headers['content-type']:
            data = download_url(res, 'Downloading repo info:')
            if data is None:
                return
            mirror_urls = re.findall(b'^http://.*$|^ftp://.*$', data,
                                     re.MULTILINE)
            if mirror_urls:
                return mirror_urls
Code example #13
def get_metalink_urls(url):
    """  Parses a metalink and returns a list of mirrors
    """
    res = get_url(url)
    if response_is_valid(res):
        if 'content-type' in res.headers and \
           res.headers['content-type'] == 'application/metalink+xml':
            data = download_url(res, 'Downloading repo info:')
            ns = 'http://www.metalinker.org/'
            try:
                context = etree.parse(BytesIO(data), etree.XMLParser())
            except etree.XMLSyntaxError:
                context = etree.parse(BytesIO(extract(data, 'gz')),
                                      etree.XMLParser())
            xpath = "//ns:files/ns:file[@name='repomd.xml']/ns:resources/ns:url[@protocol='https']"  # noqa
            metalink_urls = context.xpath(xpath, namespaces={'ns': ns})
            return [x.text for x in metalink_urls]
Code example #14
    def crawl_reddit(self):
        current_page_url = self.start_url
        logging.getLogger('requests').setLevel(logging.WARNING)
        logging.debug('Starting to crawl page {}'.format(self.start_url))

        #headers = {'User-Agent': 'SearchingBot 0.1'}
        ok_url_count = 0
        error_url_count = 0
        while True:

            if (ok_url_count + error_url_count) % 100 == 0:
                logging.info("Crawled {} oks - {} errors".format(
                    ok_url_count, error_url_count))
            current_page = download_reddit_url(
                current_page_url
            )  # requests.get(current_page_url, headers=headers)
            logging.debug('Current page: {}'.format(current_page_url))

            soup = BeautifulSoup(current_page)
            links = [
                Crawler._make_absolute_url(a['href'])
                for a in soup.find_all('a', attrs={'class': 'title'})
                if not (a['href'].startswith('http')
                        or a['href'].startswith('javascript'))
            ]
            try:
                for link in links:
                    ok_url_count += 1
                    html = download_url(link)
                    stored_text_file_name = os.path.join(
                        self.storage_dir, base64.b16encode(link))
                    with open(stored_text_file_name, 'w') as storage_file:
                        storage_file.write(html.encode('utf-8'))
                    time.sleep(2)
            except Exception as e:
                logging.error(u'Error occurred while crawling {}'.format(
                    current_page_url))
                logging.exception(e)
                error_url_count += 1

            next_page_url = soup.find('a', attrs={'rel': 'next'})['href']
            logging.debug('First post is {}'.format(links[0]))
            current_page_url = next_page_url
            ok_url_count += 1
            time.sleep(2)
Code example #15
def refresh_yast_repo(mirror, data):
    """ Refresh package metadata for a yast-style rpm mirror
        and add the packages to the mirror
    """
    package_dir = re.findall('DESCRDIR *(.*)', data.decode('utf-8'))[0]
    package_url = '{0!s}/{1!s}/packages.gz'.format(mirror.url, package_dir)
    res = get_url(package_url)
    mirror.last_access_ok = response_is_valid(res)
    if mirror.last_access_ok:
        data = download_url(res, 'Downloading repo info (2/2):')
        if data is None:
            mirror.fail()
            return
        mirror.file_checksum = 'yast'
        packages = extract_yast_packages(data)
        if packages:
            update_mirror_packages(mirror, packages)
    else:
        mirror.fail()
Code example #16
File: utils.py Project: sdaru/patchman
def refresh_arch_repo(repo):
    """ Refresh all mirrors of an arch linux repo
    """
    if hasattr(settings, 'MAX_MIRRORS') and \
           isinstance(settings.MAX_MIRRORS, int):
        max_mirrors = settings.MAX_MIRRORS
    fname = '{0!s}/{1!s}.db'.format(repo.arch, repo.repo_id)
    ts = datetime.now().replace(microsecond=0)
    for i, mirror in enumerate(repo.mirror_set.filter(refresh=True)):
        res = find_mirror_url(mirror.url, [fname])
        mirror.last_access_ok = response_is_valid(res)
        if mirror.last_access_ok:
            if i >= max_mirrors:
                text = '{0!s} mirrors already refreshed, '.format(max_mirrors)
                text += ' not refreshing {0!s}'.format(mirror.url)
                warning_message.send(sender=None, text=text)
                continue
            mirror_url = res.url
            text = 'Found arch repo - {0!s}'.format(mirror_url)
            info_message.send(sender=None, text=text)
            data = download_url(res, 'Downloading repo info:')
            if data is None:
                mirror.fail()
                return
            computed_checksum = get_checksum(data, Checksum.sha1)
            if mirror.file_checksum == computed_checksum:
                text = 'Mirror checksum has not changed, '
                text += 'not refreshing package metadata'
                warning_message.send(sender=None, text=text)
            else:
                packages = extract_arch_packages(data)
                mirror.last_access_ok = True
                mirror.timestamp = ts
                update_mirror_packages(mirror, packages)
                mirror.file_checksum = computed_checksum
                packages.clear()
        else:
            mirror.fail()
        mirror.save()
Code example #17
File: utils.py Project: sdaru/patchman
def refresh_deb_repo(repo):
    """ Refresh a debian repo.
        Checks for the Packages* files to determine what the mirror urls
        are and then downloads and extracts packages from those files.
    """

    formats = ['Packages.bz2', 'Packages.gz', 'Packages']
    if lzma is not None:
        formats.insert(0, 'Packages.xz')

    ts = datetime.now().replace(microsecond=0)
    for mirror in repo.mirror_set.filter(refresh=True):
        res = find_mirror_url(mirror.url, formats)
        mirror.last_access_ok = response_is_valid(res)

        if mirror.last_access_ok:
            mirror_url = res.url
            text = 'Found deb repo - {0!s}'.format(mirror_url)
            info_message.send(sender=None, text=text)
            data = download_url(res, 'Downloading repo info:')
            if data is None:
                mirror.fail()
                return
            computed_checksum = get_checksum(data, Checksum.sha1)
            if mirror.file_checksum == computed_checksum:
                text = 'Mirror checksum has not changed, '
                text += 'not refreshing package metadata'
                warning_message.send(sender=None, text=text)
            else:
                packages = extract_deb_packages(data, mirror_url)
                mirror.last_access_ok = True
                mirror.timestamp = ts
                update_mirror_packages(mirror, packages)
                mirror.file_checksum = computed_checksum
                packages.clear()
        else:
            mirror.fail()
        mirror.save()
Code example #18
File: crawler.py Project: Anton-Baton/search_engine
	def crawl_reddit(self):
		current_page_url = self.start_url
		logging.getLogger('requests').setLevel(logging.WARNING)
		logging.debug('Starting to crawl page {}'.format(self.start_url))

		#headers = {'User-Agent': 'SearchingBot 0.1'}	
		ok_url_count = 0
		error_url_count = 0
		while True:	
			
			if (ok_url_count + error_url_count) % 100 == 0:
				logging.info("Crawled {} oks - {} errors".format(ok_url_count, error_url_count))
			current_page = download_reddit_url(current_page_url)  # requests.get(current_page_url, headers=headers) 
			logging.debug('Current page: {}'.format(current_page_url))

			soup = BeautifulSoup(current_page)
			links = [Crawler._make_absolute_url(a['href']) for a in soup.find_all('a', attrs={'class': 'title'})
					if not (a['href'].startswith('http') or a['href'].startswith('javascript'))]
			try:	
				for link in links:
					ok_url_count += 1
					html = download_url(link)
					stored_text_file_name = os.path.join(self.storage_dir, base64.b16encode(link))
					with open(stored_text_file_name, 'w') as storage_file:
						storage_file.write(html.encode('utf-8'))
					time.sleep(2)
			except Exception as e:
				logging.error(u'Error occurred while crawling {}'.format(current_page_url))
				logging.exception(e)
				error_url_count += 1

			next_page_url = soup.find('a', attrs={'rel': 'next'})['href']
			logging.debug('First post is {}'.format(links[0]))
			current_page_url = next_page_url
			ok_url_count += 1
			time.sleep(2)
Code example #19
def download_errata():
    """ Download CentOS errata from https://cefs.steve-meier.de/
    """
    res = get_url('https://cefs.steve-meier.de/errata.latest.xml.bz2')
    return download_url(res, 'Downloading CentOS Errata:')
Code example #20
        source, resume = sys.argv[1].strip(), sys.argv[2]
    else:
        print_help()
        raise SystemExit(1)

    # See if archive folder already exists
    for out_dir in (OUTPUT_DIR, 'bookmarks', 'pocket', 'pinboard', 'html'):
        if os.path.exists(out_dir):
            break
    else:
        out_dir = OUTPUT_DIR

    # Step 0: Download url to local file (only happens if a URL is specified instead of local path)
    if source and any(
            source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        source = download_url(source)
    elif stdin_raw_text:
        source = save_source(stdin_raw_text)

    # Step 1: Parse the links and dedupe them with existing archive
    links = merge_links(archive_path=out_dir,
                        import_path=source,
                        only_new=ONLY_NEW)

    # Step 2: Write new index
    write_links_index(out_dir=out_dir, links=links)

    # Step 3: Verify folder structure is 1:1 with index
    # cleanup_archive(out_dir, links)

    # Step 4: Run the archive methods for each link
Code example #21
    def crawl_wikipedia(self):
        def check_a_node(a):
            if a and a.get('href', None):
                url = a['href']
                ignore_urls_starts = [
                    '/wiki/Wikipedia', '/wiki/Special', '/wiki/Category',
                    '/wiki/Template_talk',
                    '/wiki/Book', '/wiki/Template', '/wiki/Talk',
                    '/wiki/BookSources', '/wiki/File'
                ]
                if url.startswith('/wiki') and not url.split(
                        ':')[0] in ignore_urls_starts:
                    return True
            return False

        def make_absolute_wiki_url(url):
            return 'https://en.wikipedia.org' + url

        def prepare_url(url):
            return make_absolute_wiki_url(re.split(r'#', url)[0])

        start_time = time.time()
        current_page_url = self.start_url
        logging.getLogger('requests').setLevel(logging.WARNING)
        logging.debug('Starting to crawl page {}'.format(self.start_url))

        #headers = {'User-Agent': 'SearchingBot 0.1'}
        ok_url_count = 0
        error_url_count = 0
        url_number = 0
        links_to_crawl = deque()
        links_to_crawl.append(current_page_url)
        crawled_links = set()
        while True:
            url = links_to_crawl.popleft()
            if not url.startswith(
                    'https://en.wikipedia.org') or url in crawled_links:
                continue

            if (ok_url_count + error_url_count) % 100 == 0:
                logging.info("Crawled {} oks - {} errors".format(
                    ok_url_count, error_url_count))
            try:
                current_page = download_url(url)
                logging.debug('{}. 200: {}'.format(url_number, url))
            except Exception as e:
                status_code = e.message
                logging.warning('{}. {}: {}'.format(url_number, status_code,
                                                    url))
                continue
            url_number += 1

            soup = BeautifulSoup(current_page, 'html.parser')
            for tag in soup(['style', 'script']):
                tag.extract()

            links_to_crawl.extend([
                prepare_url(a['href']) for a in soup.find_all('a')
                if check_a_node(a)
            ])
            try:
                stored_text_file_name = os.path.join(self.storage_dir,
                                                     base64.b16encode(url))
                with open(stored_text_file_name, 'w') as storage_file:
                    storage_file.write(soup.get_text().encode('utf-8'))
                # time.sleep(2)
            except Exception as e:
                logging.error(u'Error occurred while crawling {}'.format(
                    current_page_url))
                logging.exception(e)
                error_url_count += 1
            ok_url_count += 1
            crawled_links.add(url)
            if ok_url_count >= self.urls_to_crawl:
                break
        logging.debug('Total time: {}'.format(time.time() - start_time))
Code example #22
def download_voc2007(root):
    path_devkit = os.path.join(root, 'VOCdevkit')
    path_images = os.path.join(root, 'VOCdevkit', 'VOC2007', 'JPEGImages')
    tmpdir = os.path.join(root, 'tmp')

    # create directory
    if not os.path.exists(root):
        os.makedirs(root)

    if not os.path.exists(path_devkit):

        if not os.path.exists(tmpdir):
            os.makedirs(tmpdir)

        parts = urlparse(urls['devkit'])
        filename = os.path.basename(parts.path)
        cached_file = os.path.join(tmpdir, filename)

        if not os.path.exists(cached_file):
            print('Downloading: "{}" to {}\n'.format(urls['devkit'],
                                                     cached_file))
            util.download_url(urls['devkit'], cached_file)

        # extract file
        print('[dataset] Extracting tar file {file} to {path}'.format(
            file=cached_file, path=root))
        cwd = os.getcwd()
        tar = tarfile.open(cached_file, "r")
        os.chdir(root)
        tar.extractall()
        tar.close()
        os.chdir(cwd)
        print('[dataset] Done!')

    # train/val images/annotations
    if not os.path.exists(path_images):

        # download train/val images/annotations
        parts = urlparse(urls['trainval_2007'])
        filename = os.path.basename(parts.path)
        cached_file = os.path.join(tmpdir, filename)

        if not os.path.exists(cached_file):
            print('Downloading: "{}" to {}\n'.format(urls['trainval_2007'],
                                                     cached_file))
            util.download_url(urls['trainval_2007'], cached_file)

        # extract file
        print('[dataset] Extracting tar file {file} to {path}'.format(
            file=cached_file, path=root))
        cwd = os.getcwd()
        tar = tarfile.open(cached_file, "r")
        os.chdir(root)
        tar.extractall()
        tar.close()
        os.chdir(cwd)
        print('[dataset] Done!')

    # test annotations
    test_anno = os.path.join(path_devkit,
                             'VOC2007/ImageSets/Main/aeroplane_test.txt')
    if not os.path.exists(test_anno):

        # download test annotations
        parts = urlparse(urls['test_images_2007'])
        filename = os.path.basename(parts.path)
        cached_file = os.path.join(tmpdir, filename)

        if not os.path.exists(cached_file):
            print('Downloading: "{}" to {}\n'.format(urls['test_images_2007'],
                                                     cached_file))
            util.download_url(urls['test_images_2007'], cached_file)

        # extract file
        print('[dataset] Extracting tar file {file} to {path}'.format(
            file=cached_file, path=root))
        cwd = os.getcwd()
        tar = tarfile.open(cached_file, "r")
        os.chdir(root)
        tar.extractall()
        tar.close()
        os.chdir(cwd)
        print('[dataset] Done!')

    # test images
    test_image = os.path.join(path_devkit, 'VOC2007/JPEGImages/000001.jpg')
    if not os.path.exists(test_image):

        # download test images
        parts = urlparse(urls['test_anno_2007'])
        filename = os.path.basename(parts.path)
        cached_file = os.path.join(tmpdir, filename)

        if not os.path.exists(cached_file):
            print('Downloading: "{}" to {}\n'.format(urls['test_anno_2007'],
                                                     cached_file))
            util.download_url(urls['test_anno_2007'], cached_file)

        # extract file
        print('[dataset] Extracting tar file {file} to {path}'.format(
            file=cached_file, path=root))
        cwd = os.getcwd()
        tar = tarfile.open(cached_file, "r")
        os.chdir(root)
        tar.extractall()
        tar.close()
        os.chdir(cwd)
        print('[dataset] Done!')
Code example #23
File: crawler.py Project: Anton-Baton/search_engine
	def crawl_wikipedia(self):

		def check_a_node(a):
			if a and a.get('href', None):
				url = a['href']
				ignore_urls_starts = ['/wiki/Wikipedia', '/wiki/Special', '/wiki/Category',
					'/wiki/Template_talk',
					'/wiki/Book', '/wiki/Template', '/wiki/Talk', '/wiki/BookSources', '/wiki/File']
				if url.startswith('/wiki') and not url.split(':')[0] in ignore_urls_starts:
					return True
			return False

		def make_absolute_wiki_url(url):
			return 'https://en.wikipedia.org' + url

		def prepare_url(url):
			return make_absolute_wiki_url(re.split(r'#', url)[0])

		start_time = time.time()
		current_page_url = self.start_url
		logging.getLogger('requests').setLevel(logging.WARNING)
		logging.debug('Starting to crawl page {}'.format(self.start_url))

		#headers = {'User-Agent': 'SearchingBot 0.1'}	
		ok_url_count = 0
		error_url_count = 0
		url_number = 0
		links_to_crawl = deque()
		links_to_crawl.append(current_page_url)
		crawled_links = set()
		while True:	
			url = links_to_crawl.popleft()
			if not url.startswith('https://en.wikipedia.org') or url in crawled_links:
				continue
			
			if (ok_url_count + error_url_count) % 100 == 0:
				logging.info("Crawled {} oks - {} errors".format(ok_url_count, error_url_count))
			try:
				current_page = download_url(url)
				logging.debug('{}. 200: {}'.format(url_number, url))
			except Exception as e:
				status_code = e.message
				logging.warning('{}. {}: {}'.format(url_number, status_code, url))
				continue
			url_number += 1

			soup = BeautifulSoup(current_page, 'html.parser')
			for tag in soup(['style', 'script']):
				tag.extract()

			links_to_crawl.extend(
				[prepare_url(a['href'])	for a in soup.find_all('a') if check_a_node(a)])
			try:					
				stored_text_file_name = os.path.join(self.storage_dir, base64.b16encode(url))
				with open(stored_text_file_name, 'w') as storage_file:
					storage_file.write(soup.get_text().encode('utf-8'))
				# time.sleep(2)
			except Exception as e:
				logging.error(u'Error occurred while crawling {}'.format(current_page_url))
				logging.exception(e)
				error_url_count += 1
			ok_url_count += 1
			crawled_links.add(url)
			if ok_url_count >= self.urls_to_crawl:
				break
		logging.debug('Total time: {}'.format(time.time() - start_time))
Code example #24
def download_errata_checksum():
    """ Download CentOS errata checksum from https://cefs.steve-meier.de/
    """
    res = get_url('https://cefs.steve-meier.de/errata.latest.sha1')
    return download_url(res, 'Downloading Errata Checksum:')
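
Note: the snippets above come from different projects whose download_url helpers have different interfaces. Most are fetch-to-file utilities called as util.download_url(url, destination_path), while patchman's version takes an already-opened response plus a progress message and returns the downloaded bytes (download_url(res, 'Downloading repo info:')). For reference, a minimal sketch of the fetch-to-file form, using only the Python standard library, could look like the following; this is an illustrative assumption, not the implementation used by any of the projects above.

import os
import urllib.request


def download_url(url, path):
    """Hypothetical fetch-to-file helper (illustrative sketch only).

    The real util.download_url used by the projects above may add
    progress reporting, retries, or size checks on top of this.
    """
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    # stream the remote resource straight into the destination file
    with urllib.request.urlopen(url) as response, open(path, 'wb') as out:
        out.write(response.read())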