Example #1
0
 def _close(self):
     """Copy the buffered local file back to the KB path, then close it.

     The buffer is rewound first so the whole content is copied, and the
     local file is always closed, even when the copy fails.
     """
     try:
         self.local_file.seek(0)
         with fsopen(self.kb_path, mode='wb') as kb_file:
             copy_file(self.local_file, kb_file)
     finally:
         self.local_file.close()
Example #2
0
    def _download_to_docs_or_figs(
            self,
            document=None,
            figure=None,
            src_records=(),
            only_new=False,
    ):
        """Download a document or a figure and attach it to the record.

        Args:
            document (dict): metadata with at least ``url`` and ``key``;
                mutually exclusive with ``figure`` (one must be passed).
            figure (dict): figure metadata, same shape as ``document``.
            src_records: records forwarded to ``_resolve_doc_or_fig_url``.
            only_new (bool): forwarded to ``_resolve_doc_or_fig_url``.

        Returns:
            The result of ``add_document_or_figure``.

        Raises:
            TypeError: if neither ``document`` nor ``figure`` is passed.
        """
        from urllib.parse import unquote, urlparse

        if not document and not figure:
            raise TypeError(
                'No document nor figure passed, at least one is needed.')

        is_document = bool(document)
        doc_or_fig_obj = self._resolve_doc_or_fig_url(
            doc_or_fig_obj=document or figure,
            src_records=src_records,
            only_new=only_new,
        )
        # Urls under /api/files/ are already stored locally, so no stream
        # needs to be opened: attach the metadata directly.
        if doc_or_fig_obj['url'].startswith('/api/files/'):
            return self.add_document_or_figure(
                metadata=doc_or_fig_obj,
                key=doc_or_fig_obj['key'],
                is_document=is_document,
            )

        key = doc_or_fig_obj['key']
        if key not in self.files:
            key = self._get_unique_files_key(base_file_name=key)

        url = doc_or_fig_obj['url']
        # file:// urls may be percent-encoded, which would make the open
        # below fail on the local filesystem; decode them first.
        if urlparse(url).scheme == 'file':
            url = unquote(url)

        stream = fsopen(url, mode='rb')
        return self.add_document_or_figure(
            metadata=doc_or_fig_obj,
            key=key,
            stream=stream,
            is_document=is_document,
        )
Example #3
0
def file_opener(path, mode='r'):
    """Open ``path`` with the pyfilesystem opener.

    :param path: (str) full path of the file to open.
    :param mode: (str) mode in which to open the file.
    :return: a file-like object for ``path``.
    """
    return fsopen(path, mode=mode)
Example #4
0
 def _close(self):
     """Persist the buffered local file to ``self.kb_path`` and release it."""
     try:
         # Start the copy from the beginning of the buffered file.
         self.local_file.seek(0)
         kb_file = fsopen(self.kb_path, mode='wb')
         try:
             copy_file(self.local_file, kb_file)
         finally:
             kb_file.close()
     finally:
         # The local buffer is closed no matter what happened above.
         self.local_file.close()
Example #5
0
def retrieve_uri(uri, outdir=None):
    """Retrieves the given uri and stores it in a temporary file.

    Yields the temporary file's name; the file is removed when the
    generator resumes and the ``with`` block exits.

    Args:
        uri (str): uri understood by ``fsopen`` (local path or remote url).
        outdir (str): directory for the temporary file; system default
            when ``None``.

    NOTE(review): generator-style resource handling — presumably wrapped
    with ``@contextmanager`` at the (not visible) definition site; confirm.
    """
    with tempfile.NamedTemporaryFile(prefix='inspire', dir=outdir) as local_file, \
            fsopen(uri, mode='rb') as remote_file:
        copy_file(remote_file, local_file)

        # Flush so consumers that open the file by name see the full content.
        local_file.flush()
        yield local_file.name
Example #6
0
def retrieve_uri(uri, outdir=None):
    """Retrieves the given uri and stores it in a temporary file."""
    with tempfile.NamedTemporaryFile(prefix='inspire', dir=outdir) as tmp_file:
        with fsopen(uri, mode='rb') as source:
            copy_file(source, tmp_file)
            # Flush so readers opening the file by name see the content.
            tmp_file.flush()
            yield tmp_file.name
Example #7
0
def retrieve_uri(uri, outdir=None):
    """Retrieve ``uri`` into a named temporary file and return its path.

    The file is created with ``delete=False``: the caller owns the
    returned path and is responsible for removing it.

    Args:
        uri (str): uri understood by ``fsopen``.
        outdir (str): directory for the temporary file; system default
            when ``None``.

    Returns:
        str: path of the temporary file holding the retrieved content.
    """
    import os

    local_file = tempfile.NamedTemporaryFile(
        prefix='inspire',
        dir=outdir,
        delete=False,
    )

    try:
        with fsopen(uri, mode='rb') as remote_file:
            copy_file(remote_file, local_file)

    except Exception:
        # Don't leak the temporary file when the retrieval fails; the
        # caller never learns its name, so nobody else can clean it up.
        local_file.close()
        os.remove(local_file.name)
        raise
    finally:
        # Closing twice (after the except branch) is a harmless no-op.
        local_file.close()

    return local_file.name
Example #8
0
    def _download_file_from_url(self, url):
        """Downloads file and calculates hash for it

        If everything is ok then adds it to files in current record.

        If file with same hash already found in db, tries to use this one

        instead of creating duplicate (uses `ObjectVersion.copy()` method)

        Args:
            url (str): Local or remote url/filepath
        Returns:
            str: key(sha-1) of downloaded file
        Raises:
            ValueError: can be raised in `self.hash_data` method if no data is provided

        Example:
            >>> self._download_file_from_url('http://example.com/url_to_file.pdf')
            '207611e7bf8a83f0739bb2e16a1a7cf0d585fb5f'
        """
        stream = fsopen(url, mode="rb")
        # TODO: change to stream.read() when fs will be updated to >= 2.0
        # As HTTPOpener is not working with size = -1
        # (and read() method sets this size as default)
        # This is workaround until we will update to fs >2.0
        # NOTE(review): reaches into fs internals (`_f.wrapped_file`);
        # this will break if the fs library changes its wrapper layout.
        data = stream._f.wrapped_file.read()
        # Content-addressed key: identical payloads hash to the same key.
        key = self.hash_data(data=data)
        if key not in self.files.keys:
            # Not attached to this record yet; prefer copying an existing
            # local file with the same hash over storing the bytes again.
            file = self._find_local_file(key=key)
            new_key = None
            if file:
                LOGGER.debug("Same file found locally, trying to copy",
                             uuid=self.id)
                # Best-effort copy: on failure fall through to a fresh add.
                try:
                    new_key = self._copy_local_file(file, key)
                except ValueError:
                    pass
                except AttributeError:
                    pass
            if not new_key:
                LOGGER.debug("Adding file to record", key=key, uuid=self.id)
                self.files[key] = BytesIO(data)
        else:
            LOGGER.debug("File already attached to record",
                         key=key,
                         uuid=self.id)
        return key
Example #9
0
def extract_world_archive(event, context, flog):
    """Extract a user's world archive from S3 back into the same bucket.

    Args:
        event (dict): event payload with ``bucket.name`` and ``object.key``
            describing the uploaded archive.
        context: execution context (unused; kept for the handler signature).
        flog: structured logger exposing ``info``/``debug``.

    Raises:
        ValueError: if the archive contains no ``level.dat`` file.
    """
    flog.info('Starting world archive extraction...')

    bucket_name = event['bucket']['name']
    object_key = event['object']['key']

    flog.debug('Event object: %s::%s', bucket_name, object_key)

    # TODO: error handling
    # The object's base file name encodes the world's API key.
    api_key = os.path.splitext(os.path.split(object_key)[1])[0]
    world = World.select().where(World.api_key == api_key).get()
    user = world.user

    flog.info('Extracting for user::world: %s:%s', user.guid, world.guid)

    object_fd = fsopen('s3://{bucket}/{key}'.format(
        bucket=bucket_name,
        key=object_key,
    ), 'rb')
    archive_fs = ZipFS(object_fd, 'r')
    dest_fs = fsopendir('s3://{bucket}/'.format(bucket=bucket_name))
    dest_prefix = 'worlds/{user_guid}/{world_guid}/'.format(
        user_guid=user.guid,
        world_guid=world.guid,
    )

    # Locate the first level.dat; fail loudly instead of hitting an
    # unrelated NameError below when the archive does not contain one.
    level_dat_fn = None
    for fn in archive_fs.walkfiles(wildcard='level.dat'):
        level_dat_fn = fn
        break
    if level_dat_fn is None:
        raise ValueError(
            'No level.dat found in archive: {bucket}::{key}'.format(
                bucket=bucket_name, key=object_key))
    flog.debug('Found level.dat at: %s', level_dat_fn)

    # Re-root the archive at the directory holding level.dat so the
    # region paths used below stay relative to the world root.
    archive_fs = archive_fs.opendir(os.path.dirname(level_dat_fn))

    flog.info('Extracting level.dat')
    # TODO: make sure these paths are actually safe
    dest_fs.setcontents(
        safe_path_join(dest_prefix, 'level.dat'),
        archive_fs.getcontents('level.dat'))
    for region_fn in archive_fs.walkfiles(wildcard='*.mca'):
        flog.info('Extracting file: %s', region_fn)
        dest_fs.setcontents(
            safe_path_join(dest_prefix, region_fn),
            archive_fs.getcontents(region_fn))

    flog.info('Finished world archive extraction')
Example #10
0
def render_region_heightmap(event, context, flog):
    """Render a 512x512 grayscale heightmap PNG for one region file."""
    bucket = event['bucket']['name']
    region_key = event['object']['key']

    output_fs = fsopendir('s3://{bucket}/'.format(bucket='quarry-output'))
    output_fn = os.path.join('heightmaps', *region_key.split('/')[1:]) + '.png'
    region = RegionFile(fileobj=fsopen('s3://{bucket}/{key}'.format(
        bucket=bucket, key=region_key), 'rb'))

    heightmap = Image.new('L', (512, 512))

    # Each chunk contributes a 16x16 tile at its (x, z) offset.
    for chunk in region.get_metadata():
        nbt_data = region.get_nbt(chunk.x, chunk.z)
        tile = numpy.array(
            nbt_data['Level']['HeightMap'],
            dtype=numpy.uint8,
        ).reshape((16, 16))
        heightmap.paste(
            Image.fromarray(tile),
            box=(chunk.x * 16, chunk.z * 16),
        )

    with output_fs.open(output_fn, 'wb') as image_handle:
        heightmap.save(image_handle, format='PNG')
Example #11
0
    def _download_to_docs_or_figs(
        self,
        document=None,
        figure=None,
        src_records=(),
        only_new=False,
    ):
        """Download a document or a figure and attach it to the record.

        Args:
            document (dict): metadata with at least ``url`` and ``key``;
                mutually exclusive with ``figure`` (one must be passed).
            figure (dict): figure metadata, same shape as ``document``.
            src_records: records forwarded to ``_resolve_doc_or_fig_url``.
            only_new (bool): forwarded to ``_resolve_doc_or_fig_url``.

        Returns:
            The result of ``add_document_or_figure``.

        Raises:
            TypeError: if neither ``document`` nor ``figure`` is passed.
        """
        if not document and not figure:
            raise TypeError(
                'No document nor figure passed, at least one is needed.'
            )

        is_document = bool(document)
        doc_or_fig_obj = self._resolve_doc_or_fig_url(
            doc_or_fig_obj=document or figure,
            src_records=src_records,
            only_new=only_new,
        )
        # Urls under /api/files/ are already stored locally, so no stream
        # needs to be opened: attach the metadata directly.
        if doc_or_fig_obj['url'].startswith('/api/files/'):
            return self.add_document_or_figure(
                metadata=doc_or_fig_obj,
                key=doc_or_fig_obj['key'],
                is_document=is_document,
            )

        key = doc_or_fig_obj['key']
        if key not in self.files:
            key = self._get_unique_files_key(base_file_name=key)

        url = doc_or_fig_obj['url']
        scheme = urlparse(url).scheme
        # file:// urls may be percent-encoded, which would make the open
        # below fail on the local filesystem; decode them first.
        if scheme == 'file':
            url = unquote(url)

        stream = fsopen(url, mode='rb')
        return self.add_document_or_figure(
            metadata=doc_or_fig_obj,
            key=key,
            stream=stream,
            is_document=is_document,
        )
Example #12
0
def upload(service):
    """Copy the session's pending files into the filesystem for *service*.

    Reads the list of files from the form (stashing it in the session),
    copies each one from its bibdocfile location into the service
    filesystem, flashes a status message, and redirects back.
    """
    if 'files' in request.form:
        session['return_url'] = request.form['return_url']
        files = request.form['files']
        # The form sends a stringified Python list, e.g. "['a', 'b']":
        # strip the surrounding "['" / "']" and split on the separator.
        session['files_to_upload'] = files[2:-2].split("', '")

    filesystem = _build_file_system(service)

    files = session.pop('files_to_upload')
    from invenio.legacy.bibdocfile.api import bibdocfile_url_to_bibdocfile

    try:
        for one in files:
            docfile = bibdocfile_url_to_bibdocfile(one)
            # Context managers guarantee both handles are closed even on
            # error (the original leaked the source handle entirely).
            with fsopen(docfile.get_full_path(), 'r') as source:
                with filesystem.open(docfile.get_full_name(), "w") as target:
                    target.write(source.read())
        flash("All files uploaded successfully", 'info')
    except Exception:
        # Narrowed from a bare except, which also swallowed
        # KeyboardInterrupt/SystemExit; best-effort behavior is kept.
        flash("Something went wrong, please try again", 'error')

    return redirect(session.pop('return_url'))
Example #13
0
def copy_file_to_workflow(workflow, name, url):
    """Open the percent-decoded ``url`` and store the stream on the workflow.

    Returns the stored file object, read back from ``workflow.files``.
    """
    decoded_url = unquote(url)
    workflow.files[name] = fsopen(decoded_url, mode='rb')
    return workflow.files[name]
Example #14
0
def get_source(spec, cache_fs,  account_accessor=None, clean=False, logger=None, cwd=None, callback=None):
    """
    Download a file from a URL and return it wrapped in a row-generating accessor object.

    :param cwd: Current working directory, for relative file:: urls.
    :param spec: A SourceSpec that describes the source to fetch.
    :param cache_fs: A pyfilesystem filesystem to use for caching downloaded files.
    :param account_accessor: A callable to return the username and password to use for access FTP and S3 URLs.
    :param clean: Delete files in cache and re-download.
    :param logger: A logger, for logging.
    :param callback: A callback, called while reading files in download. signature is f(read_len, total_len)

    :return: a SourceFile object.
    """
    from fs.zipfs import ZipOpenError
    import os

    # FIXME. urltype should be moved to reftype.
    url_type = spec.get_urltype()

    def do_download():
        # Single place to (re-)run the download; also used by the zip retry.
        return download(spec.url, cache_fs, account_accessor, clean=clean, logger=logger, callback=callback)

    if url_type == 'file':

        from fs.opener import fsopen

        # Strip the scheme and mirror the file into the cache filesystem
        # under a flattened path name.
        syspath = spec.url.replace('file://', '')
        cache_path = syspath.replace('/', '_').strip('_')

        fs_path = os.path.join(cwd, syspath)

        # Close the source handle after reading (the original leaked it).
        with fsopen(fs_path) as source_file:
            contents = source_file.read()
        cache_fs.setcontents(cache_path, contents)

    elif url_type not in ('gs', 'socrata'):  # FIXME. Need to clean up the logic for gs types.
        try:
            cache_path, download_time = do_download()
            spec.download_time = download_time
        except HTTPError as e:
            raise DownloadError("Failed to download {}; {}".format(spec.url, e))
    else:
        cache_path, download_time = None, None

    if url_type == 'zip':
        try:
            fstor = extract_file_from_zip(cache_fs, cache_path, spec.url, spec.file)
        except ZipOpenError:
            # The cached archive may be corrupt; drop it and try once more
            # with a fresh download.
            cache_fs.remove(cache_path)
            cache_path, spec.download_time = do_download()
            fstor = extract_file_from_zip(cache_fs, cache_path, spec.url, spec.file)

        file_type = spec.get_filetype(fstor.path)

    elif url_type == 'gs':
        fstor = get_gs(spec.url, spec.segment, account_accessor)
        file_type = 'gs'

    elif url_type == 'socrata':
        # Socrata exports: utf8 CSV-like data with a single header row.
        spec.encoding = 'utf8'
        spec.header_lines = [0]
        spec.start_line = 1
        url = SocrataSource.download_url(spec)
        fstor = DelayedDownload(url, cache_fs)
        file_type = 'socrata'

    else:
        fstor = DelayedOpen(cache_fs, cache_path, 'rb')
        file_type = spec.get_filetype(fstor.path)

    spec.filetype = file_type

    # Map the detected file type to the SourceFile subclass that knows how
    # to generate rows from it.
    TYPE_TO_SOURCE_MAP = {
        'gs': GoogleSource,
        'csv': CsvSource,
        'tsv': TsvSource,
        'fixed': FixedSource,
        'txt': FixedSource,
        'xls': ExcelSource,
        'xlsx': ExcelSource,
        'partition': PartitionSource,
        'shape': ShapefileSource,
        'socrata': SocrataSource
    }

    cls = TYPE_TO_SOURCE_MAP.get(file_type)
    if cls is None:
        raise SourceError(
            "Failed to determine file type for source '{}'; unknown type '{}' "
            .format(spec.name, file_type))

    return cls(spec, fstor)
Example #15
0
    def _flush(self):
        """Write the buffered data to the KB file and reset the buffer."""
        payload = ''.join(self.data_buffer)
        with fsopen(self.kb_path, mode='wb') as kb_file:
            kb_file.write(payload)

        # Start over with an empty buffer for the next batch of writes.
        self.data_buffer = []
Example #16
0
def copy_file_to_workflow(workflow, name, url):
    """Open the file at ``url`` and attach its stream to the workflow.

    Args:
        workflow: workflow object exposing a ``files`` mapping.
        name (str): key under which the stream is stored.
        url (str): possibly percent-encoded url or filesystem path.

    Returns:
        The stored file object, read back from ``workflow.files``.
    """
    # Decode %xx escapes so local paths with spaces etc. open correctly.
    url = unquote(url)
    stream = fsopen(url, mode='rb')
    workflow.files[name] = stream
    return workflow.files[name]