Example #1
    def register_directory(self, dirpath, **kwargs):
        """
        Registers all of the files in the directory path
        """

        kwargs['file_extensions'] = kwargs.get("file_extensions",
                                               self.rdf_formats)
        files = list_files(file_directory=dirpath, **kwargs)
        for fileinfo in files:
            self.register_rml(fileinfo[-1], **kwargs)
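    # Usage sketch (added for illustration, not part of the original source):
    # assuming an instance `mapper` of this class with `rdf_formats` defined,
    # the call below would register every mapping file found under a
    # directory, optionally limited to specific extensions.
    #
    #     mapper.register_directory('/data/rml_maps',
    #                               file_extensions=['ttl', 'rml'],
    #                               include_subfolders=True)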
    def load_directory(self, directory, **kwargs):
        """ loads all rdf files in a directory

        args:
            directory: full path to the directory
        """
        log.setLevel(kwargs.get("log_level", self.log_level))
        conn = self.__get_conn__(**kwargs)
        file_extensions = kwargs.get('file_extensions', conn.rdf_formats)
        file_list = list_files(directory,
                               file_extensions,
                               kwargs.get('include_subfolders', False),
                               include_root=True)
        for file in file_list:
            self.load_file(file[1], **kwargs)
        log.setLevel(self.log_level)
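    # Usage sketch (illustrative; the instance name and paths are assumptions):
    # load every RDF file in a directory tree into the store, narrowing the
    # log level for the duration of the call.
    #
    #     store.load_directory('/data/rdf',
    #                          include_subfolders=True,
    #                          log_level=logging.INFO)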
    def bulk_load(self, **kwargs):
        """ Uploads data to the Blazegraph Triplestore that is stored in files
            that are in a local directory

            kwargs:
                file_directory: a string path to the file directory
                file_extensions: a list of file extensions to filter on,
                        for example ['xml', 'rdf']. If None, include all files
                include_subfolders: True to include files in subfolders
                namespace: the Blazegraph namespace to load the data into
                graph: URI of the graph to load the data into. Default is None
                create_namespace: False (default) or True; True will create
                        the namespace if it does not exist
                reset: True to reset the namespace before loading
                root_dir: root directory to be removed from the file paths
        """
        namespace = kwargs.get('namespace', self.namespace)
        graph = kwargs.get('graph', self.graph)
        if kwargs.get('reset'):
            self.reset_namespace()
        file_directory = kwargs.get('file_directory', self.local_directory)
        file_extensions = kwargs.get('file_extensions', self.rdf_formats)
        root_dir = kwargs.get('root_dir', self.local_directory)
        file_list = list_files(file_directory,
                               file_extensions,
                               kwargs.get('include_subfolders', True),
                               include_root=kwargs.get('include_root', False),
                               root_dir=root_dir)
        path_parts = [' ']
        if self.container_dir:
            path_parts.append(self.container_dir)
        file_or_dirs = ",".join([
            os.path.join(os.path.join(*path_parts), file[1])
            for file in file_list
        ]).strip()
        file_or_dirs = "/alliance_data"
        _params = BULK_LOADER_PARAMS.copy()
        params = {
            'namespace': kwargs.get('namespace', self.namespace),
            'file_or_dirs': file_or_dirs,
        }

        _params.update(params)
        time_start = datetime.datetime.now()

        log.info(" starting load of '%s' files into namespace '%s'",
                 len(file_list), params['namespace'])
        new_params = {key: json.dumps(value) \
                      for key, value in _params.items() \
                      if not isinstance(value, str)}
        new_params.update({key: value \
                           for key, value in _params.items() \
                           if isinstance(value, str)})
        data = BULK_LOADER_XML.format(**new_params)
        # data = BULK_LOADER_XML2
        url = os.path.join(self.url, 'dataloader')
        result = requests.post(url=url,
                               headers={"Content-Type": 'application/xml'},
                               data=data)
        failed_list = list_files(file_directory, ['fail'],
                                 kwargs.get('include_subfolders', True),
                                 include_root=kwargs.get(
                                     'include_root', False),
                                 root_dir=root_dir)
        failed_list = [file for file in failed_list \
                       if file[0].split(".")[-2] in file_extensions]
        good_list = list_files(file_directory, ['good'],
                               kwargs.get('include_subfolders', True),
                               include_root=kwargs.get('include_root', False),
                               root_dir=root_dir)
        log.info(" bulk_load results: %s\nThe following files successfully loaded: \n\t%s",
                 result.text,
                 "\n\t".join([os.path.splitext(file[1])[0] \
                              for file in good_list]))
        if failed_list:
            log.warning("The following files failed to load:\n\t%s",
                        "\n\t".join([file[1] for file in failed_list]))
            log.info(" Attempting load via alt method ***")
            for file in failed_list:
                os.rename(os.path.join(root_dir, file[1]),
                          os.path.join(root_dir,
                                       os.path.splitext(file[1])[0]))
                self.load_local_file(
                    os.path.splitext(file[1])[0], namespace, graph)
        # restore file names
        files = list_files(file_directory, ['good', 'fail'],
                           kwargs.get('include_subfolders', True),
                           include_root=kwargs.get('include_root', False),
                           root_dir=root_dir)
        for file in files:
            os.rename(os.path.join(root_dir, file[1]),
                      os.path.join(root_dir, os.path.splitext(file[1])[0]))
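    # Usage sketch (illustrative; the paths and namespace are assumptions):
    # run the Blazegraph bulk data loader against a directory that both the
    # Python process and the triplestore container can see, resetting the
    # target namespace before loading.
    #
    #     conn.bulk_load(file_directory='/example/python/data',
    #                    file_extensions=['ttl', 'nt'],
    #                    namespace='kb',
    #                    reset=True)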
    def load_directory(self, method='data_stream', **kwargs):
        """ Uploads data to the Blazegraph Triplestore that is stored in files
            that are in a local directory

            kwargs:
                method ('local' or 'data_stream'): 'local' uses the container
                        dir; 'data_stream' reads each file and sends it as
                        part of the HTTP request
                file_directory: a string path to the file directory to start
                        the search
                container_dir: the path that the triplestore container sees
                root_dir: root directory to be removed from the file paths
                        for example:
                              file_directory: this is as seen from python app
                                  /example/python/data/dir/to/search
                              container_dir: this is the path as seen from the
                                  triplestore
                                  /data
                              root_dir: the portion of the path to remove so
                                  both directories match
                                  /example/python/data
                file_extensions: a list of file extensions to filter on,
                        for example ['xml', 'rdf']. If None, include all files
                include_subfolders: True to include files in subfolders
                namespace: the Blazegraph namespace to load the data into
                graph: URI of the graph to load the data into. Default is None
                create_namespace: False (default) or True; True will create
                        the namespace if it does not exist
                use_threading (bool): whether to load the files in separate
                        threads
        """

        if kwargs.get('reset'):
            self.reset_namespace()
        namespace = kwargs.get('namespace', self.namespace)
        container_dir = kwargs.get('container_dir', self.container_dir)
        graph = kwargs.get('graph')
        time_start = datetime.datetime.now()
        include_root = kwargs.get('include_root', False)
        if method == 'data_stream':
            include_root = True
        file_directory = kwargs.get('file_directory', self.local_directory)
        file_extensions = kwargs.get('file_extensions', self.rdf_formats)
        file_list = list_files(file_directory,
                               file_extensions,
                               kwargs.get('include_subfolders', True),
                               include_root=include_root,
                               root_dir=kwargs.get('root_dir',
                                                   self.local_directory))
        log.info(" starting load of '%s' files into namespace '%s'",
                 len(file_list), self.namespace)
        if kwargs.get('create_namespace') and namespace:
            if not self.has_namespace(namespace):
                self.create_namespace(namespace)
        if not self.has_namespace(namespace):
            msg = "".join([
                "Namespace '%s' does not exist. " % namespace,
                "Pass kwarg 'create_namespace=True' to ",
                "auto-create the namespace."
            ])
            raise ValueError(msg)
        for file in file_list:
            if kwargs.get('use_threading'):
                if method == 'data_stream':
                    th = threading.Thread(name=file[1],
                                          target=self.load_data,
                                          args=(
                                              file[1],
                                              None,
                                              namespace,
                                              graph,
                                              True,
                                          ))
                else:
                    th = threading.Thread(name=file[1],
                                          target=self.load_local_file,
                                          args=(
                                              file[1],
                                              namespace,
                                              graph,
                                          ))
                th.start()
            else:
                if method == 'data_stream':
                    self.load_data(data=file[1],
                                   namespace=namespace,
                                   graph=graph,
                                   is_file=True)
                else:
                    self.load_local_file(file[1], namespace, graph)
        if kwargs.get('use_threading'):
            main_thread = threading.main_thread()
            for t in threading.enumerate():
                if t is main_thread:
                    continue
                t.join()
        log.info("%s file(s) loaded in: %s", len(file_list),
                 datetime.datetime.now() - time_start)
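    # Usage sketch (illustrative; the directory layout and namespace are
    # assumptions): stream each file's contents over HTTP in separate threads,
    # or pass method='local' when the triplestore container mounts the same
    # data directory.
    #
    #     conn.load_directory(method='data_stream',
    #                         file_directory='/example/python/data',
    #                         namespace='kb',
    #                         create_namespace=True,
    #                         use_threading=True)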