def read(self):
    """Read the input at ``self.path`` and return the chosen reader's result.

    The reader is selected by two checks: whether ``self.hdfs_client`` is
    ``None`` (local filesystem) or set (HDFS), and whether ``self.path`` is
    a directory on that filesystem.

    Returns:
        Whatever the selected private reader returns, passed through
        unchanged.
    """
    if self.hdfs_client is None:
        # No HDFS client: treat self.path as a local filesystem path.
        # os.path.isdir() is False for paths that do not exist, so those
        # are handed to the single-file reader.
        if os.path.isdir(self.path):
            return self.__read_directory_localfs()
        return self.__read_file_path_localfs(filename=self.path)

    # An HDFS client is set: resolve self.path through it.
    if is_hdfs_directory(self.hdfs_client, self.path):
        return self.__read_directory_hdfs()
    return self.__read_file_path_hdfs(filename=self.path)
def __read_files_hdfs(self, dataset, read_docs=None):
    """Read annotation files from HDFS and collect what the parser returns.

    If ``self.directory`` is an HDFS directory, it is walked and only files
    whose name ends with ``.ann.json`` are read. Otherwise ``self.directory``
    itself is read as a single file, with no suffix check.

    Args:
        dataset: Passed through to ``__read_annjson`` for every file.
        read_docs: Optional set that receives one entry per file read. It is
            updated in place; a new set is created when ``None``.

    Returns:
        The ``read_docs`` set, holding the value ``__read_annjson`` returned
        for each file (presumably a document id -- confirm against
        ``__read_annjson``).
    """
    collected = set() if read_docs is None else read_docs

    def _is_annjson(fname):
        return fname.endswith(".ann.json")

    client = self.hdfs_client
    root = self.directory
    targets = (
        walk_hdfs_directory(client, root, _is_annjson)
        if is_hdfs_directory(client, root)
        else [root]
    )

    for target in targets:
        with self.hdfs_client.read(target, encoding="utf-8") as stream:
            collected.add(self.__read_annjson(stream, target, dataset))

    return collected