Example no. 1
    def on_moved(self, event):
        """ for non-empty dir: the db records of its files and subdirectories are also modified
        the dir file structure does not exist locally anymore so need to find it through calls to HDFS to update all
        related db records (local + remote path)
        for link: the corresponding remote file should be moved + the path in the link file should be changed
        """
        print("on_moved")
        try:
            rem_src_path = self.lc.get_remote_file_path(event.src_path)
            tmp = customize_path(
                self.local.remotePath,
                remove_prefix(self.local.localPath, event.dest_path))
            rem_dest_path = rm_link_extension(tmp)
            existing_main_obj = MRBoxObject(event.dest_path,
                                            self.local.localFileLimit,
                                            rem_src_path)
            if rem_src_path is not None:

                if existing_main_obj.is_dir():
                    # get all the remote paths of the files in the dir
                    remote_src_paths = self.hadoop.find_remote_paths(
                        rem_src_path)
                    local_remote_tuples = []
                    for rp in remote_src_paths:
                        if rp == rem_src_path:
                            local_remote_tuples.append(
                                (rp, event.dest_path, rem_dest_path))
                        else:
                            file_hierarchy = remove_prefix(rem_src_path, rp)
                            new_remote_path = customize_path(
                                rem_dest_path, file_hierarchy)
                            loc_type = self.lc.get_loc_type_by_remote_path(rp)
                            new_local_path = to_link(
                                customize_path(event.dest_path,
                                               file_hierarchy), loc_type)
                            local_remote_tuples.append(
                                (rp, new_local_path, new_remote_path))
                            # modify links' content
                            existing_obj = MRBoxObject(
                                new_local_path, self.local.localFileLimit, rp)
                            existing_obj.replace_loc_content(new_remote_path)
                    self.lc.update_by_remote_path(local_remote_tuples)

                else:
                    existing_main_obj.replace_loc_content(rem_dest_path)
                    self.lc.update_by_remote_path([
                        (rem_src_path, event.dest_path, rem_dest_path)
                    ])

                self.hadoop.mv(rem_src_path, rem_dest_path)
        except FileNotFoundError:
            print("Move already handled!")
Example no. 2
    def find_remote_paths(self, starting_path):
        """
        :param starting_path: the remote path of the file / dir whose local copy was moved or deleted
        :return: list of remote paths of all dirs + files under starting_path, including starting_path itself
        """
        print("find_remote_paths")
        list_of_paths = [starting_path]
        for sp, subdir, files in self.walk(starting_path):
            for name in subdir:
                list_of_paths.append(customize_path(sp, name))
            for name in files:
                list_of_paths.append(customize_path(sp, name))
        return list_of_paths
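
For illustration, assuming self.walk yields (path, subdirectories, files) triples over the remote tree, as the loop above expects, a made-up example of the result:

# Hypothetical remote tree:
#   /mrbox/exp
#   /mrbox/exp/sub
#   /mrbox/exp/sub/part-00000
# find_remote_paths("/mrbox/exp") would then return a list containing
# "/mrbox/exp", "/mrbox/exp/sub" and "/mrbox/exp/sub/part-00000",
# i.e. the starting path itself plus every dir and file below it.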
Example no. 3
    def create_locally_synced_dir(self, cmd, lc, mrbox_dir):  # ok!
        """
        Creates a dir on hdfs by running the cmd command and creates a copy of it locally
        :param cmd: bash command to create hdfs dir
        :param lc: sqlite3 db class instance
        :param mrbox_dir: MRBox file object with the info regarding the dir that will be created locally + on HDFS
        :return:
        """
        # create on hdfs --> tracked file: put on db --> create locally
        print("create locally synced dir")
        subprocess.run(cmd, shell=True, check=True)
        hdfs_chk = hdfs_file_checksum(self.hadoopPath, mrbox_dir.remotePath,
                                      mrbox_dir.remoteType)
        lc.insert_tuple_hdfs(mrbox_dir.localPath, mrbox_dir.remotePath,
                             hdfs_chk, mrbox_dir.localType)
        os.mkdir(
            mrbox_dir.localPath
        )  # creates an empty directory of hdfs outputs locally, triggers on_created()
        print("folder created")

        for rp in self.ls(mrbox_dir.remotePath):
            hdfs_chk = hdfs_file_checksum(self.hadoopPath, rp, 'file')
            file_size = hdfs_file_size(self.hadoopPath, rp)
            f = remove_prefix(mrbox_dir.remotePath, rp)
            lp = customize_path(mrbox_dir.localPath, f)
            mrbox_file = MRBoxObject(lp, mrbox_dir.localFileLimit, rp,
                                     file_size, 'file')

            # todo: insert in batch
            lc.insert_tuple_hdfs(mrbox_file.localPath, mrbox_file.remotePath,
                                 hdfs_chk, mrbox_file.localType)
            mrbox_file.file_info()
            self.get(mrbox_file)
Example no. 4
    def on_created(self, event):
        """ Creates dir / file on HDFS & adds mapping with mapping between local + hdfs path in the local db
        If created file is .yaml issues a MR job"""
        print("on_created")

        if self.lc.check_local_path_exists(event.src_path):
            print("file/dir already exists on hdfs - mapped on db")
            remote_file_path = self.lc.get_remote_file_path(event.src_path)
            obj = MRBoxObject(event.src_path, self.local.localFileLimit,
                              remote_file_path)  # do we want remote file size?
            # obj.file_info()
            # update needed to insert the loc_chk in existent db record
            # in case of link: loc_chk != hdfs_chk
            loc_chk = crc32c_file_checksum(obj.localPath, obj.localType)
            self.lc.update_tuple_local(obj.localPath, loc_chk)
        else:
            print("file/dir needs to be created on hdfs - not mapped on db")
            filename = remove_prefix(self.local.localPath, event.src_path)
            remote_file_path = customize_path(self.local.remotePath, filename)
            obj = MRBoxObject(event.src_path, self.local.localFileLimit,
                              remote_file_path)
            # obj.file_info()
            loc_chk = crc32c_file_checksum(obj.localPath, obj.localType)
            self.lc.insert_tuple_local(obj.localPath, obj.remotePath, loc_chk,
                                       obj.localType)

        if not self.hadoop.exists(remote_file_path) and obj.is_dir():
            print("creating dir on hdfs")
            self.hadoop.mkdir(remote_file_path)
            hdfs_chk = hdfs_file_checksum(self.hadoop.hadoopPath,
                                          obj.remotePath, obj.localType)
            self.lc.update_tuple_hdfs(obj.localPath, hdfs_chk)

        if not self.hadoop.exists(remote_file_path) and obj.is_file():
            print("creating file on hdfs")
            self.hadoop.put(obj.localPath, obj.remotePath)
            hdfs_chk = hdfs_file_checksum(self.hadoop.hadoopPath,
                                          obj.remotePath, obj.localType)
            self.lc.update_tuple_hdfs(obj.localPath, hdfs_chk)

        # if it is a link, it already exists

        # compare_local_hdfs_copy(self.lc, event.src_path)

        if obj.is_file() and event.src_path.endswith('.yaml'):
            self.issue_mr_job(obj.localPath)
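
The on_moved / on_created methods in Examples no. 1 and no. 4 follow the watchdog event-handler interface (event.src_path, event.dest_path). Below is a minimal, self-contained sketch of how such a handler is typically attached to an observer; DemoHandler and the watched path are illustrative placeholders, not names taken from the project.

import os
import time

from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer


class DemoHandler(FileSystemEventHandler):  # stand-in for the project's handler class
    def on_created(self, event):
        print("on_created", event.src_path)

    def on_moved(self, event):
        print("on_moved", event.src_path, "->", event.dest_path)


watched = "/tmp/mrbox-demo"  # hypothetical local folder to watch
os.makedirs(watched, exist_ok=True)

observer = Observer()
observer.schedule(DemoHandler(), path=watched, recursive=True)
observer.start()
try:
    time.sleep(10)  # in this demo, watch for a few seconds only
finally:
    observer.stop()
    observer.join()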
Example no. 5
def hdfs_file_size(hadoop_path, hdfs_filepath):  # todo: how to handle dirs
    """
    Returns the size of a hadoop file in bytes
    :param hadoop_path:
    :param hdfs_filepath:
    :return:
    """
    cmd_hdfs_file_size = customize_path(
        hadoop_path, 'bin/hdfs') + " dfs -ls " + hdfs_filepath
    res = subprocess.run(cmd_hdfs_file_size,
                         shell=True,
                         check=True,
                         capture_output=True,
                         text=True)
    res = res.stdout
    file_size = res.split()[4]
    print("HDFS file size in bytes: " + file_size)
    return int(file_size)
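
The size is taken from the fifth whitespace-separated column of the `hdfs dfs -ls` output; a small illustration of that parsing on a made-up listing line:

# Made-up `hdfs dfs -ls <file>` output line; real values depend on the cluster.
sample = "-rw-r--r--   1 user supergroup       1234 2024-01-01 12:00 /mrbox/data.txt"
print(int(sample.split()[4]))  # -> 1234, the size column used above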
Example no. 6
    def issue_mr_job(self, filepath):
        """
        Called when a .yaml file is created.
        Reads the mapper, reducer, input and output paths from the yaml file and
        checks that the mapper, reducer and input exist locally.
        Issues the MR job.
        :param filepath: the path of the created yaml file; all paths specified in it are local
        :return:
        """
        print("issue_mr_job")

        with open(filepath, 'r') as f:
            data = yaml.load(f, Loader=yaml.FullLoader)
            mapper_path = data.get('mapper')
            reducer_path = data.get('reducer')
            input_path = customize_path(self.local.localPath,
                                        data.get('input'))
            print("input_path: " + input_path)
            output_path = data.get('output')

        # check that the files exist locally
        for f in [mapper_path, reducer_path, input_path]:
            if not os.path.exists(f):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT), f)

        # to issue MR job, the input file should be on hdfs --> need to get the remote path
        # input_path is already prefixed with the local mrbox path above, so use it directly
        hdfs_input_path = self.lc.get_remote_file_path(input_path)
        print("hdfs_input_path: " + hdfs_input_path)

        # need to generate local + remote output paths
        local_output_path = customize_path(self.local.localPath, output_path)
        hdfs_output_path = customize_path(self.local.remotePath, output_path)

        # issue MR job
        cmd_mr = customize_path(self.hadoop.hadoopPath, 'bin/hadoop') + " jar " \
                 + customize_path(self.hadoop.hadoopPath, 'share/hadoop/tools/lib/hadoop-streaming-3.2.0.jar') \
                 + " -files " + mapper_path + "," + reducer_path + " -mapper 'mapper.py'" + " -reducer 'reducer.py'" \
                 + " -input " + hdfs_input_path + " -output " + hdfs_output_path

        try:
            output_dir = MRBoxObject(local_output_path,
                                     self.local.localFileLimit,
                                     hdfs_output_path,
                                     remote_file_type='dir')
            self.hadoop.create_locally_synced_dir(cmd_mr, self.lc, output_dir)
        except subprocess.CalledProcessError as e:
            print("Map-Reduce job failed!")
            print(e.output)
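
For reference, a minimal sketch of the job spec that issue_mr_job() expects; the key names come from the data.get(...) calls above, while the paths and values are purely illustrative:

import yaml

job_spec = {
    'mapper': '/home/user/jobs/mapper.py',    # local path to the mapper script
    'reducer': '/home/user/jobs/reducer.py',  # local path to the reducer script
    'input': 'dataset',                       # resolved relative to the local mrbox folder
    'output': 'job-output',                   # output dir created locally and on HDFS
}
with open('wordcount.yaml', 'w') as f:        # dropping the .yaml into the watched folder
    yaml.dump(job_spec, f)                    # triggers on_created() -> issue_mr_job()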
Example no. 7
def hdfs_file_checksum(hadoop_path, hdfs_filepath, ftype):
    """
    Computes the checksum of a file on hdfs
    :param hadoop_path: where hadoop is installed locally
    :param hdfs_filepath: the path of the file on hdfs
    :param ftype: the type of the local copy of the file ('dir', 'file', 'link')
    :return: the COMPOSITE-CRC32C checksum as a string, or None if ftype is 'dir'
    """
    if ftype == 'dir':
        return None
    cmd_hdfs_chk = customize_path(hadoop_path, 'bin/hdfs') + \
                   " dfs -Ddfs.checksum.combine.mode=COMPOSITE_CRC -checksum " + hdfs_filepath
    res = subprocess.run(cmd_hdfs_chk,
                         shell=True,
                         check=True,
                         capture_output=True,
                         text=True)
    res = res.stdout
    prefix = hdfs_filepath + "\t" + "COMPOSITE-CRC32C\t"
    return res[len(prefix):].rstrip("\n")
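
The parsing above assumes the usual `hdfs dfs -checksum` output layout of path, algorithm name and checksum separated by tabs; a small illustration with a made-up checksum value:

# Made-up output line in the "<path>\tCOMPOSITE-CRC32C\t<checksum>" layout assumed above.
hdfs_filepath = "/mrbox/data.txt"
res = hdfs_filepath + "\tCOMPOSITE-CRC32C\t" + "b1946ac9" + "\n"
prefix = hdfs_filepath + "\t" + "COMPOSITE-CRC32C\t"
print(res[len(prefix):].rstrip("\n"))  # -> "b1946ac9"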
Example no. 8
    app_name = "mrbox"
    config_file = app_name + ".conf"
    config_folder = os.path.dirname(os.path.realpath(__file__))
    config_filepath = os.path.join(config_folder, config_file)

    # check if a configuration file exists in current path
    if not os.path.exists(config_filepath):
        print("No .conf file is found in %s." % config_folder)
        sys.exit(1)

    # read from mrbox.conf
    config = configparser.ConfigParser()
    config.read(config_filepath)

    # local folder properties
    local_folder = customize_path(config['User']['localPath'], 'mrbox')
    local_file_size_limit_MB = config['User']['localFileSizeMB']
    remote_folder = customize_path(config['User']['hdfsPath'], 'mrbox')
    if not os.path.exists(local_folder):
        os.mkdir(local_folder)
    local_file_size_limit_bytes = bytes_to_mb(int(local_file_size_limit_MB))
    local = MRBoxObject(local_folder, local_file_size_limit_bytes,
                        remote_folder)

    # connect to hdfs and create hadoop interface, todo: check how to create list of multiple hadoops
    hdfs_con = HDFileSystem(host=config['User']['hdfsHost'],
                            port=config['User'].getint('hdfsPort'))
    hadoop_path = config['User']['hadoopPath']
    hdfs_con.mkdir(remote_folder)
    hadoop = HadoopInterface(hdfs_con, hadoop_path)
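
All configuration is read from the [User] section of mrbox.conf; below is a minimal sketch of such a file, written via configparser, where every value is an illustrative placeholder rather than a project default:

import configparser

config = configparser.ConfigParser()
config['User'] = {
    'localPath': '/home/user/data',   # parent dir of the local 'mrbox' folder
    'hdfsPath': '/user/mrbox-demo',   # parent dir of the remote 'mrbox' folder
    'localFileSizeMB': '10',          # local file size limit in MB
    'hdfsHost': 'localhost',
    'hdfsPort': '9000',
    'hadoopPath': '/opt/hadoop',      # local Hadoop installation dir
    'dbFile': 'mrbox.db',             # sqlite db file name (used in Example no. 10)
}
with open('mrbox.conf', 'w') as f:
    config.write(f)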
Example no. 9
    def tail(self, hdfs_path):
        """ Prints the last kilobyte of the given HDFS file via `hdfs dfs -tail` """
        cmd = customize_path(self.hadoopPath,
                             'bin/hdfs') + " dfs -tail " + hdfs_path
        subprocess.run(cmd, shell=True, check=True)
Example no. 10
def main(argv):
    app_name = "mrbox"
    config_file = app_name + ".conf"
    config_folder = os.path.dirname(os.path.realpath(__file__))
    config_filepath = os.path.join(config_folder, config_file)

    # check if a configuration file exists in current path
    if not os.path.exists(config_filepath):
        print("No .conf file is found in %s." % config_folder)
        sys.exit(1)

    # parse arguments
    if len(argv) == 1 and argv[0] == 'help':
        print("mrview.py cmd absolute_file_path")
        print("Supported link commands:", ", ".join(SUPPORTED_LINK_CMDS))
        sys.exit(1)
    elif len(argv) != 2:
        print(
            "Wrong number of operands.\nTry 'mrview.py help' for more information."
        )
        sys.exit(1)

    cmd = argv[0]
    file_path = argv[1]

    # read from mrbox.conf
    config = configparser.ConfigParser()
    config.read(config_filepath)

    local_folder = customize_path(config['User']['localPath'], 'mrbox')
    local_path = customize_path(local_folder, file_path)

    if not os.path.exists(local_path):
        print("File does not exist in ", local_folder)
        sys.exit(1)

    # connect to hdfs and create hadoop interface
    hdfs_con = HDFileSystem(host=config['User']['hdfsHost'],
                            port=config['User'].getint('hdfsPort'))
    hadoop_path = config['User']['hadoopPath']
    hadoop = HadoopInterface(hdfs_con, hadoop_path)

    # create sqlite db instance
    full_db_path = os.path.join(config['User']['localPath'],
                                config['User']['dbFile'])
    lc = LocalCatalog(full_db_path)

    # need to query db to get type and remote path if link
    hdfs_path = lc.get_remote_file_path(local_path)
    loc_type = lc.get_loc_type_by_remote_path(hdfs_path)

    # if link, only the supported cmds can be executed on HDFS copy
    # if dir or file, UNIX cmds to be executed locally
    if loc_type == 'link':
        if cmd not in SUPPORTED_LINK_CMDS:
            print(
                cmd,
                " not supported for links.\nTry 'mrview.py help' for more information."
            )
            sys.exit(1)
        elif cmd == 'head':
            hadoop.head(hdfs_path)
        elif cmd == 'tail':
            hadoop.tail(hdfs_path)
    else:
        os.system(cmd + ' ' + local_path)
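
A hypothetical invocation of main() above, with a made-up path that customize_path() joins onto the local mrbox folder. For a link only the supported commands (e.g. head or tail) run against the HDFS copy; for a regular local file or dir the command is executed locally via os.system():

main(['tail', 'job-output/part-00000'])  # made-up path under the local mrbox folder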