Example #1
def clean_empty_dirs(remote_basedir):
    LOGGER = logging.getLogger(__name__)
    deleted_dirs = []
    ## Directory structure is {remote_basedir}/{year}/{month}
    year_dirs = hdfs.ls(remote_basedir)
    # Do an ls to find all month dirs
    for year_dir in year_dirs:
        month_dirs = hdfs.ls(hdfs.path.join(remote_basedir, year_dir))
        # Check to see if month dirs are empty
        month_dirs_deleted = 0
        for month_dir in month_dirs:
            files = hdfs.ls(hdfs.path.join(remote_basedir, year_dir,
                                           month_dir))
            if not files:
                LOGGER.debug(
                    "Directory {0} is empty, deleting it".format(month_dir))
                hdfs.rmr(month_dir)
                deleted_dirs.append(month_dir)
                month_dirs_deleted += 1

        if month_dirs_deleted == len(month_dirs):
            # Deleted all month sub-directories, so delete year directory too
            LOGGER.debug(
                "Directory {0} is empty, deleting it".format(year_dir))
            hdfs.rmr(year_dir)
            deleted_dirs.append(year_dir)
    return deleted_dirs
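A minimal call sketch for the cleaner above, assuming pydoop is installed and the base directory (the path below is hypothetical) follows the {remote_basedir}/{year}/{month} layout:

import logging
import pydoop.hdfs as hdfs

logging.basicConfig(level=logging.DEBUG)
# hypothetical base directory laid out as {basedir}/{year}/{month}
removed = clean_empty_dirs("/data/archive")
print("removed %d empty directories" % len(removed))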
Example #2
 def move(self):
     for wd in self.local_wd, self.hdfs_wd:
         t1 = self.__make_tree(wd)
         t2 = [_ for _ in t1.children if _.kind == 1][0]
         f2 = t2.children[0]
         hdfs.move(f2.name, t1.name)
         ls = [os.path.basename(_) for _ in hdfs.ls(t1.name)]
         self.assertTrue(os.path.basename(f2.name) in ls)
         self.assertEqual(len(hdfs.ls(t2.name)), 0)
Example #3
 def move(self):
   for wd in self.local_wd, self.hdfs_wd:
     t1 = self.__make_tree(wd)
     t2 = [_ for _ in t1.children if _.kind == 1][0]
     f2 = t2.children[0]
     hdfs.move(f2.name, t1.name)
     ls = [os.path.basename(_) for _ in hdfs.ls(t1.name)]
     self.assertTrue(os.path.basename(f2.name) in ls)
     self.assertEqual(len(hdfs.ls(t2.name)), 0)
Example #4
def get_default_fs():
    root_ls = phdfs.ls('/')
    if root_ls:
        uri = Uri(urlparse.urlparse(root_ls[0]))
        return uri
    else:
        raise RuntimeError("Could not determine URI of default file system.  It's empty.")
Example #5
def server():
    print 'Server...'
    context = zmq.Context()
    socket = context.socket(zmq.REP)
    socket.bind(server_tcp)

    client_list = []
    hdfs_url = '/datasets/corpus/enwiki-11g'
    file_list = hdfs.ls(hdfs_url)
    print len(file_list)

    while True:
        message = socket.recv()
        if message.startswith("connect"):
            client_list.append(message.split(':')[1])
            socket.send("connected")
        elif message.startswith("read"):
            client = message.split(':')[1]
            print client
            if len(file_list) == 0:
                socket.send("done")
                client_list.remove(client)
                if len(client_list) == 0:
                    return
            if client in client_list:
                f = file_list.pop()
                print len(file_list)
                print f
                socket.send_string("file>" + f)
Example #6
def get_all_wb(model, checkpoint_dir):
    """\
    Get all weights and biases from model checkpoints in checkpoint_dir.

    checkpoint_dir:
      part-m-00000.zip
      part-m-00001.zip
      ...

    return:
      {"00000": W0, "00001": W1, ...}, {"00000": b0, "00001": b1, ...}
    """
    paths = []
    tags = {}
    for p in hdfs.ls(checkpoint_dir):
        m = re.match(r"^part-m-(\d+)\.zip$", hdfs.path.basename(p))
        if m:
            paths.append(p)
            tags[p] = m.groups()[0]
    weights, biases = {}, {}
    for p in paths:
        t = tags[p]
        weights[t], biases[t] = get_wb(model, p)
        LOGGER.info("%s: W %r b %r", p, weights[t].shape, biases[t].shape)
    return weights, biases
Example #7
    def _get_hopsfs_dataset_files(training_dataset_location, split,
                                  filter_empty):
        path = training_dataset_location.replace("hopsfs", "hdfs")
        if split is None:
            path = hdfs.path.abspath(path)
        else:
            path = hdfs.path.abspath(path + "/" + str(split))

        input_files = []

        all_list = hdfs.ls(path, recursive=True)

        # Remove directories and spark '_SUCCESS'
        include_file = True
        for file in all_list:
            # remove empty file if any
            if filter_empty:
                _file_size = hdfs.path.getsize(file)
                if _file_size == 0:
                    include_file = False
                else:
                    include_file = True
            if (not hdfs.path.isdir(file) and not file.endswith("_SUCCESS")
                    and include_file):
                input_files.append(file)

        return input_files
Example #8
def server():
    print 'Server...'
    context = zmq.Context()
    socket = context.socket(zmq.REP)
    socket.bind(server_tcp)

    client_list = []
    hdfs_url = '/datasets/corpus/enwiki-11g'
    file_list = hdfs.ls(hdfs_url)
    print len(file_list)

    while True:
        message = socket.recv()
        if message.startswith("connect"):
            client_list.append(message.split(':')[1])
            socket.send("connected")
        elif message.startswith("read"):
            client = message.split(':')[1]
            print client
            if len(file_list) == 0:
                socket.send("done")
                client_list.remove(client)
                if len(client_list) == 0:
                    return
            if client in client_list:
                f = file_list.pop()
                print len(file_list)
                print f
                socket.send_string("file>" + f)
Example #9
def evaluate():
    prefix = 'frozen_graph'
    # freeze_graph("prob")
    time0 = time.time()
    graph = load_frozen_graph(prefix=prefix)
    with tf.Session(graph=graph) as sess:
        sess.run(graph.get_operation_by_name(prefix + '/init_all_tables'))

        empty_X = {'feat_ids': [], 'feat_vals': []}
        label_list, hdfs_files = [], []

        for xfile in hdfs.ls(hdfs_dir):
            if hdfs.path.isdir(xfile):
                continue
            hdfs_files.append(xfile)

        pred_fp = open('pred', 'w')
        label_fp = open('label', 'w')
        print("Begin inference")
        for i in range(190, len(hdfs_files)):
            train_fp = hdfs.open(hdfs_files[i], 'rt')
            end_of_file = False
            while True:
                X_validate = copy.deepcopy(empty_X)
                read_line_num = 0

                while True:
                    line = train_fp.readline().strip().split(' ')
                    if len(line) != len(_COLUMNS):
                        end_of_file = True
                        break

                    X_validate['feat_ids'].append(list(map(lambda x: [int(x.split(':')[0])], line[1:])))
                    X_validate['feat_vals'].append(list(map(lambda x: [float(x.split(':')[1])], line[1:])))

                    label_list.append(line[0])

                    read_line_num += 1
                    if read_line_num == FLAGS.batch_size:
                        break

                input_feed = dict()
                input_feed[sess.graph.get_tensor_by_name(prefix + "/" + 'IteratorGetNext:0')] = X_validate['feat_ids']
                input_feed[sess.graph.get_tensor_by_name(prefix + "/" + 'IteratorGetNext:1')] = X_validate['feat_vals']

                prob = graph.get_operation_by_name(prefix + "/prob").outputs[-1]
                pred = sess.run(prob, feed_dict=input_feed)
                np.savetxt(pred_fp, pred, delimiter='\n', fmt='%s')

                if end_of_file:
                    break

        label_fp.writelines('\n'.join([str(x) for x in label_list]) + '\n')
        train_fp.close()
        pred_fp.close()
        label_fp.close()
        os.system("paste -d '\t' pred label  > prob_label")
        os.system("python evaluate.py prob_label")
        time1 = time.time()
        print("evaluate cost: ", time1 - time0)
Example #10
 def mkdir(self):
     for wd in self.local_wd, self.hdfs_wd:
         d1 = "%s/d1" % wd
         d2 = "%s/d2" % d1
         hdfs.mkdir(d2)
         dir_list = hdfs.ls(d1)
         self.assertEqual(len(dir_list), 1)
         self.assertTrue(dir_list[0].endswith(d2))
Example #11
def list_images(input_dir):
    ret = []
    p = re.compile(r".*\.jpe?g$", re.IGNORECASE)
    ls = [_['name'] for _ in hdfs.lsl(input_dir) if _['kind'] == 'directory']
    for d in ls:
        ret.extend([_ for _ in hdfs.ls(d) if p.match(_)])
    LOGGER.info("%d classes, %d total images", len(ls), len(ret))
    return ret
Example #12
 def mkdir(self):
   for wd in self.local_wd, self.hdfs_wd:
     d1 = "%s/d1" % wd
     d2 = "%s/d2" % d1
     hdfs.mkdir(d2)
     dir_list = hdfs.ls(d1)
     self.assertEqual(len(dir_list), 1)
     self.assertTrue(dir_list[0].endswith(d2))
Example #13
def read_csv_from_hdfs(path, cols, col_types=None):
  files = hdfs.ls(path)
  pieces = []
  for f in files:
    fhandle = hdfs.open(f)
    pieces.append(pd.read_csv(fhandle, names=cols, dtype=col_types))
    fhandle.close()
  return pd.concat(pieces, ignore_index=True)
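A usage sketch for the reader above, assuming the directory (hypothetical path) holds headerless CSV part files with the columns named below:

import pandas as pd
import pydoop.hdfs as hdfs

# hypothetical column names/types for the part files in the directory
df = read_csv_from_hdfs("/data/events/2020",
                        cols=["ts", "user_id", "value"],
                        col_types={"user_id": str})
print(df.shape)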
Example #14
def json_from_hdfs(url):
    assert hdfs.path.isdir(url)
    file_lists = hdfs.ls(url)
    for fi in file_lists:
        with hdfs.open(fi, "r") as f:
            items = f.read().strip().split('\n')
            for it in items:
                it = loads(it)
                it['md5'] = hashlib.md5(str(it)).hexdigest()
                yield it
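Because json_from_hdfs is a generator, records are produced lazily; a consumption sketch with a hypothetical directory of newline-delimited JSON:

for record in json_from_hdfs("/data/json_dump"):
    print(record["md5"])  # md5 digest attached by the generator above
    break  # peek at the first record only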
Example #15
 def check(self, args):
     self.root = args["root"]
     self.hdfs_root = args["hdfs_root"]
     print("checking: %s" % self.root)
     print("checking hdfs: %s" % self.hdfs_root)
     if path.isdir(self.hdfs_root) == False:
         return False
     if os.path.isdir(self.root) == False:
         return False
     return self.walk(self.root, os.listdir(self.root), self.hdfs_root,
                      hdfs.ls(self.hdfs_root))
Example #16
 def __init__(self, uri, compress=True):
     "ctor with hdfs uri: hdfsio:/path/schema.avsc"
     Storage.__init__(self, uri)
     schema = self.uri
     if  not hdfs.ls(schema):
         raise Exception("No avro schema file found in provided uri: %s" % uri)
     self.hdir = self.uri.rsplit('/', 1)[0]
     if  not hdfs.path.isdir(self.hdir):
         raise Exception('HDFS path %s does not exists' % self.hdir)
     schema_doc = hdfs.load(schema)
     self.schema = avro.schema.parse(schema_doc)
     self.compress = compress
Example #17
def ls(hdfs_path, recursive=False):
    """
    lists a directory in HDFS

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative one (relative to your Project's path in HDFS).

    Returns:
        returns a list of hdfs paths
    """
    hdfs_path = _expand_path(hdfs_path)
    return hdfs.ls(hdfs_path, recursive=recursive)
Example #18
def read_all_data(file_path='/home/ad/wujindou/text_0908'):
	valid_path = ['part-'+filename.split('part-')[1] for filename in hdfs.ls(file_path) if 'part' in filename ]
	data_all = []
	uniq = set()
	for filename in valid_path:
		with hdfs.open(file_path+'/'+filename) as f:
			for line in f:
				if line.decode() in uniq:continue
				uniq.add(line.decode())
				data_all.append(line.decode())
	import random
	random.shuffle(data_all)
	return data_all
Example #19
    def read(self, storage_connector, data_format, read_options, location,
             split):
        df_list = []
        if storage_connector.connector_type == storage_connector.HOPSFS:
            # providing more informative error
            try:
                from pydoop import hdfs
            except ImportError as err:
                raise ModuleNotFoundError(
                    "Reading training dataset from HopsFS requires `pydoop`"
                ) from err

            util.setup_pydoop()

            if split is None:
                path_list = hdfs.ls(location, recursive=True)
            else:
                path_list = hdfs.ls(location + "/" + str(split),
                                    recursive=True)

            for path in path_list:
                if (hdfs.path.isfile(path) and not path.endswith("_SUCCESS")
                        and hdfs.path.getsize(path) > 0):
                    if data_format.lower() == "csv":
                        df_tmp = pd.read_csv(path)
                    elif data_format.lower() == "tsv":
                        df_tmp = pd.read_csv(path, sep="\t")
                    elif data_format.lower() == "parquet":
                        df_tmp = pd.read_parquet(path)
                    else:
                        raise TypeError(
                            "{} training dataset format is not supported to read as pandas dataframe. If you are using `tfrecord` use the `.tf_data` helper functions."
                            .format(data_format))
                    df_list.append(df_tmp)
        else:
            raise NotImplementedError(
                "{} Storage Connectors for training datasets are not supported yet for external environments."
                .format(storage_connector.connector_type))
        return pd.concat(df_list, ignore_index=True)
Example #20
 def __init__(self, uri, compress=True):
     "ctor with hdfs uri: hdfsio:/path/schema.avsc"
     Storage.__init__(self, uri)
     schema = self.uri
     if not hdfs.ls(schema):
         raise Exception("No avro schema file found in provided uri: %s" %
                         uri)
     self.hdir = self.uri.rsplit('/', 1)[0]
     if not hdfs.path.isdir(self.hdir):
         raise Exception('HDFS path %s does not exists' % self.hdir)
     schema_doc = hdfs.load(schema)
     self.schema = avro.schema.parse(schema_doc)
     self.compress = compress
Example #21
def ls(hdfs_path, recursive=False, exclude_nn_addr=False):
    """
    lists a directory in HDFS

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative one (relative to your Project's path in HDFS).

    Returns:
        returns a list of hdfs paths
    """
    if exclude_nn_addr:
        hdfs_path = re.sub(r"\d+.\d+.\d+.\d+:\d+", "", hdfs_path)
    hdfs_path = _expand_path(hdfs_path)
    return hdfs.ls(hdfs_path, recursive=recursive)
Example #22
def clean_empty_dirs(remote_basedir):
    LOGGER = logging.getLogger(__name__)
    deleted_dirs = []
    ## Directory structure is {remote_basedir}/{year}/{month}
    year_dirs = hdfs.ls(remote_basedir)
    # Do an ls to find all month dirs
    for year_dir in year_dirs:
        month_dirs = hdfs.ls(hdfs.path.join(remote_basedir, year_dir))
        # Check to see if month dirs are empty
        month_dirs_deleted = 0
        for month_dir in month_dirs:
            files = hdfs.ls(hdfs.path.join(remote_basedir, year_dir, month_dir))
            if not files:
                LOGGER.debug("Directory {0} is empty, deleting it".format(month_dir))
                hdfs.rmr(month_dir)
                deleted_dirs.append(month_dir)
                month_dirs_deleted += 1

        if month_dirs_deleted == len(month_dirs):
            # Deleted all month sub-directories, so delete year directory too
            LOGGER.debug("Directory {0} is empty, deleting it".format(year_dir))
            hdfs.rmr(year_dir)
            deleted_dirs.append(year_dir)
    return deleted_dirs
Example #23
 def __init__(self, uri, wmauri, yarn=''):
     "ctor with LTS uri (hdfs:///path/schema.avsc) and WMArchive uri"
     self.uri = uri
     if  not hdfs.ls(self.uri):
         raise Exception("No avro schema file found in provided uri: %s" % uri)
     self.hdir = self.uri.rsplit('/', 1)[0]
     if  not hdfs.path.isdir(self.hdir):
         raise Exception('HDFS path %s does not exists' % self.hdir)
     schema_doc = hdfs.load(self.uri)
     self.schema = avro.schema.parse(schema_doc)
     self.taskmgr = TaskManager()
     self.wmauri = wmauri # WMArchive URL which will be used by submit
     if  not self.wmauri.endswith('/wmarchive/data'):
         self.wmauri = '%s/wmarchive/data' % self.wmauri
     self.yarn = yarn
Example #24
def ls(hdfs_path, recursive=False, project=None):
    """
    Returns all the pathnames in the supplied directory.

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative one (relative to project_name in HDFS).
        :recursive: if it is a directory and recursive is True, the list contains one item for every file or directory in the tree rooted at hdfs_path.
        :project: If the supplied hdfs_path is a relative path, it will look for that file in this project's subdir in HDFS.

    Returns:
      A possibly-empty list of path names stored in the supplied path.
    """
    if project == None:
        project = project_name()
    hdfs_path = _expand_path(hdfs_path, project)
    return hdfs.ls(hdfs_path, recursive=recursive)
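A call sketch for the wrapper above; the project-relative path is hypothetical:

# resolved against the project's HDFS root by _expand_path
for p in ls("Resources/models", recursive=True):
    print(p)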
Example #25
 def __init__(self, uri, wmauri, yarn=''):
     "ctor with LTS uri (hdfs:///path/schema.avsc) and WMArchive uri"
     self.uri = uri
     if not hdfs.ls(self.uri):
         raise Exception("No avro schema file found in provided uri: %s" %
                         uri)
     self.hdir = self.uri.rsplit('/', 1)[0]
     if not hdfs.path.isdir(self.hdir):
         raise Exception('HDFS path %s does not exists' % self.hdir)
     schema_doc = hdfs.load(self.uri)
     self.schema = avro.schema.parse(schema_doc)
     self.taskmgr = TaskManager()
     self.wmauri = wmauri  # WMArchive URL which will be used by submit
     if not self.wmauri.endswith('/wmarchive/data'):
         self.wmauri = '%s/wmarchive/data' % self.wmauri
     self.yarn = yarn
Example #26
def run_mr_app(args, logger):
    logger.debug("local LIBHDFS_OPTS: %r" % (os.getenv("LIBHDFS_OPTS"), ))
    logger.info("running MapReduce application")
    mr_out_dir = run_phase_one(args, logger)
    for nm in args.mappers[1:]:
        input_ = random_str(args)
        logger.info("running consolidation step, input=%r" % (input_, ))
        with hdfs.open(input_, "w", user=args.hdfs_user) as fo:
            ls = [
                _ for _ in hdfs.ls(mr_out_dir, user=args.hdfs_user)
                if hdfs.path.basename(_).startswith("part")
            ]
            logger.debug("found %d data files in %r" % (len(ls), mr_out_dir))
            for fn in ls:
                fo.write("%s\n" % hdfs.path.abspath(fn, user=args.hdfs_user))
        mr_out_dir = run_phase_two(args, nm, input_, logger)
    return mr_out_dir
Example #27
def _get_hopsfs_dataset_files(training_dataset_location, split):
    path = training_dataset_location.replace("hopsfs", "hdfs")
    if split is None:
        path = hdfs.path.abspath(path)
    else:
        path = hdfs.path.abspath(path + "/" + str(split))

    input_files = []

    all_list = hdfs.ls(path, recursive=True)

    # Remove directories and spark '_SUCCESS' file if any
    for file in all_list:
        if not hdfs.path.isdir(file) and not file.endswith("_SUCCESS"):
            input_files.append(file)

    return input_files
Example #28
def main(argv=sys.argv):
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    parser = make_parser()
    args = parser.parse_args(argv[1:])
    logging.basicConfig()
    LOGGER.setLevel(args.log_level)
    paths = None
    if hdfs.path.isfile(args.stats_path):
        paths = [args.stats_path]
    else:
        try:
            ls = hdfs.ls(args.stats_path)
        except IOError as e:
            return "ERROR: %s: %s" % (args.stats_path, e)
        paths = [
            _ for _ in ls if re.match(r"^part-m-\d+$", hdfs.path.basename(_))
        ]
    gen_plots(paths, args.out_dir)
Example #29
def collect_output(mr_out_dir, logger):
    builder = None
    for fn in hdfs.ls(mr_out_dir):
        if not hdfs.path.basename(fn).startswith("part"):
            continue
        logger.info("processing %r" % (fn, ))
        with hdfs.open(fn) as f:
            s = zlib.decompress(f.read())
        if s == "":
            continue
        vectors = KinshipVectors.deserialize(
            s)  # ignores trailing newline char
        if builder is None:
            builder = KinshipBuilder(vectors)
        else:
            builder.vectors += vectors
    logger.info("building kinship matrix")
    return builder.build()
Example #30
def collate_mapred_output(output_dir):
    data = {"weights": {}, "biases": {}}
    pattern = re.compile(r"part-m-\d+-(\d+)-(weights|biases).npz")
    for path in hdfs.ls(output_dir):
        LOGGER.debug("processing: %s", path)
        m = pattern.match(hdfs.path.basename(path))
        if not m:
            continue
        seed, what = m.groups()
        with hdfs.open(path, "rb") as f:
            npzf = np.load(f)
            data[what].update(
                {"%s_%s" % (seed, t): w
                 for (t, w) in npzf.iteritems()})
    for k, v in data.items():
        out_path = hdfs.path.join(output_dir, "%s.npz" % k)
        LOGGER.info("saving collated %s to %s", k, out_path)
        with hdfs.open(out_path, "wb") as f:
            np.savez(f, **v)
Example #31
    def _read_hopsfs(self, location, data_format):
        # providing more informative error
        try:
            from pydoop import hdfs
        except ModuleNotFoundError:
            return self._read_hopsfs_rest(location, data_format)

        util.setup_pydoop()
        path_list = hdfs.ls(location, recursive=True)

        df_list = []
        for path in path_list:
            if (
                hdfs.path.isfile(path)
                and not path.endswith("_SUCCESS")
                and hdfs.path.getsize(path) > 0
            ):
                df_list.append(self._read_pandas(data_format, path))
        return df_list
Example #32
    def walk(self, parent_path, file_list, hdfs_parent_path, hdfs_file_list):
        print("%s %s" % (parent_path, hdfs_parent_path))
        if len(file_list) == 0 and len(hdfs_file_list) == 0:
            if os.path.basename(parent_path) == path.basename(
                    hdfs_parent_path):
                return True
            return False
        elif len(file_list) != len(hdfs_file_list):
            print("No match: number of files in dirs")
            return False
        else:
            file_list.sort(
                key=lambda f: os.path.isfile(os.path.join(parent_path, f)))
            hdfs_file_list.sort(
                key=lambda f: path.isfile(path.join(hdfs_parent_path, f)))
            hIdx = 0
            for idx, sub_path in enumerate(file_list):
                full_path = os.path.join(parent_path, sub_path)
                hdfs_sub_path = hdfs_file_list[idx]
                hdfs_full_path = path.join(hdfs_parent_path, hdfs_sub_path)

                if (os.path.basename(sub_path) !=
                        path.basename(hdfs_sub_path)):
                    print("No match: %s and %s" % (sub_path, hdfs_sub_path))
                    return False

                if os.path.isdir(full_path):
                    if path.isdir(hdfs_full_path) == False:
                        print("No match on directory: %s and %s" %
                              (full_path, hdfs_full_path))
                        return False
                    return self.walk(full_path, os.listdir(full_path),
                                     hdfs_full_path, hdfs.ls(hdfs_full_path))
                elif os.path.isfile(full_path):
                    sz = os.path.getsize(full_path)
                    hdfs_size = path.getsize(hdfs_full_path)
                    if (hdfs_size != sz):
                        return False

        return True
Example #33
def write_output(names, fin, fout):

    # 		# in similarity search when no results are found, results file is not present
    # 		# so create an empty local file to indicate no results found
    # 		if os.path.isdir(fin) == False:
    # 			fd = open(fout, "w")
    # 			fd.close()
    # 			return

    files = hdfs.ls(fin)
    # find file that has "part-" in the filename; it is the result
    for f in files:
        if "part-" in f:
            break

    with hdfs.open(f) as fd:

        result = pd.read_csv(fd,
                             sep='\t',
                             header=None,
                             names=["id", "id 2", "Euclidean Distance"])
        result = result.merge(names, on="id", how='inner')

        result.rename(columns={
            'name': 'Entity 1',
            'id': 'id 1',
            'id 2': 'id'
        },
                      inplace=True)

        result = result.merge(names, on="id", how='inner')

        result.rename(columns={'name': 'Entity 2', 'id': 'id 2'}, inplace=True)
        del result['id 1']
        del result['id 2']

        result = result.sort_values(by=["Euclidean Distance"])
        result[['Entity 1', 'Entity 2',
                'Euclidean Distance']].to_csv(fout, index=False, sep='\t')
Example #34
    def rename_compressed_files(self, file_table):
        # find the extension
        output_files = hdfs.ls(self.output_path)
        if len(output_files) == 0:
            return

        compressor_extension = self.get_compressor_extension(output_files)
        self.log.debug("compressor extension is %s", compressor_extension)

        hdfs_host, hdfs_port, _ = hdfs.path.split(output_files[0])
        if hdfs_host == '':
            is_local_fs = True
        else:
            is_local_fs = False
            output_hdfs = hdfs.hdfs(hdfs_host, hdfs_port)

        file_table.seek(0)
        for mapid, line in enumerate(file_table.xreadlines()):
            _, _, relative_output_name = line.rstrip('\n').split('\t')
            # we expect the map task ids to be assigned in the same order as the input
            # file list, so we can match the input file to an output file by its position
            # in the input file list.
            hadoop_output = os.path.join(self.output_path, "part-%05d" % mapid) + compressor_extension
            desired_file_name = os.path.join(self.output_path, relative_output_name) + compressor_extension
            if hadoop_output != desired_file_name:
                self.log.debug("renaming %s to %s", hadoop_output, desired_file_name)
                if is_local_fs:
                    # Though we could transparently use hdfs.move for both local fs and hdfs,
                    # using native methods for the local fs should be faster.
                    # os.renames automatically creates necessary parent directories for destination.
                    os.renames(urlparse(hadoop_output).path, urlparse(desired_file_name).path)
                else:
                    # create the output subdirectory, if necessary
                    dirname = os.path.dirname(relative_output_name)
                    if dirname:
                        output_hdfs.create_directory( os.path.join(self.output_path, dirname) )
                    if output_hdfs.exists(desired_file_name):
                        raise RuntimeError("Can't overwrite file in output directory: %s" % desired_file_name)
                    output_hdfs.move(hadoop_output, output_hdfs, desired_file_name)
Example #35
def readText(filePath = ""):
  import pydoop.hdfs as hdfs
  import os
  file_is_file = hdfs.path.isfile(filePath)
  file_is_dir = hdfs.path.isdir(filePath)
  file_exist = hdfs.path.exists(filePath)
  try:
    if(file_is_file):
      files = hdfs.open(filePath)
    elif(file_is_dir):
      files = []
      for pieceFile in hdfs.ls(filePath):
        files += hdfs.open(pieceFile)
    elif(not file_exist):
      if(os.path.exists(filePath)):
        print "[WARN] file not found on hdfs read local file."
        files = open(filePath)
  except Exception as e:
    raise e
  finally:
    print type(files)
  return files
Example #36
def main():
    # this is hdfs directory
    src_dir = str(sys.argv[1])
    dst_dir = str(sys.argv[2])

    # create dst_dir if not exist
    if not pyhdfs.path.exists(dst_dir):
        pyhdfs.mkdir(dst_dir)

    # create sparkcontext
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext

    # create children path rdd
    children_paths = pyhdfs.ls(src_dir)
    children_paths_rdd = sc.parallelize(children_paths, len(children_paths))

    # each executor task is to copy one children path
    children_paths_rdd.foreach(lambda file_path: copy_file(
        file_path, os.path.join(dst_dir, os.path.basename(file_path))))

    # stop sparkcontext
    sc.stop()
Example #37
def glob(hdfs_path, recursive=False, project=None):
    """ 
    Finds all the pathnames matching a specified pattern according to the rules used by the Unix shell, although results are returned in arbitrary order.

    Globbing gives you the list of files in a dir that matches a supplied pattern

    >>> import glob
    >>> glob.glob('./[0-9].*')
    >>> ['./1.gif', './2.txt']

    glob is implemented as  os.listdir() and fnmatch.fnmatch()
    We implement glob as hdfs.ls() and fnmatch.filter()

    Args:
     :hdfs_path: You can specify either a full hdfs pathname or a relative one (relative to project_name in HDFS).
     :project: If the supplied hdfs_path is a relative path, it will look for that file in this project's subdir in HDFS.

    Raises:
        IOError if the supplied hdfs path does not exist

    Returns:
      A possibly-empty list of path names that match pathname, which must be a string containing a path specification. The pathname can be either absolute or relative.
    """

    # Get the full path to the dir for the input glob pattern
    # "hdfs://Projects/jim/blah/*.jpg" => "hdfs://Projects/jim/blah"
    # Then, ls on 'hdfs://Projects/jim/blah', then filter out results
    if project == None:
        project = project_name()
    lastSep = hdfs_path.rfind("/")
    inputDir = hdfs_path[:lastSep]
    inputDir = _expand_path(inputDir, project)
    pattern = hdfs_path[lastSep + 1:]
    if not hdfs.path.exists(inputDir):
        raise IOError("Glob path %s not found" % inputDir)
    dirContents = hdfs.ls(inputDir, recursive=recursive)
    return fnmatch.filter(dirContents, pattern)
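A usage sketch for the HDFS-backed glob above, with a hypothetical project-relative pattern:

# all JPEG files directly under a hypothetical dataset folder
images = glob("Resources/images/*.jpg")
print("%d matches" % len(images))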
Example #38
def xml_from_hdfs(url):
    assert hdfs.path.isdir(url)
    file_lists = hdfs.ls(url)
    #for fi in file_lists:
    for i in xrange(0, 1):
        fi = '/datasets/corpus/enwiki-11g/wiki_912'
        with hdfs.open(fi, "r") as f:
            lines = f.read().strip().split('\n')
            docs, doc = [], None
            for line in lines:
                if line.startswith('<doc'):
                    doc = line
                elif line.startswith('</doc>'):
                    docs.append(doc + line)
                else:
                    #line = line.replace('&', '').replace('"', "'")
                    doc += line.replace('"', "'")

            for doc in docs:
                dom = bs(doc).find('doc')
                doc = dom.attrs
                doc['content'] = dom.text
                doc['md5'] = hashlib.md5(str(doc)).hexdigest()
                yield doc
Example #39
def _poly_ls(path, *args, **kwargs):
    if path.startswith('hdfs:'):
        return hdfs.ls(path, *args, **kwargs)
    else:
        return os.listdir(path)
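The helper above dispatches on the path scheme; a sketch with hypothetical paths:

print(_poly_ls("hdfs://namenode:8020/user/data"))  # routed to hdfs.ls
print(_poly_ls("/tmp"))  # routed to os.listdir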
Example #40
def iter_mr_out_files(mr_out_dir):
    for fn in hdfs.ls(mr_out_dir):
        if hdfs.path.basename(fn).startswith("part"):
            yield fn
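A consumption sketch for the generator above, with a hypothetical MapReduce output directory:

for part in iter_mr_out_files("/user/me/mr_out"):
    print(part)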
Example #41
 def rmr(self):
   for wd in self.local_wd, self.hdfs_wd:
     t1 = self.__make_tree(wd)
     hdfs.rmr(t1.name)
     self.assertEqual(len(hdfs.ls(wd)), 0)
Example #42
import glob
from elasticsearch import Elasticsearch
import urllib2
import re
import pydoop.hdfs as hdfs


EsHost = {
    "host" : "localhost", 
    "port" : 9200
}

HDFSfiles=[]
for hdFiles in hdfs.ls("/gaana/gaanaLyrics"): #"gaanaLyrics/gaanaLyrics"):
	HDFSfiles.append(hdFiles[41:])


fileNames = []

indexName = 'music'
typeName = 'songs'
#IdField = 'songID'


bulkData = [] 

i = 1
for name in HDFSfiles:	
	dataDict = {}
	fopen = hdfs.open("/gaana/gaanaLyrics/" + name)
	header = fopen.read()
Example #43
def mrjob(options):
    "Generates and executes MR job script"

    user = os.getenv('USER')
    tstamp = int(time.time())
    hdir = hdfs_dir(options.hdir, options.hdfs_prefix)

    if  PYDOOP:
        odir = hdfs.path.join(hdir, options.odir)
        idir = hdfs.path.join(hdir, options.idir)
        schema = hdfs.path.join(hdir, options.schema)
        for name in [hdir, odir, idir,]:
            if  options.verbose:
                print("Checking %s" % name)
            if  not hdfs.path.isdir(name):
                if name in [hdir, idir]:
                    print("ERROR: %s does not exist" % name)
                    sys.exit(1)
                # else:
                #     print(" Creating output directory: %s" % name)
                #     hdfs.mkdir(name)
            elif name == odir:
                # in case odir exists and is not empty, move it somewhere and re-create
                if hdfs.ls(odir):
                    ocache = hdfs.path.normpath(odir)+'_%d'%tstamp
                    if options.verbose:
                        print(" Non-empty output directory exists, saving it in %s"%ocache)
                    hdfs.move(odir, ocache)
                    # hdfs.mkdir(odir)
                # if it's empty, remove it
                else:
                    hdfs.rmr(odir)

        if  options.verbose:
            print("Checking %s" % schema)
        if  not hdfs.path.isfile(schema):
            print("ERROR: %s does not exist" % schema)
            sys.exit(1)
    else:
        idir = '%s%s' % (hdir, 'data')
        odir = '%s%s' % (hdir, 'mrout')
        schema = '%s%s' % (hdir, options.schema)
        if  options.verbose:
            msg = 'pydoop module is not present on this system'
            msg += ', will use input as is without checking'
            print('WARNING:', msg)
    for name in [options.mrpy, options.pydoop, options.avro]:
        if  options.verbose:
            print("Checking %s" % name)
        if  not os.path.isfile(name):
            print("ERROR: %s does not exist" % name)
            sys.exit(1)

#     module = os.path.basename(os.path.splitext(options.mrpy)[0])
    code = create_mrpy(options.mrpy, options.verbose)

    cmd = """#!/bin/bash
input={input}
output={output}
schema={schema}
ifile=/tmp/mr_{user}_{tstamp}.py
cat << EOF > $ifile
{code}
EOF

module=mr_{user}_{tstamp}
arch_pydoop={pydoop}
arch_avro={avro}
echo "Input URI : $input"
echo "Output URI: $output"
echo "Schema: $schema"
echo "MR script : $ifile"
echo "Module name : $module"
echo "Pydoop archive: $arch_pydoop"
echo "Avro archive  : $arch_avro"
echo "-----------------"
echo "Submitting MR job"
pydoop submit \
    --upload-archive-to-cache $arch_pydoop \
    --upload-archive-to-cache $arch_avro \
    -D avro.schema=$schema \
    --do-not-use-java-record-reader \
    --log-level {loglevel} \
    --job-name WMArchive \
    --num-reducers 1 \
    --upload-file-to-cache $ifile \
    --mrv2 $module $input $output
    """.format(input=idir, output=odir, user=user, tstamp=tstamp,
               code=code, schema=schema, loglevel=options.loglevel,
               pydoop=os.path.abspath(options.pydoop),
               avro=os.path.abspath(options.avro))

    fobj = NamedTemporaryFile(delete=False)
    fobj.write(cmd)
    fobj.close()

    fstat = os.stat(fobj.name)
    os.chmod(fobj.name, fstat.st_mode | stat.S_IEXEC)

    if  options.execute:
        run(fobj.name, options.verbose)
    else:
        if  options.verbose:
            print("------- Generated script --------")
        print(open(fobj.name, 'r').read())
        if  options.verbose:
            print("---------------------------------")

    # clean up temporary file
    os.unlink(fobj.name)