Code Example #1
File: test_hdfs.py Project: ZEMUSHKA/pydoop
 def tearDown(self):
   fs = hdfs.hdfs("", 0)
   fs.delete(self.local_wd)
   fs.close()
   fs = hdfs.hdfs("default", 0)
   fs.delete(self.hdfs_wd)
   fs.close()
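Note: in pydoop, an empty host string ("", 0) connects to the local file system, while ("default", 0) connects to the namenode named in the Hadoop configuration files. A minimal sketch of that convention (illustrative, not taken from the project above):

    import pydoop.hdfs as hdfs

    local_fs = hdfs.hdfs("", 0)        # local (POSIX) file system
    dist_fs = hdfs.hdfs("default", 0)  # HDFS namenode from the Hadoop config
    try:
        print(local_fs.working_directory())
        print(dist_fs.working_directory())
    finally:
        local_fs.close()
        dist_fs.close()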
Code Example #2
def write(writeFlag):
    if writeFlag:
        # instantiate hadoop
        hdfs.hdfs()

        targetPath = config.targetPath
        targetDirectory = config.targetDirectory
        sourceFile = config.sourceFile

        print("Target Path: " + targetPath)
        print("Target Directory: " + targetDirectory)
        print("Source Path: " + sourceFile)

        with open(sourceFile, "r") as dumpFile:
            fullText = dumpFile.read()

        # write to hadoop
        #hdfs.mkdir(targetDirectory)
        hdfs.dump(fullText, targetPath)
#hdfs.cp(sourceFile, targetPath)

#print (hdfs.ls("test4"))
#files = hdfs.ls("test4")

# read from hadoop
#hdfs.get("test4/hello.txt", "/tmp/hello.txt")
#with open("/tmp/hello.txt") as f:
#	print f.read()

#print(hdfs.ls("test", "hduser1"))
#text = hdfs.load("test/hello.txt")
#print text
Code Example #3
 def tearDown(self):
     fs = hdfs.hdfs("", 0)
     fs.delete(self.local_wd)
     fs.close()
     fs = hdfs.hdfs("default", 0)
     fs.delete(self.hdfs_wd)
     fs.close()
Code Example #4
 def capacity(self):
     fs = hdfs.hdfs("", 0)
     self.assertRaises(RuntimeError, fs.capacity)
     fs.close()
     if not hdfs.default_is_local():
         fs = hdfs.hdfs("default", 0)
         cap = fs.capacity()
         self.assertGreaterEqual(cap, 0)
Code Example #5
File: test_hdfs.py Project: kikkomep/pydoop
 def capacity(self):
     fs = hdfs.hdfs("", 0)
     self.assertRaises(RuntimeError, fs.capacity)
     fs.close()
     if not hdfs.default_is_local():
         fs = hdfs.hdfs("default", 0)
         cap = fs.capacity()
         self.assertGreaterEqual(cap, 0)
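Note: hdfs.default_is_local() reports whether fs.defaultFS points at the local file system; the tests above use it to skip HDFS-only assertions. A minimal sketch of the same check (illustrative, not taken from these projects):

    import pydoop.hdfs as hdfs

    if hdfs.default_is_local():
        print("default file system is local; skipping HDFS-only checks")
    else:
        with hdfs.hdfs("default", 0) as fs:
            print("cluster capacity (bytes):", fs.capacity())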
Code Example #6
File: test_hdfs_fs.py Project: jkahn/pydoop-code
 def cache(self):
     orig_fs = hdfs.hdfs(*self.hp_cases[0])
     for host, port in self.hp_cases[1:]:
         fs = hdfs.hdfs(host, port)
         self.assertTrue(fs.fs is orig_fs.fs)
         fs.close()
         self.assertFalse(orig_fs.closed)
     orig_fs.close()
     self.assertTrue(orig_fs.closed)
Code Example #7
File: test_hdfs_fs.py Project: onlynone/pydoop
 def cache(self):
   hdfs.hdfs._CACHE.clear()
   orig_fs = hdfs.hdfs(*self.hp_cases[0])
   for host, port in self.hp_cases[1:]:
     fs = hdfs.hdfs(host, port)
     self.assertTrue(fs.fs is orig_fs.fs)
     fs.close()
     self.assertFalse(orig_fs.closed)
   orig_fs.close()
   self.assertTrue(orig_fs.closed)
Code Example #8
File: hadoop_run.py Project: CosteaPaul/bcbb
def _hdfs_filesystem():
    """Retrieve references to the local and HDFS file system.

    Need to be able to specify host/port. For now, works off defaults.
    """
    fs = hdfs("default", 0)
    lfs = hdfs("", 0)
    try:
        yield fs, lfs
    finally:
        fs.close()
        lfs.close()
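Note: this helper yields both handles and closes them afterwards, so in the original module it is presumably wrapped with contextlib.contextmanager. A hedged sketch of that usage (the decorator and the calling code below are assumptions, not part of the project above):

    import contextlib
    import pydoop.hdfs as hdfs

    @contextlib.contextmanager
    def hdfs_filesystems():
        # distributed and local file system handles, closed on exit
        fs = hdfs.hdfs("default", 0)
        lfs = hdfs.hdfs("", 0)
        try:
            yield fs, lfs
        finally:
            fs.close()
            lfs.close()

    with hdfs_filesystems() as (fs, lfs):
        print(fs.working_directory(), lfs.working_directory())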
Code Example #9
 def cache(self):
     for (h1, p1), (h2, p2) in product(self.hp_cases, repeat=2):
         hdfs.hdfs._CACHE.clear()
         hdfs.hdfs._ALIASES = {"host": {}, "port": {}, "user": {}}  # FIXME
         with hdfs.hdfs(h1, p1) as fs1:
             with hdfs.hdfs(h2, p2) as fs2:
                 print ' * %r vs %r' % ((h1, p1), (h2, p2))
                 self.assertTrue(fs2.fs is fs1.fs)
             for fs in fs1, fs2:
                 self.assertFalse(fs.closed)
         for fs in fs1, fs2:
             self.assertTrue(fs.closed)
Code Example #10
 def cache(self):
     for (h1, p1), (h2, p2) in product(self.hp_cases, repeat=2):
         hdfs.hdfs._CACHE.clear()
         hdfs.hdfs._ALIASES = {"host": {}, "port": {}, "user": {}}  # FIXME
         with hdfs.hdfs(h1, p1) as fs1:
             with hdfs.hdfs(h2, p2) as fs2:
                 print ' * %r vs %r' % ((h1, p1), (h2, p2))
                 self.assertTrue(fs2.fs is fs1.fs)
             for fs in fs1, fs2:
                 self.assertFalse(fs.closed)
         for fs in fs1, fs2:
             self.assertTrue(fs.closed)
Code Example #11
def copyFileToHDFSFolder(localpath, hdfspath):
    """
    Copies a file from a local or HDFS to an HDFS location
    :param localpath: path to local file
    :param hdfspath: path to target file on HDFS
    :return: None
    """
    if localpath.startswith('file:/'):
        lf = H.hdfs("", 0)
    else:
        lf = H.hdfs()
    h = H.hdfs()
    lf.copy(localpath, h, hdfspath)
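Note: H is presumably pydoop.hdfs imported under an alias, and fs.copy(src_path, dest_fs, dest_path) copies between two file system instances. A minimal sketch of the same pattern without the wrapper (paths are illustrative):

    import pydoop.hdfs as H

    local_fs = H.hdfs("", 0)   # source: local file system
    dist_fs = H.hdfs()         # destination: default HDFS
    try:
        local_fs.copy("/tmp/example.txt", dist_fs, "data/example.txt")  # illustrative paths
    finally:
        local_fs.close()
        dist_fs.close()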
Code Example #12
def read(readFlag):
    print(readFlag)
    if readFlag:
        targetFile = config.targetFile.strip()
        targetDirectory = config.targetDirectory.strip()
        targetPath = config.targetPath

        print(targetPath)

        # instantiate hadoop
        hdfs.hdfs()

        # read from hadoop; close the handle when done
        with hdfs.open(targetPath) as fileToRead:
            print(fileToRead.read())
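Note: for one-shot reads and writes, pydoop also exposes module-level helpers, so an explicit handle is not strictly required. A minimal sketch (the path is illustrative):

    import pydoop.hdfs as hdfs

    hdfs.dump(b"hello from pydoop\n", "tmp/hello.txt", mode="wb")  # write bytes to an HDFS path
    print(hdfs.load("tmp/hello.txt"))                              # read the whole file back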
Code Example #13
def save_checkpoint(path, session=None):
    if session is None:
        session = tf.get_default_session()
    if session is None:
        raise RuntimeError("no session specified and no current session")
    saver = tf.train.Saver()
    wd = tempfile.mkdtemp(prefix="pydeep_")
    sub_d = hdfs.path.splitext(hdfs.path.basename(path))[0]
    abs_d = os.path.join(wd, sub_d)
    os.makedirs(abs_d)
    saver.save(session, os.path.join(abs_d, Model.CHECKPOINT_NAME))
    zip_fn = "%s.zip" % abs_d
    shutil.make_archive(*zip_fn.rsplit(".", 1), root_dir=abs_d)
    with hdfs.hdfs() as fs, hdfs.hdfs("", 0) as local_fs:
        local_fs.copy(zip_fn, fs, path)
Code Example #14
 def stat_on_local(self):
     wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR)
     p_ = os.path.join(wd_, make_random_str())
     if hdfs.default_is_local():
         wd, p = wd_, p_
         host = "default"
     else:
         wd, p = ('file:%s' % _ for _ in (wd_, p_))
         host = ""
     fs = hdfs.hdfs(host, 0)
     with fs.open_file(p_, 'w') as fo:
         fo.write(make_random_str())
     info = fs.get_path_info(p_)
     fs.close()
     s = hdfs.path.stat(p)
     os_s = os.stat(p_)
     for n in dir(s):
         if n.startswith('st_'):
             try:
                 exp_v = getattr(os_s, n)
             except AttributeError:
                 try:
                     exp_v = info[self.NMAP[n]]
                 except KeyError:
                     continue
                 self.assertEqual(getattr(s, n), exp_v)
     self.__check_extra_args(s, info)
     self.__check_wrapper_funcs(p)
     hdfs.rmr(wd)
Code Example #15
File: seqal_run.py Project: pinno/seal
    def run(self):
        if self.options is None:
            raise RuntimeError("You must call parse_cmd_line before run")

        if self.logger.isEnabledFor(logging.DEBUG):
            self.logger.debug("Running Seqal")
            self.logger.debug("Properties:\n%s", "\n".join( sorted([ "%s = %s" % (str(k), str(v)) for k,v in self.properties.iteritems() ]) ))
        self.logger.info("Input: %s; Output: %s; reference: %s", self.options.input, self.options.output, self.options.reference)

        try:
            self.hdfs = phdfs.hdfs('default', 0)
            self.__validate()

            self.remote_bin_name = tempfile.mktemp(prefix='seqal_bin.', suffix=str(random.random()), dir='')
            try:
                with self.hdfs.open_file(self.remote_bin_name, 'w') as script:
                    self.__write_pipes_script(script)

                full_name = self.hdfs.get_path_info(self.remote_bin_name)['name']

                return seal_utilities.run_pipes(full_name, self.options.input, self.options.output,
                    properties=self.properties, args_list=self.left_over_args)
            finally:
                try:
                    self.hdfs.delete(self.remote_bin_name) # delete the temporary pipes script from HDFS
                    self.logger.debug("pipes script %s deleted", self.remote_bin_name)
                except:
                    self.logger.error("Error deleting the temporary pipes script %s from HDFS", self.remote_bin_name)
                    ## don't re-raise the exception.  We're on our way out
        finally:
            if self.hdfs:
                tmp = self.hdfs
                self.hdfs = None
                tmp.close()
                self.logger.debug("HDFS closed")
Code Example #16
File: test_hdfs_fs.py Project: onlynone/pydoop
 def connect(self):
   for host, port in self.hp_cases:
     for user in self.u_cases:
       expected_user = user or CURRENT_USER
       fs = hdfs.hdfs(host, port, user=user)
       self.assertEqual(fs.user, expected_user)
       fs.close()
Code Example #17
    def build_map(self, top_dir):
        """\
        For each subdir (corresponding to an image class), build the full
        list of (filename, offset) pairs from which each bottleneck dump can be
        retrieved.

        {'dandelion': [
            ('part-m-00000', 0),
            ('part-m-00000', 8192),
            ...
            ('part-m-00003', 163840)
        ],
        'roses': [
            ('part-m-00000', 0),
            ...
        ]}
        """
        m = {}
        basename = hdfs.path.basename
        with hdfs.hdfs() as fs:
            for stat in fs.list_directory(top_dir):
                if stat['kind'] != 'directory':
                    continue
                subd = stat['name']
                positions = []
                for s in fs.list_directory(subd):
                    bname = basename(s["name"])
                    if bname.startswith("_"):
                        continue
                    assert s["size"] % self.record_size == 0
                    for i in range(0, s["size"], self.record_size):
                        positions.append((bname, i))
                m[basename(subd)] = positions
        return m
Code Example #18
 def __create_data_file(self):
     host, port, path = split_hdfs_path(self.data_file_name)
     fs = hdfs(host, port)
     f = fs.open_file(path, os.O_WRONLY, 0, 0, 0)
     f.write(self.f.getvalue())
     f.close()
     fs.close()
Code Example #19
class StorageHandler:

    hdfsobj = hdfs.hdfs()

    def __init__(self, host, port):
        self.hdfsobj = hdfs.hdfs(host,
                                 port,
                                 user="******",
                                 groups=["vagrant"])

    def pwd(self):
        return self.hdfsobj.working_directory()

    def listDirectory(self, path="/"):
        return self.hdfsobj.list_directory(path)

    def delete(self, path):
        self.hdfsobj.delete(path, False)

    def put(self, source, destination):
        hdfs.put(source, destination)

    def copyFile(self, source, destination):
        self.hdfsobj.copy(source, self.hdfsobj, destination)

    def write(self, path, mod, data):
        with hdfs.open(path, mod) as f:
            f.write(data)
Code Example #20
File: db.py Project: AshinGau/eventdb
 def __init__(self, tableName, host='192.168.60.64', infoTable='runInfo'):
     self.tableName = tableName
     self.conn = happybase.Connection(host)
     self.table = self.conn.table(infoTable)
     self.eventdb = self.conn.table('HEP:' + tableName)
     self.escape = escape()
     self.fs = hdfs.hdfs(host=host, port=8022, user='******')
Code Example #21
File: test_hdfs_fs.py Project: kmatzen/pydoop
 def connect(self):
     for host, port in self.hp_cases:
         for user in self.u_cases:
             expected_user = user or CURRENT_USER
             fs = hdfs.hdfs(host, port, user=user)
             self.assertEqual(fs.user, expected_user)
             fs.close()
Code Example #22
File: test_path.py Project: ilveroluca/pydoop
 def setUp(self):
   if hdfs.default_is_local():
     self.root = "file:"
   else:
     fs = hdfs.hdfs("default", 0)
     self.root = "hdfs://%s:%s" % (fs.host, fs.port)
     fs.close()
Code Example #23
File: script.py Project: ilveroluca/pydoop
  def __warn_user_if_wd_maybe_unreadable(self, abs_remote_path):
    """
    Check directories above the remote module and issue a warning if
    they are not traversable by all users.

    The reasoning behind this is mainly aimed at set-ups with a centralized
    Hadoop cluster, accessed by all users, and where the Hadoop task tracker
    user is not a superuser; an example may be if you're running a shared
    Hadoop without HDFS (using only a POSIX shared file system).  The task
    tracker correctly changes user to the job requester's user for most
    operations, but not when initializing the distributed cache, so jobs that
    want to place files not accessible by the Hadoop user into dist cache fail.
    """
    host, port, path = hdfs.path.split(abs_remote_path)
    if host == '' and port == 0: # local file system
      host_port = "file:///"
    else:
      # FIXME: this won't work with any scheme other than hdfs:// (e.g., s3)
      host_port = "hdfs://%s:%s/" % (host, port)
    path_pieces = path.strip('/').split(os.path.sep)
    fs = hdfs.hdfs(host, port)
    for i in xrange(0, len(path_pieces)):
      part = os.path.join(host_port, os.path.sep.join(path_pieces[0:i+1]))
      permissions = fs.get_path_info(part)['permissions']
      if permissions & 0111 != 0111:
        self.logger.warning(
          "the remote module %s may not be readable\n" +
          "by the task tracker when initializing the distributed cache.\n" +
          "Permissions on path %s: %s", abs_remote_path, part, oct(permissions))
        break
Code Example #24
File: test_path.py Project: onlynone/pydoop
 def setUp(self):
   if hdfs.default_is_local():
     self.root = "file:"
   else:
     fs = hdfs.hdfs("default", 0)
     self.root = "hdfs://%s:%s" % (fs.host, fs.port)
     fs.close()
Code Example #25
File: hdfs.py Project: nihil0/hops-util-py
def get():
    """ Get a handle to pydoop hdfs using the default namenode (specified in hadoop config)

    Returns:
        Pydoop hdfs handle
    """
    return hdfs.hdfs('default', 0, user=project_user())
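Note: a hedged usage sketch, assuming the hops package layout shown above (hops.hdfs exposing get()); the directory listing below is illustrative:

    from hops import hdfs as hopsfs

    fs = hopsfs.get()  # handle to the project's default namenode
    try:
        for entry in fs.list_directory(fs.working_directory()):
            print(entry["name"], entry["size"])
    finally:
        fs.close()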
Code Example #26
File: submit.py Project: muhammadyaseen/pydoop
    def __warn_user_if_wd_maybe_unreadable(self, abs_remote_path):
        """
        Check directories above the remote module and issue a warning if
        they are not traversable by all users.

        The reasoning behind this is mainly aimed at set-ups with a
        centralized Hadoop cluster, accessed by all users, and where
        the Hadoop task tracker user is not a superuser; an example
        may be if you're running a shared Hadoop without HDFS (using
        only a POSIX shared file system).  The task tracker correctly
        changes user to the job requester's user for most operations,
        but not when initializing the distributed cache, so jobs that
        want to place files not accessible by the Hadoop user into
        dist cache fail.
        """
        host, port, path = hdfs.path.split(abs_remote_path)
        if host == '' and port == 0:  # local file system
            host_port = "file:///"
        else:
            # FIXME: this won't work with any scheme other than
            # hdfs:// (e.g., s3)
            host_port = "hdfs://%s:%s/" % (host, port)
        path_pieces = path.strip('/').split(os.path.sep)
        fs = hdfs.hdfs(host, port)
        for i in range(0, len(path_pieces)):
            part = os.path.join(host_port,
                                os.path.sep.join(path_pieces[0:i + 1]))
            permissions = fs.get_path_info(part)['permissions']
            if permissions & 0o111 != 0o111:
                self.logger.warning(
                    ("remote module %s may not be readable by the task "
                     "tracker when initializing the distributed cache.  "
                     "Permissions on %s: %s"), abs_remote_path, part,
                    oct(permissions))
                break
Code Example #27
File: test_path.py Project: kikkomep/pydoop
 def stat_on_local(self):
     wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR)
     p_ = os.path.join(wd_, make_random_str())
     if hdfs.default_is_local():
         wd, p = wd_, p_
         host = "default"
     else:
         wd, p = ('file:%s' % _ for _ in (wd_, p_))
         host = ""
     fs = hdfs.hdfs(host, 0)
     with fs.open_file(p_, 'w') as fo:
         fo.write(make_random_str())
     info = fs.get_path_info(p_)
     fs.close()
     s = hdfs.path.stat(p)
     os_s = os.stat(p_)
     for n in dir(s):
         if n.startswith('st_'):
             try:
                 exp_v = getattr(os_s, n)
             except AttributeError:
                 try:
                     exp_v = info[self.NMAP[n]]
                 except KeyError:
                     continue
                 self.assertEqual(getattr(s, n), exp_v)
     self.__check_extra_args(s, info)
     self.__check_wrapper_funcs(p)
     hdfs.rmr(wd)
Code Example #28
def load_checkpoint(path, session=None):
    if session is None:
        session = tf.get_default_session()
    if session is None:
        raise RuntimeError("no session specified and no current session")
    wd = tempfile.mkdtemp(prefix="pydeep_")
    zip_fn = os.path.join(wd, hdfs.path.basename(path))
    with hdfs.hdfs() as fs, hdfs.hdfs("", 0) as local_fs:
        fs.copy(path, local_fs, zip_fn)
    unpack_dir = os.path.splitext(zip_fn)[0]
    shutil.unpack_archive(zip_fn, unpack_dir)
    ckpt_path = os.path.join(unpack_dir, Model.CHECKPOINT_NAME)
    metagraph_path = "%s.meta" % ckpt_path
    if not os.path.isfile(metagraph_path):
        raise RuntimeError("checkpoint files not found in %s" % zip_fn)
    saver = tf.train.import_meta_graph(metagraph_path)
    saver.restore(session, ckpt_path)
Code Example #29
 def __init__(self, context):
     super(WholeFileReader, self).__init__(context)
     self.logger = LOGGER.getChild("WholeFileReader")
     raw_split = context.get_input_split(raw=True)
     self.isplit = OpaqueInputSplit().read(io.BytesIO(raw_split))
     self.paths = self.isplit.payload
     self.n_paths = len(self.paths)
     self.fs = hdfs.hdfs()
Code Example #30
File: parquet.py Project: BigUtrecht/BigUtrecht
def clearLocatie():
    """
    Removes the locatie parquet table, if it exists
    :return: None
    """
    h = hdfs()
    if h.exists(LOCATIE):
        h.delete(LOCATIE)
Code Example #31
File: parquet.py Project: BigUtrecht/BigUtrecht
def clearTelling():
    """
    Removes the telling parquet table, if it exists
    :return: None
    """
    h = hdfs()
    if h.exists(TELLING):
        h.delete(TELLING)
Code Example #32
File: check_results.py Project: kikkomep/pydoop
def get_res(output_dir):
    fs = hdfs()
    data = []
    for x in fs.list_directory(output_dir):
        if os.path.split(x['path'])[-1].startswith('part-'):
            with fs.open_file(x['path']) as f:
                data.append(f.read())
    all_data = ''.join(data)
    return pts.parse_mr_output(all_data, vtype=int)
Code Example #33
def get_res(output_dir):
    fs = hdfs()
    data = []
    for x in fs.list_directory(output_dir):
        if os.path.split(x['path'])[-1].startswith('part-'):
            with fs.open_file(x['path'], 'rt') as f:
                data.append(f.read())
    all_data = ''.join(data)
    return pts.parse_mr_output(all_data, vtype=int)
Code Example #34
File: distblast_pipes.py Project: 16NWallace/bcbb
 def __init__(self, context):
     super(FastaReader, self).__init__()
     self.logger = logging.getLogger(self.__class__.__name__)
     self.isplit = InputSplit(context.getInputSplit())
     self.host, self.port, self.fpath = split_hdfs_path(self.isplit.filename)
     self.fs = hdfs(self.host, self.port)
     self.file = self.fs.open_file(self.fpath, os.O_RDONLY)
     self._iterator = (SeqIO.parse(self.file, "fasta") if
                       self.isplit.offset == 0 else None)
Code Example #35
File: check_results.py Project: xuande/pydoop
def compute_vc(input_dir):
    fs = hdfs()
    data = []
    for x in fs.list_directory(input_dir):
        with fs.open_file(x['path']) as f:
            data.append(f.read())
    all_data = ''.join(data)
    vowels = re.findall('[AEIOUY]', all_data.upper())
    return Counter(vowels)
Code Example #36
File: check_results.py Project: kikkomep/pydoop
def compute_vc(input_dir):
    fs = hdfs()
    data = []
    for x in fs.list_directory(input_dir):
        with fs.open_file(x['path']) as f:
            data.append(f.read())
    all_data = ''.join(data)
    vowels = re.findall('[AEIOUY]', all_data.upper())
    return Counter(vowels)
Code Example #37
File: distblast_pipes.py Project: Pfiver/RNA-Seqlyze
 def __init__(self, context):
     super(FastaReader, self).__init__()
     self.logger = logging.getLogger(self.__class__.__name__)
     self.isplit = InputSplit(context.getInputSplit())
     self.host, self.port, self.fpath = split_hdfs_path(
         self.isplit.filename)
     self.fs = hdfs(self.host, self.port)
     self.file = self.fs.open_file(self.fpath, os.O_RDONLY)
     self._iterator = (SeqIO.parse(self.file, "fasta")
                       if self.isplit.offset == 0 else None)
Code Example #38
 def runTest(self):
     current_user = getpass.getuser()
     cwd = os.getcwd()
     os.chdir(tempfile.gettempdir())
     for user in None, current_user, "nobody":
         expected_user = current_user
         fs = hdfs.hdfs("", 0, user=user)
         self.assertEqual(fs.user, expected_user)
         fs.close()
     os.chdir(cwd)
Code Example #39
File: parquet.py Project: BigUtrecht/BigUtrecht
def clearResults(name=""):
    """
    Clears target result parquet table name, or all result parquet tables if no name is given
    :param name: the target result parquet table name
    :return: None
    """
    p = path.join(RESULT_DIR, name)
    h = hdfs()
    if h.exists(p):
        h.delete(p)
Code Example #40
 def runTest(self):
     current_user = getpass.getuser()
     cwd = os.getcwd()
     os.chdir(tempfile.gettempdir())
     for user in None, current_user, "nobody":
         expected_user = current_user
         fs = hdfs.hdfs("", 0, user=user)
         self.assertEqual(fs.user, expected_user)
         fs.close()
     os.chdir(cwd)
Code Example #41
def main(directory, topic, byline):
    #get a hdfs object
    myHdfs = hdfs.hdfs()
    myPath = myHdfs.walk(directory)

    # a global variable
    global producer

    # Get a producer object
    producer = KafkaProducer(bootstrap_servers=["node4:6667"],
                             compression_type='gzip',
                             acks=1,
                             retries=2)

    for myfile in myPath:
        #Skip directory recursive
        if myfile["kind"] == "directory":
            logger.debug("ignoring %s" % (myfile))
            continue

        elif myfile["kind"] == "file":
            pass

        else:
            raise Exception, "Unknown kind %s for %s" % (myfile["kind"],
                                                         myfile["name"])

        #Skip particular names
        if "_SUCCESS" in myfile["name"] or "_temporary" in myfile["name"]:
            logger.debug("ignoring %s" % (myfile))
            continue

        #Skip 0 size files
        if myfile["size"] == 0:
            logger.debug("ignoring %s" % (myfile))
            continue

        logger.info("Working on %s" % (myfile["name"]))

        #call processChunk if I want to submit chunk
        if byline is False:
            processChunk(myfile, topic)

        else:
            #Otherwise submit line by line
            processLine(myfile, topic)

        #with file open
        logger.info("Completed %s" % (myfile["name"]))

        #sleep some time
        time.sleep(1)

    # for all files in HDFS
    producer.close()
Code Example #42
 def get_hosts(self):
     if hdfs.default_is_local():
         # only run on HDFS
         return
     hdfs.dump(self.data, self.hdfs_paths[0], mode="wb")
     fs = hdfs.hdfs("default", 0)
     hs = fs.get_hosts(self.hdfs_paths[0], 0, 10)
     self.assertTrue(len(hs) > 0)
     self.assertRaises(ValueError, fs.get_hosts, self.hdfs_paths[0], -10,
                       10)
     self.assertRaises(ValueError, fs.get_hosts, self.hdfs_paths[0], 0, -10)
Code Example #43
File: treewalk.py Project: xuande/pydoop
def main():
    fs = hdfs.hdfs()
    try:
        root = "%s/%s" % (fs.working_directory(), TEST_ROOT)
        if not isdir(fs, root):
            sys.exit("%r does not exist" % root)
        print "BS(MB)\tBYTES"
        for k, v in usage_by_bs(fs, root).iteritems():
            print "%.1f\t%d" % (k / float(MB), v)
    finally:
        fs.close()
Code Example #44
File: treewalk.py Project: CynthiaYiqingHuang/pydoop
def main():
    fs = hdfs.hdfs()
    try:
        root = "%s/%s" % (fs.working_directory(), TEST_ROOT)
        if not isdir(fs, root):
            sys.exit("%r does not exist" % root)
        print "BS(MB)\tBYTES"
        for k, v in usage_by_bs(fs, root).iteritems():
            print "%.1f\t%d" % (k / float(MB), v)
    finally:
        fs.close()
Code Example #45
def Get_stock_ticks(code, time_to_market):
    import tushare as ts
    import pandas as pd
    import logging
    import datetime as dt
    import os
    import socket
    import pydoop.hdfs as hdfs
    import shutil

    if time_to_market != 0:
        logger = logging.getLogger("D_stock")
        logger_handler = logging.FileHandler("/tmp/D_stock.log")
        logger_handler.setFormatter(
            logging.Formatter("%(asctime)s -- %(message)s"))
        logger_handler.setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)
        logger.addHandler(logger_handler)
        logger.info(">" * 15 + code + ">" * 15)

        all_days = pd.date_range(start=str(time_to_market),
                                 end=dt.date.today(),
                                 freq="B")
        all_days = [x.date() for x in all_days]
        for day in all_days[::-1]:
            logger.info("Saving " + code + "@" + str(day) + "...")
            while True:
                try:
                    df = ts.get_tick_data(code, date=day)
                except Exception as e:
                    print e
                    continue
                break

            if df.index.size > 3:
                dir_name = "/tmp/ticks/" + str(code)
                if not os.path.exists(dir_name):
                    os.makedirs(dir_name)

                file_name = dir_name + "/" + str(day) + ".csv"
                df.to_csv(file_name)
        """
        Write to HDFS        
        """
        if os.path.exists(dir_name):
            s = hdfs.hdfs(host="spark-1", port=9000)
            if not s.exists("ticks"):
                s.create_directory("ticks")
            hdfs.put(dir_name, "./ticks/")
            shutil.rmtree(dir_name)

        logger.info("<" * 15 + code + "<" * 15)
    return (socket.gethostname(), code)
Code Example #46
File: test_hdfs.py Project: kikkomep/pydoop
 def get_hosts(self):
     if hdfs.default_is_local():
         # only run on HDFS
         return
     hdfs.dump(self.data, self.hdfs_paths[0])
     fs = hdfs.hdfs("default", 0)
     hs = fs.get_hosts(self.hdfs_paths[0], 0, 10)
     self.assertTrue(len(hs) > 0)
     self.assertRaises(
         ValueError, fs.get_hosts, self.hdfs_paths[0], -10, 10
     )
     self.assertRaises(ValueError, fs.get_hosts, self.hdfs_paths[0], 0, -10)
Code Example #47
File: seqal_run.py Project: okulev/seal
    def run(self):
        if self.options is None:
            raise RuntimeError("You must call parse_cmd_line before run")

        if self.logger.isEnabledFor(logging.DEBUG):
            self.logger.debug("Running Seqal")
            self.logger.debug(
                "Properties:\n%s", "\n".join(
                    sorted([
                        "%s = %s" % (str(k), str(v))
                        for k, v in self.properties.iteritems()
                    ])))
        self.logger.info("Input: %s; Output: %s; reference: %s",
                         self.options.input, self.options.output,
                         self.options.reference)

        try:
            self.hdfs = phdfs.hdfs('default', 0)
            self.__validate()

            self.remote_bin_name = tempfile.mktemp(prefix='seqal_bin.',
                                                   suffix=str(random.random()),
                                                   dir='')
            try:
                with self.hdfs.open_file(self.remote_bin_name, 'w') as script:
                    self.__write_pipes_script(script)

                full_name = self.hdfs.get_path_info(
                    self.remote_bin_name)['name']

                return hadut.run_pipes(full_name,
                                       self.options.input,
                                       self.options.output,
                                       properties=self.properties,
                                       args_list=self.left_over_args)
            finally:
                try:
                    self.hdfs.delete(
                        self.remote_bin_name
                    )  # delete the temporary pipes script from HDFS
                    self.logger.debug("pipes script %s deleted",
                                      self.remote_bin_name)
                except:
                    self.logger.error(
                        "Error deleting the temporary pipes script %s from HDFS",
                        self.remote_bin_name)
                    ## don't re-raise the exception.  We're on our way out
        finally:
            if self.hdfs:
                tmp = self.hdfs
                self.hdfs = None
                tmp.close()
                self.logger.debug("HDFS closed")
Code Example #48
def main(args):
    host, port, out_dir = hdfs.path.split(args.out_dir)
    fs = hdfs.hdfs(host, port)
    fs.create_directory(out_dir)
    join = os.path.join
    for dt, path in get_images(args.in_dir):
        out_path = join(out_dir, f"{dt.strftime(OUT_FMT)}.png")
        if not args.overwrite and fs.exists(out_path):
            continue
        with io.open(path, "rb") as fi:
            with fs.open_file(out_path, "wb") as fo:
                fo.write(fi.read())
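Note: hdfs.path.split() breaks a full HDFS URI into (host, port, path), which is what lets the function above open a handle for whichever cluster holds the output directory. A minimal sketch (the URI is illustrative):

    import pydoop.hdfs as hdfs

    host, port, path = hdfs.path.split("hdfs://namenode:8020/user/someone/out")
    print(host, port, path)  # -> namenode 8020 /user/someone/out
    with hdfs.hdfs(host, port) as fs:
        fs.create_directory(path)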
Code Example #49
File: common_hdfs_tests.py Project: kmatzen/pydoop
 def copy(self):
   local_fs = hdfs.hdfs('', 0)
   local_wd = make_wd(local_fs)
   from_path = os.path.join(local_wd, uuid.uuid4().hex)
   content = uuid.uuid4().hex
   with open(from_path, "w") as f:
     f.write(content)
   to_path = self._make_random_file()
   local_fs.copy(from_path, self.fs, to_path)
   local_fs.close()
   with self.fs.open_file(to_path) as f:
     self.assertEqual(f.read(), content)
   shutil.rmtree(local_wd)
Code Example #50
File: kafka-producer.py Project: bunop/ccc-capstone
def main(directory, topic, byline):
    #get a hdfs object
    myHdfs = hdfs.hdfs()
    myPath = myHdfs.walk(directory)
    
    # a global variable
    global producer 

    # Get a producer object
    producer = KafkaProducer(bootstrap_servers=["node4:6667"], compression_type='gzip', acks=1, retries=2)
    
    for myfile in myPath:
        #Skip directory recursive
        if myfile["kind"] == "directory":
            logger.debug("ignoring %s" %(myfile))
            continue
        
        elif myfile["kind"] == "file":
            pass
        
        else:
            raise Exception, "Unknown kind %s for %s" %(myfile["kind"], myfile["name"])
            
        #Skip particular names
        if "_SUCCESS" in myfile["name"] or "_temporary" in myfile["name"]:
            logger.debug("ignoring %s" %(myfile))
            continue
        
        #Skip 0 size files
        if myfile["size"] == 0:
            logger.debug("ignoring %s" %(myfile))
            continue
        
        logger.info("Working on %s" %(myfile["name"]))

        #call processChunk if I want to submit chunk
        if byline is False:
            processChunk(myfile, topic)
            
        else:
            #Otherwise submit line by line
            processLine(myfile, topic)

        #with file open
        logger.info("Completed %s" %(myfile["name"]))
        
        #sleep some time
        time.sleep(1)
                    
    # for all files in HDFS
    producer.close()
Code Example #51
File: D_stock_cloud.py Project: tek-life/D_stock
def Get_stock_ticks(code, time_to_market):
    import tushare as ts
    import pandas as pd
    import logging
    import datetime as dt
    import os
    import socket
    import pydoop.hdfs as hdfs
    import shutil

    if time_to_market != 0:
        logger = logging.getLogger("D_stock")
        logger_handler = logging.FileHandler("/tmp/D_stock.log")
        logger_handler.setFormatter(logging.Formatter("%(asctime)s -- %(message)s"))
        logger_handler.setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)
        logger.addHandler(logger_handler)
        logger.info(">"*15+code+">"*15)

        all_days=pd.date_range(start=str(time_to_market),end=dt.date.today(),freq="B")
        all_days=[x.date() for x in all_days]
        for day in all_days[::-1]:
            logger.info("Saving "+code+"@"+str(day)+"...")
            while True:
                try:
                    df=ts.get_tick_data(code,date=day)
                except Exception as e:
                    print e
                    continue
                break

            if df.index.size >3:
                dir_name="/tmp/ticks/"+str(code)
                if not os.path.exists(dir_name):
                    os.makedirs(dir_name)

                file_name=dir_name+"/"+str(day)+".csv"
                df.to_csv(file_name)
        """
        Write to HDFS        
        """
        if os.path.exists(dir_name):
            s=hdfs.hdfs(host="spark-1",port=9000)
            if not s.exists("ticks"):
                s.create_directory("ticks")
            hdfs.put(dir_name,"./ticks/")
            shutil.rmtree(dir_name)

        logger.info("<"*15+code+"<"*15)
    return (socket.gethostname(),code)
Code Example #52
File: try_hdfs.py Project: ilveroluca/pydoop
def main(argv=sys.argv[1:]):
  parser = argparse.ArgumentParser(description=__doc__)
  parser.add_argument("--conf-dir", metavar="HADOOP_CONF_DIR")
  args = parser.parse_args(argv)
  if args.conf_dir:
    os.environ["HADOOP_CONF_DIR"] = os.path.abspath(args.conf_dir)
    hdfs.reset()
  fs = hdfs.hdfs()
  print "--- OPEN ---"
  dump_status(fs)
  print "cwd:", fs.working_directory()
  print
  fs.close()
  print "--- CLOSED ---"
  dump_status(fs)
Code Example #53
File: test_hdfs.py Project: ZEMUSHKA/pydoop
 def setUp(self):
   wd = tempfile.mkdtemp()
   wd_bn = os.path.basename(wd)
   self.local_wd = "file:%s" % wd
   fs = hdfs.hdfs("default", 0)
   fs.create_directory(wd_bn)
   self.hdfs_wd = fs.get_path_info(wd_bn)["name"]
   fs.close()
   basenames = ["test_path_%d" % i for i in xrange(2)]
   self.local_paths = ["%s/%s" % (self.local_wd, bn) for bn in basenames]
   self.hdfs_paths = ["%s/%s" % (self.hdfs_wd, bn) for bn in basenames]
   self.data = make_random_data(4*BUFSIZE + BUFSIZE/2)
   for path in self.local_paths:
     self.assertTrue(path.startswith("file:"))
   for path in self.hdfs_paths:
     if not hdfs.default_is_local():
       self.assertTrue(path.startswith("hdfs:"))
Code Example #54
File: test_path.py Project: kikkomep/pydoop
 def stat(self):
     if hdfs.default_is_local():
         return
     bn = '%s%s' % (make_random_str(), UNI_CHR)
     fn = '/user/%s/%s' % (DEFAULT_USER, bn)
     fs = hdfs.hdfs("default", 0)
     p = "hdfs://%s:%s%s" % (fs.host, fs.port, fn)
     with fs.open_file(fn, 'w') as fo:
         fo.write(make_random_str())
     info = fs.get_path_info(fn)
     fs.close()
     s = hdfs.path.stat(p)
     for n1, n2 in self.NMAP.iteritems():
         attr = getattr(s, n1, None)
         self.assertFalse(attr is None)
         self.assertEqual(attr, info[n2])
     self.__check_extra_args(s, info)
     self.__check_wrapper_funcs(p)
     hdfs.rmr(p)
Code Example #55
File: workflow.py Project: ilveroluca/flink-pipeline
def _clean_up_bcl_output(output_dir):
    """
    Delete prq files with no data
    """
    host, port, _ = phdfs.path.split(output_dir)
    fs = phdfs.hdfs(host, port)
    count = 0
    for item in fs.walk(output_dir):
        if item['kind'] == 'file' and item['name'].endswith('.gz') and item['size'] < 30:
            if not item['name'].startswith('hdfs://'):
                raise RuntimeError("Insanity!  Tring to delete %s!", item['name'])
            fs.delete(item['name'], recursive=False)
            count += 1
    logger.info("Removed %d empty files from bcl output", count)

    undet_path = os.path.join(output_dir, 'Undetermined')
    if phdfs.path.exists(undet_path):
        logger.info("Removing reads from Undetermined dataset %s", undet_path)
        fs.delete(undet_path)
Code Example #56
File: treegen.py Project: ZEMUSHKA/pydoop
def main(argv):
  
  try:
    depth = int(argv[1])
    span = int(argv[2])
  except IndexError:
    print "Usage: python %s DEPTH SPAN" % argv[0]
    sys.exit(2)

  fs = hdfs.hdfs()
  try:
    root = "%s/%s" % (fs.working_directory(), TEST_ROOT)
    try:
      fs.delete(root)
    except IOError:
      pass
    fs.create_directory(root)
    treegen(fs, root, depth, span)
  finally:
    fs.close()
Code Example #57
    def rename_compressed_files(self, file_table):
        # find the extension
        output_files = hdfs.ls(self.output_path)
        if len(output_files) == 0:
            return

        compressor_extension = self.get_compressor_extension(output_files)
        self.log.debug("compressor extension is %s", compressor_extension)

        hdfs_host, hdfs_port, _ = hdfs.path.split(output_files[0])
        if hdfs_host == '':
            is_local_fs = True
        else:
            is_local_fs = False
            output_hdfs = hdfs.hdfs(hdfs_host, hdfs_port)

        file_table.seek(0)
        for mapid, line in enumerate(file_table.xreadlines()):
            _, _, relative_output_name = line.rstrip('\n').split('\t')
            # we expect the map task ids to be assigned in the same order as the input
            # file list, so we can match the input file to an output file by its position
            # in the input file list.
            hadoop_output = os.path.join(self.output_path, "part-%05d" % mapid) + compressor_extension
            desired_file_name = os.path.join(self.output_path, relative_output_name) + compressor_extension
            if hadoop_output != desired_file_name:
                self.log.debug("renaming %s to %s", hadoop_output, desired_file_name)
                if is_local_fs:
                    # Though we could transparently use hdfs.move for both local fs and hdfs,
                    # using native methods for the local fs should be faster.
                    # os.renames automatically creates necessary parent directories for destination.
                    os.renames(urlparse(hadoop_output).path, urlparse(desired_file_name).path)
                else:
                    # create the output subdirectory, if necessary
                    dirname = os.path.dirname(relative_output_name)
                    if dirname:
                        output_hdfs.create_directory( os.path.join(self.output_path, dirname) )
                    if output_hdfs.exists(desired_file_name):
                        raise RuntimeError("Can't overwrite file in output directory: %s" % desired_file_name)
                    output_hdfs.move(hadoop_output, output_hdfs, desired_file_name)
Code Example #58
File: test_opaque.py Project: crs4/pydoop
 def setUp(self):
     self.fs = hdfs()
     self.wd = utils.make_wd(self.fs)
Code Example #59
File: mr_blast.py Project: crs4/vispa
def main(argv):

  parser = make_parser()
  opt, args = parser.parse_args()
  try:
    input_fasta = args[0]
    db_archive = args[1]
  except IndexError:
    parser.print_help()
    sys.exit(2)

  STR_GENERATOR.prefix = os.path.basename(input_fasta)

  logger = logging.getLogger()
  for h in logger.handlers:
    logger.removeHandler(h)
  opt.log_level_str = opt.log_level
  opt.log_level = getattr(logging, opt.log_level)
  kwargs = {'format': LOG_FORMAT,
            'datefmt': LOG_DATEFMT,
            'level': opt.log_level}
  if opt.log_file:
    kwargs['filename'] = opt.log_file
  logging.basicConfig(**kwargs)

  logger.debug("cli args: %r" % (args,))
  logger.debug("cli opts: %s" % opt)

  if opt.mr_dump_file:
    opt.mr_dump_file = open(opt.mr_dump_file, "w")
  else:
    opt.mr_dump_file = sys.stderr
  
  if not opt.blast_db:
    opt.blast_db = os.path.basename(db_archive).split(".", 1)[0]
    logger.info("--blast-db not provided: setting to %r" % opt.blast_db)
  
  os.environ["HADOOP_HOME"] = opt.hadoop_home
  if not opt.hadoop:
    opt.hadoop = os.path.join(opt.hadoop_home, "bin/hadoop")
  if not opt.hadoop_conf_dir:
    opt.hadoop_conf_dir = os.path.join(opt.hadoop_home, "conf")
  os.environ["HADOOP_CONF_DIR"] = opt.hadoop_conf_dir
  hdfs.reset()

  fs = hdfs.hdfs()
  logger.debug("hdfs params: host=%s, port=%d" % (fs.host, fs.port))
  lfs = hdfs.hdfs("", 0)
  runner = Runner(fs, lfs, logger)

  try:
    db_archive_hdfs = runner.upload_archive(db_archive)
    blast_input_hdfs = runner.run_f2t(input_fasta, opt)
    blast_output_hdfs = runner.run_blast(blast_input_hdfs, db_archive_hdfs,
                                         opt)
    runner.collect_output(blast_output_hdfs, opt)
    logger.info("all done")
  finally:
    lfs.close()
    fs.close()
    if opt.mr_dump_file is not sys.stderr:
      opt.mr_dump_file.close()
Code Example #60
File: common_hdfs_tests.py Project: tivvit/pydoop
 def setUp(self):
     self.fs = hdfs.hdfs(self.hdfs_host, self.hdfs_port)
     self.wd = utils.make_wd(self.fs)