Example #1
def write(idir, odir, remove, check, verbose):
    "Write files from given input area into HDFS"
    if  not os.path.isdir(idir):
        print("Source area %s does not exists" % idir)
        sys.exit(1)
    if  not hdfs.path.isdir(odir):
        print("Destination area on HDFS %s does not exists" % odir)
        print("Create it first with the following command")
        print("hadoop fs -mkdir %s" % odir)
        sys.exit(1)
    for name in os.listdir(idir):
        fname = os.path.join(idir, name)
        if  not (name.endswith('.avro') or \
            name.endswith('.avro.gz') or \
            name.endswith('.avro.bz2')):
            if  verbose:
                print("Skip %s" % fname)
            continue
        oname = hdfs_file(odir, name)
        if  not hdfs.path.isfile(oname):
            if  verbose:
                print("Migrate %s to %s" % (fname, oname))
            hdfs.put(fname, oname)
            if  check:
                fsize = os.stat(fname).st_size
                osize = hdfs.stat(oname).st_size
                if  fsize != osize:
                    print("Size %s (%s) != %s (%s)" % (fname, fsize, oname, osize))
                    sys.exit(1)
            if  remove:
                os.remove(fname)
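
A minimal, self-contained sketch of the copy-verify-remove pattern used in Example #1, assuming pydoop is installed and the HDFS destination directory already exists; the paths in the usage comment are illustrative.

import os
import pydoop.hdfs as hdfs

def safe_migrate(local_file, hdfs_dir):
    """Copy local_file into hdfs_dir, verify the size, then delete the local copy."""
    target = hdfs.path.join(hdfs_dir, os.path.basename(local_file))
    if hdfs.path.isfile(target):
        return  # already migrated
    hdfs.put(local_file, target)
    if os.stat(local_file).st_size != hdfs.stat(target).st_size:
        raise IOError("size mismatch after put: %s -> %s" % (local_file, target))
    os.remove(local_file)

# safe_migrate("/data/events.avro", "/user/someuser/avro")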
Example #2
def commit_file_compressed(srcfile, user_id, extension):
    dstfilename = get_filename(user_id, extension, create=True, hdfs_dest=USE_HDFS)

    with profiled("Uploading of output in %s"):
        # Atomic rename on POSIX
        log.msg("Renaming %s to %s" % (srcfile.name, dstfilename))
        srcfile.close()

        # Race condition here?
        if USE_HDFS:

            if hdfs.path.exists(dstfilename):
                if hdfs.path.exists(dstfilename + '.new'):
                    log.msg("Apparently a crashed worker left an unused file left")
                    hdfs_handle.delete(dstfilename + '.new')

                hdfs.put(srcfile.name, dstfilename + '.new')
                hdfs_handle.delete(dstfilename)
                hdfs_handle.rename(dstfilename + '.new', dstfilename)
            else:
                hdfs.put(srcfile.name, dstfilename)

            os.unlink(srcfile.name)
        else:
            os.rename(srcfile.name, dstfilename)
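
Example #2 relies on an hdfs_handle created elsewhere in that module; presumably it is a pydoop hdfs.hdfs() connection, since delete and rename are instance methods (compare Examples #19 and #38). A self-contained sketch of the same put-to-temp-then-swap step, using an illustrative default connection:

import pydoop.hdfs as hdfs

def replace_on_hdfs(local_file, hdfs_dest):
    fs = hdfs.hdfs(host="default", port=0)   # plays the role of hdfs_handle above
    tmp = hdfs_dest + ".new"
    if fs.exists(tmp):
        fs.delete(tmp)                       # leftover from a crashed run
    hdfs.put(local_file, tmp)                # upload next to the live file
    if fs.exists(hdfs_dest):
        fs.delete(hdfs_dest)
    fs.rename(tmp, hdfs_dest)                # swap the new copy into place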
Example #3
 def put(self):
     src = hdfs.path.split(self.local_paths[0])[-1]
     dest = self.hdfs_paths[0]
     with open(src, "w") as f:
         f.write(self.data)
     hdfs.put(src, dest)
     with hdfs.open(dest) as fi:
         rdata = fi.read()
     self.assertEqual(rdata, self.data)
Example #4
def copy_file_2_remote_dir(remote_dir, log_file):
    LOGGER = logging.getLogger(__name__)
    suffix = time.strftime('%d-%m-%y_%H-%M-%S', time.gmtime(log_file.mtime))
    dest_filename = os.path.join(remote_dir, "{0}-{1}".format(log_file.filename, suffix))
    LOGGER.debug("Copying {0} to {1}".format(log_file.filepath, dest_filename))
    hdfs.put(log_file.filepath, dest_filename)
    LOGGER.debug("Copied {0} to HDFS".format(log_file.filepath))
    hdfs.chmod(dest_filename, BACKUP_PERMISSIONS)
    LOGGER.debug("Changed permissions for {0}".format(dest_filename))
Example #5
 def put(self):
   src = hdfs.path.split(self.local_paths[0])[-1]
   dest = self.hdfs_paths[0]
   with open(src, "w") as f:
     f.write(self.data)
   hdfs.put(src, dest)
   with hdfs.open(dest) as fi:
     rdata = fi.read()
   self.assertEqual(rdata, self.data)
Example #6
 def copy_from_local_to_hdfs(self,src_local_location,dest_local_location):
     if(dest_local_location==""):
         print "Not a valid hdfs path"
         return False
     elif os.path.exists(src_local_location)==True:
         hdfs.put(src_local_location, dest_local_location)
         return True
     else:
         print "Local destination does not exist"
         return False
Example #7
def get_mr_options(opt, wd):
    mr_options = BASE_MR_OPTIONS.copy()
    if opt.exclude_fn:
        exclude_bn = os.path.basename(opt.exclude_fn)
        exclude_fn = hdfs.path.abspath(hdfs.path.join(wd, exclude_bn))
        hdfs.put(opt.exclude_fn, exclude_fn)
        mr_options["mapred.cache.files"] = "%s#%s" % (exclude_fn, exclude_bn)
        mr_options["mapred.create.symlink"] = "yes"
        mr_options["ipcount.excludes"] = exclude_bn
    return mr_options
Example #8
def get_mr_options(opt, wd):
  mr_options = BASE_MR_OPTIONS.copy()
  if opt.exclude_fn:
    exclude_bn = os.path.basename(opt.exclude_fn)
    exclude_fn = hdfs.path.abspath(hdfs.path.join(wd, exclude_bn))
    hdfs.put(opt.exclude_fn, exclude_fn)
    mr_options["mapred.cache.files"] = "%s#%s" % (exclude_fn, exclude_bn)
    mr_options["mapred.create.symlink"] = "yes"
    mr_options["ipcount.excludes"] = exclude_bn
  return mr_options
Example #9
def copy_file_2_remote_dir(remote_dir, log_file):
    LOGGER = logging.getLogger(__name__)
    suffix = time.strftime('%d-%m-%y_%H-%M-%S', time.gmtime(log_file.mtime))
    dest_filename = os.path.join(remote_dir,
                                 "{0}-{1}".format(log_file.filename, suffix))
    LOGGER.debug("Copying {0} to {1}".format(log_file.filepath, dest_filename))
    hdfs.put(log_file.filepath, dest_filename)
    LOGGER.debug("Copied {0} to HDFS".format(log_file.filepath))
    hdfs.chmod(dest_filename, BACKUP_PERMISSIONS)
    LOGGER.debug("Changed permissions for {0}".format(dest_filename))
Example #10
File: hadut.py Project: crs4/pydoop
 def set_input(self, input_, put=False):
     """
     Set the input path for the job.  If ``put`` is :obj:`True`, copy
     (local) ``input_`` to the working directory.
     """
     if put and self.wd:
         self.logger.info("copying input data to HDFS")
         hdfs.put(input_, self.input)
     else:
         self.input = input_
         self.logger.info("assigning input to %s", self.input)
Example #11
 def set_input(self, input_, put=False):
     """
     Set the input path for the job.  If ``put`` is :obj:`True`, copy
     (local) ``input_`` to the working directory.
     """
     if put and self.wd:
         self.logger.info("copying input data to HDFS")
         hdfs.put(input_, self.input)
     else:
         self.input = input_
         self.logger.info("assigning input to %s" % self.input)
Example #12
def Get_stock_ticks(code, time_to_market):
    import tushare as ts
    import pandas as pd
    import logging
    import datetime as dt
    import os
    import socket
    import pydoop.hdfs as hdfs
    import shutil

    if time_to_market != 0:
        logger = logging.getLogger("D_stock")
        logger_handler = logging.FileHandler("/tmp/D_stock.log")
        logger_handler.setFormatter(
            logging.Formatter("%(asctime)s -- %(message)s"))
        logger_handler.setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)
        logger.addHandler(logger_handler)
        logger.info(">" * 15 + code + ">" * 15)

        all_days = pd.date_range(start=str(time_to_market),
                                 end=dt.date.today(),
                                 freq="B")
        all_days = [x.date() for x in all_days]
        for day in all_days[::-1]:
            logger.info("Saving " + code + "@" + str(day) + "...")
            while True:
                try:
                    df = ts.get_tick_data(code, date=day)
                except Exception as e:
                    print e
                    continue
                break

            if df.index.size > 3:
                dir_name = "/tmp/ticks/" + str(code)
                if not os.path.exists(dir_name):
                    os.makedirs(dir_name)

                file_name = dir_name + "/" + str(day) + ".csv"
                df.to_csv(file_name)
        """
        Write to HDFS        
        """
        if os.path.exists(dir_name):
            s = hdfs.hdfs(host="spark-1", port=9000)
            if not s.exists("ticks"):
                s.create_directory("ticks")
            hdfs.put(dir_name, "./ticks/")
            shutil.rmtree(dir_name)

        logger.info("<" * 15 + code + "<" * 15)
    return (socket.gethostname(), code)
Example #13
 def saveStockFile(self, stock_name, data_frame, hdfs_path):
     print("saving stock " + stock_name)
     outFile = self.fileLocalOutput + stock_name + ".csv"
     export_csv = data_frame.to_csv(outFile, index=None, header=True)
     print(outFile)
     from_path = outFile
     if (hdfs_path != ""):
         print(hdfs_path)
         #to_path ='hdfs://localhost:9000/user/xavier/US_Stocks/'+stock_name+'.csv'
         to_path = hdfs_path + stock_name + '.csv'
         print(from_path + "==>" + to_path)
         hdfs.put(from_path, to_path)
         os.remove(outFile)
Example #14
def Get_stock_ticks(code, time_to_market):
    import tushare as ts
    import pandas as pd
    import logging
    import datetime as dt
    import os
    import socket
    import pydoop.hdfs as hdfs
    import shutil

    if time_to_market !=0:
        logger = logging.getLogger("D_stock")
        logger_handler = logging.FileHandler("/tmp/D_stock.log")
        logger_handler.setFormatter(logging.Formatter("%(asctime)s -- %(message)s"))
        logger_handler.setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)
        logger.addHandler(logger_handler)
        logger.info(">"*15+code+">"*15)

        all_days=pd.date_range(start=str(time_to_market),end=dt.date.today(),freq="B")
        all_days=[x.date() for x in all_days]
        for day in all_days[::-1]:
            logger.info("Saving "+code+"@"+str(day)+"...")
            while True:
                try:
                    df=ts.get_tick_data(code,date=day)
                except Exception as e:
                    print e
                    continue
                break

            if df.index.size >3:
                dir_name="/tmp/ticks/"+str(code)
                if not os.path.exists(dir_name):
                    os.makedirs(dir_name)

                file_name=dir_name+"/"+str(day)+".csv"
                df.to_csv(file_name)
        """
        Write to HDFS        
        """
        if os.path.exists(dir_name):
            s=hdfs.hdfs(host="spark-1",port=9000)
            if not s.exists("ticks"):
                s.create_directory("ticks")
            hdfs.put(dir_name,"./ticks/")
            shutil.rmtree(dir_name)

        logger.info("<"*15+code+"<"*15)
    return (socket.gethostname(),code)
Example #15
def upsert_a_folder(src_dir, hdfs_tgt_dir, filename, debug):
    src_fname = os.path.join(src_dir, filename)
    tgt_fname = os.path.join(hdfs_tgt_dir, filename)
    # get target file info
    tgt_dict = {}
    try:
        lsl = hdfs.lsl(hdfs_tgt_dir)
        for i in lsl:
            try:
                tgt_dict[os.path.basename(i["name"])] = i["last_mod"]
            except:
                pass
    except:
        pass
    print "hdfs tgt_dict=", tgt_dict

    # get source info
    src_fs = glob.glob(src_fname)
    print "src_fs=", src_fs
    for sf in src_fs:
        # get source file info
        try:
            src_ctime_int = int(os.path.getctime(sf))
        except:
            src_ctime_int = None
        print "src_ctime_int=", src_ctime_int

        src_bfname = os.path.basename(sf)
        tgt_fname = os.path.join(hdfs_tgt_dir, src_bfname)
        # put or rm/put
        try:
            if not src_bfname in tgt_dict:
                #insert new one
                if debug == 'N':
                    hdfs.put(sf, hdfs_tgt_dir)
                else:
                    print "DEBUG: put ", src_bfname, "to", hdfs_tgt_dir
            elif src_ctime_int > tgt_dict[src_bfname]:
                if debug == 'N':
                    hdfs.rmr(tgt_fname)
                    hdfs.put(sf, hdfs_tgt_dir)
                else:
                    print "DEBUG: replace ", tgt_fname, "by", sf
            else:
                print tgt_fname, "has a newer mdate than", sf, ":", src_ctime_int
        except:
            e = sys.exc_info()[0]
            print "Error: ", e
Example #16
def copy_to_hdfs(local_path,
                 relative_hdfs_path,
                 overwrite=False,
                 project=None):
    """
    Copies a path from the local filesystem to the HDFS project (recursively); the local path is resolved relative to $CWD and written to the given project-relative HDFS path (hdfs_path)

    For example, if you execute:

    >>> copy_to_hdfs("data.tfrecords", "/Resources", project="demo")

    This will copy the file data.tfrecords to hdfs://Projects/demo/Resources/data.tfrecords

    Args:
        :local_path: Absolute or local path on the local filesystem to copy
        :relative_hdfs_path: a path in HDFS relative to the project root to where the local path should be written
        :overwrite: a boolean flag whether to overwrite if the path already exists in HDFS
        :project: name of the project, defaults to the current HDFS user's project
    """
    if project == None:
        project = project_name()

    if "PDIR" in os.environ:
        full_local = os.environ['PDIR'] + '/' + local_path
    else:
        # Absolute path
        if local_path.startswith(os.getcwd()):
            full_local = local_path
        else:
            # Relative path
            full_local = os.getcwd() + '/' + local_path

    hdfs_path = _expand_path(relative_hdfs_path, project, exists=False)

    if overwrite:
        hdfs_path = hdfs_path + "/" + os.path.basename(full_local)
        if exists(hdfs_path):
            # delete hdfs path since overwrite flag was set to true
            delete(hdfs_path, recursive=True)

    print("Started copying local path {} to hdfs path {}\n".format(
        local_path, hdfs_path))

    # copy directories from local path to HDFS project path
    hdfs.put(full_local, hdfs_path)

    print("Finished copying\n")
Example #17
def word_count():
    wordDict = defaultdict(int)
    filename = open("/home/marcus/tasks/Shakespeare.txt","r").read()
    filename = filename.lower()
    for ch in '"''!@#$%^&*()-_=+,<.>/?;:[{]}~`\|':
        filename = filename.replace(ch," ")
    for word in filename.split():
        if word not in wordDict:
            wordDict[word] = 1
        else:
            wordDict[word] = wordDict[word] + 1
    #print(wordDict["the"])

    with open('/home/marcus/Mindbender_BD/task2/python_output.txt', 'w') as file:
         file.write(json.dumps(wordDict))

    from_path = "/home/marcus/Mindbender_BD/task2/python_output.txt"
    to_path ='hdfs://localhost:9000/task2/outfile.txt'
    hdfs.put(from_path, to_path)
Example #18
    def setup(self):
        """
        * Creates an hdfs directory with the name of this test (self.make_hdfs_test_path())
        * uploads the local 'input' directory into the hdfs directory
        """
        self.logger.debug("Test setup")
        #hadut.run_hadoop_cmd_e("dfsadmin", args_list=["-safemode", "wait"])
        #self.logger.debug("hdfs out of safe mode")

        if hdfs.path.exists(self.make_hdfs_test_path()):
            error_msg = "hdfs test path '%s' already exists.  Please remove it" % self.make_hdfs_test_path()
            self.logger.fatal(error_msg)
            raise RuntimeError(error_msg)
        hdfs.mkdir(self.make_hdfs_test_path())
        local_input = self.make_local_input_path()
        hdfs_input = self.make_hdfs_input_path()
        hdfs.put(local_input, hdfs_input)
        self.logger.info("Copied local input %s to %s", local_input, hdfs_input)
        self.logger.debug("Setup complete")
Example #19
def copy_to_hdfs(local_path,
                 relative_hdfs_path,
                 overwrite=False,
                 project=None):
    """
    Copies a path from the local filesystem to the HDFS project (recursively); the local path is resolved relative to $CWD and written to the given project-relative HDFS path (hdfs_path)

    For example, if you execute:

    >>> copy_to_hdfs("data.tfrecords", "/Resources/", project="demo")

    This will copy the file data.tfrecords to hdfs://Projects/demo/Resources/data.tfrecords

    Args:
        :local_path: the path on the local filesystem to copy
        :relative_hdfs_path: a path in HDFS relative to the project root to where the local path should be written
        :overwrite: a boolean flag whether to overwrite if the path already exists in HDFS
        :project: name of the project, defaults to the current HDFS user's project
    """
    if project == None:
        project = project_name()

    if "PDIR" in os.environ:
        full_local = os.environ['PDIR'] + '/' + local_path
    else:
        full_local = os.getcwd() + '/' + local_path

    hdfs_path = _expand_path(relative_hdfs_path, project, exists=False)

    if overwrite:
        hdfs_handle = get()
        split = local_path.split('/')
        filename = split[len(split) - 1]
        if filename == '/':
            filename = split[len(split) - 2]
        full_project_path = hdfs_path + '/' + filename

        # check if project path exist, if so delete it (since overwrite flag was set to true)
        if hdfs_handle.exists(full_project_path):
            hdfs_handle.delete(full_project_path, recursive=True)

    # copy directories from local path to HDFS project path
    hdfs.put(full_local, hdfs_path)
Example #20
def capture(outpath, max_count='3'):
    """
    fab cam.capture:/tmp/cam1,3
    """
    max_count = int(max_count)
    import os
    import cv2
    import copy
    import pydoop.hdfs as hdfs

    cv2.namedWindow('Window1')
    vc = cv2.VideoCapture()
    vc.open(0)
    skip = 50
    max_count *= skip
    basename = os.path.basename(outpath)
    count = 1
    hdfs.mkdir('hdfs://gnn-f02-01' + outpath)
    while True:
        retval, image = vc.read()
        try:
            if count % skip == 0:
                tmpImage = copy.copy(image)
                filename = '%05d.jpg' % (count / skip)
                hdfspath = 'hdfs://gnn-f02-01%(outpath)s/%(filename)s' % locals(
                )
                cv2.putText(tmpImage, filename, (50, 50),
                            cv2.FONT_HERSHEY_SIMPLEX, 2, 2)
                cv2.imshow('Windows1', tmpImage)
                cv2.waitKey(1)
                cv2.imwrite(basename + '_' + filename, image)
                hdfs.put(basename + '_' + filename, hdfspath)
                print basename + '_' + filename, hdfspath
            else:
                cv2.imshow('Windows1', image)
                cv2.waitKey(1)
        except KeyboardInterrupt:
            break
        count += 1
        if 0 < max_count < count:
            break
    vc.release()
    cv2.destroyWindow('Window1')
Example #21
def capture(outpath, max_count='3'):
    """
    fab cam.capture:/tmp/cam1,3
    """
    max_count = int(max_count)
    import os
    import cv2
    import copy
    import pydoop.hdfs as hdfs

    cv2.namedWindow('Window1')
    vc = cv2.VideoCapture()
    vc.open(0)
    skip = 50
    max_count *= skip
    basename = os.path.basename(outpath)
    count = 1
    hdfs.mkdir('hdfs://gnn-f02-01' + outpath)
    while True:
        retval, image = vc.read()
        try:
            if count % skip == 0:
                tmpImage = copy.copy(image)
                filename = '%05d.jpg' % (count / skip)
                hdfspath = 'hdfs://gnn-f02-01%(outpath)s/%(filename)s' % locals()
                cv2.putText(tmpImage, filename, (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 2, 2)
                cv2.imshow('Windows1', tmpImage)
                cv2.waitKey(1)
                cv2.imwrite(basename + '_' + filename, image)
                hdfs.put(basename + '_' + filename, hdfspath)
                print basename + '_' + filename, hdfspath
            else:
                cv2.imshow('Windows1', image)
                cv2.waitKey(1)
        except KeyboardInterrupt:
            break
        count += 1
        if 0 < max_count < count:
            break
    vc.release()
    cv2.destroyWindow('Window1')
Example #22
  def __setup_remote_paths(self):
    """
    Actually create the working directory and copy the module into it.

    Note: the script has to be readable by Hadoop; though this may not
    generally be a problem on HDFS, where the Hadoop user is usually
    the superuser, things may be different if our working directory is
    on a shared POSIX filesystem.  Therefore, we make the directory
    and the script accessible by all.
    """
    pipes_code = self.__generate_pipes_code()
    hdfs.mkdir(self.remote_wd)
    hdfs.chmod(self.remote_wd, "a+rx")
    hdfs.dump(pipes_code, self.remote_exe)
    hdfs.chmod(self.remote_exe, "a+rx")
    hdfs.put(self.args.module, self.remote_module)
    hdfs.chmod(self.remote_module, "a+r")
    self.__warn_user_if_wd_maybe_unreadable(self.remote_wd)
    self.logger.debug("Created remote paths:")
    self.logger.debug(self.remote_wd)
    self.logger.debug(self.remote_exe)
    self.logger.debug(self.remote_module)
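
A condensed sketch of the staging pattern in Example #22: create a world-readable working directory on HDFS, dump the generated pipes code straight into an HDFS file, and upload the module next to it. The remote file names here are illustrative.

import pydoop.hdfs as hdfs

def stage_job_files(remote_wd, pipes_code, local_module):
    hdfs.mkdir(remote_wd)
    hdfs.chmod(remote_wd, "a+rx")
    remote_exe = hdfs.path.join(remote_wd, "exe")
    hdfs.dump(pipes_code, remote_exe)      # write a string directly to an HDFS file
    hdfs.chmod(remote_exe, "a+rx")
    remote_module = hdfs.path.join(remote_wd, "module.py")
    hdfs.put(local_module, remote_module)  # copy a local file to HDFS
    hdfs.chmod(remote_module, "a+r")
    return remote_exe, remote_module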
Example #23
    def __setup_remote_paths(self):
        """
        Actually create the working directory and copy the module into it.

        Note: the script has to be readable by Hadoop; though this may not
        generally be a problem on HDFS, where the Hadoop user is usually
        the superuser, things may be different if our working directory is
        on a shared POSIX filesystem.  Therefore, we make the directory
        and the script accessible by all.
        """
        pipes_code = self.__generate_pipes_code()
        hdfs.mkdir(self.remote_wd)
        hdfs.chmod(self.remote_wd, "a+rx")
        hdfs.dump(pipes_code, self.remote_exe)
        hdfs.chmod(self.remote_exe, "a+rx")
        hdfs.put(self.args.module, self.remote_module)
        hdfs.chmod(self.remote_module, "a+r")
        self.__warn_user_if_wd_maybe_unreadable(self.remote_wd)
        self.logger.debug("Created remote paths:")
        self.logger.debug(self.remote_wd)
        self.logger.debug(self.remote_exe)
        self.logger.debug(self.remote_module)
Example #24
def run_task(factory, port=None, istream=None, ostream=None,
             private_encoding=True, context_class=TaskContext,
             cmd_file=None, fast_combiner=False, auto_serialize=True):
    """
    Run the assigned task in the framework.

    :rtype: bool
    :return: :obj:`True` if the task succeeded.
    """
    connections = resolve_connections(
        port, istream=istream, ostream=ostream, cmd_file=cmd_file,
        auto_serialize=auto_serialize
    )
    context = context_class(connections.up_link,
                            private_encoding=private_encoding,
                            fast_combiner=fast_combiner)
    stream_runner = StreamRunner(factory, context, connections.cmd_stream)
    pstats_dir = os.getenv(PSTATS_DIR)
    if pstats_dir:
        pstats_fmt = os.getenv(PSTATS_FMT, DEFAULT_PSTATS_FMT)
        hdfs.mkdir(pstats_dir)
        fd, pstats_fn = tempfile.mkstemp(suffix=".pstats")
        os.close(fd)
        cProfile.runctx("stream_runner.run()",
                        {"stream_runner": stream_runner}, globals(),
                        filename=pstats_fn)
        name = pstats_fmt % (
            "r" if context.is_reducer() else "m",
            context.get_task_partition(), os.path.basename(pstats_fn)
        )
        hdfs.put(pstats_fn, hdfs.path.join(pstats_dir, name))
    else:
        stream_runner.run()
    context.close()
    connections.close()
    return True
Example #25
def upsert_a_file(src_dir, hdfs_tgt_dir, filename, debug):
    src_fname = os.path.join(src_dir, filename)
    tgt_fname = os.path.join(hdfs_tgt_dir, filename)
    # get source file info
    try:
        src_ctime_int = int(os.path.getctime(src_fname))
    except:
        src_ctime_int = None
    print "src_ctime_int=", src_ctime_int
    # get target file info
    try:
        tgt_stat = hdfs.stat(tgt_fname)
        tgt_mtime = tgt_stat.st_mtime
    except:
        tgt_mtime = None
    print "tgt_mtime=", tgt_mtime

    # put or rm/put
    try:
        if tgt_mtime is None:
            #insert new one
            if debug == 'N':
                hdfs.put(src_fname, hdfs_tgt_dir)
            else:
                print "DEBUG: put ", src_fname, "to", hdfs_tgt_dir
        elif src_ctime_int > tgt_mtime:
            if debug == 'N':
                hdfs.rmr(tgt_fname)
                hdfs.put(src_fname, hdfs_tgt_dir)
            else:
                print "DEBUG: replace ", tgt_fname, "by", src_fname
        else:
            print tgt_fname, "has a newer mdate:", tgt_mtime, "than", src_fname, ":", src_ctime_int
    except:
        e = sys.exc_info()[1]
        print "Error:", e
Example #26
def main(file_name, **kwargs):
    # check each file against the registry
    # determine if it's spam, a duplicate, or should be ingested
    # copy the file to the appropriate place, check for equal file size, and delete one of the files appropriately
    metadata = {}
    write_path = instance_guid = stage = header = ""
    file_type = "raw"
    # json.dumps(metadata)

    # Log the PID to help in debugging
    logger.info('Pid : ' + str(os.getpid()))
    try:
        # attempt to get the registry entry.  If Alfred isn't working properly we'll get a connection error
        if file_name.startswith('sbx_'):
            # asking the registry for sandbox file and stripping "sbx_" off the file name, keys don't have the prefix
            metadata = reg.get_metadata(file_name[4:], stage='sandbox')
        else:
            metadata = reg.get_metadata(file_name)

    except requests.ConnectionError as e:
        # log response error
        logger.error('Failed to connect to Alfred : ' + str(e))
        exit(e)

    if 'stage' in metadata:
        stage = metadata['stage']

    # get the count of the number of rows in the source file
    row_count = file_len(landing_zone + '/' + file_name)
    logger.info('row count = ' + str(row_count))

    if 'file' in metadata and metadata['file'] != {}:
        # a registry entry exists for the file, process it
        logger.info("Moving " + file_name + " to hdfs://" +
                    reg.file_path(metadata, **kwargs))

        # set the write path based on the metadata
        write_path = reg.file_path(metadata, **kwargs)
        logger.info("Moving " + file_name + " to " + write_path)

        if stage == 'sandbox' and hdfs.path.exists(write_path + '/' +
                                                   file_name):
            # in the case of sandbox files previous data is always overwritten
            logging.info("Sandbox file already exists, overwriting")
            # Delete from HDFS is not strictly needed if the table was created as external
            hdfs.rmr(write_path + '/' + file_name)
            # set up a hive connection
            hive = validator.Hive()
            # use the hive connection to delete the sandbox table
            hive.drop_table(metadata, stage=stage)
            # close the hive connection
            hive = None

        # check to make sure the file doesn't already exist
        if not hdfs.path.exists(write_path + '/' + file_name):
            # if it doesn't, write it to the appropriate location
            hdfs.put(landing_zone + '/' + file_name,
                     write_path + '/' + file_name)
            # create second copy for work table unless its a sandbox file
            if stage != 'sandbox':
                # create work copy write path
                work_write_path = reg.file_path(metadata,
                                                type='work',
                                                **kwargs)
                # delete the work file if there is already one present
                if hdfs.path.exists(work_write_path):
                    logger.info("Deleting existing work files at  " +
                                work_write_path)
                    hdfs.rmr(work_write_path)
                # write the file to the work file location
                hdfs.put(landing_zone + '/' + file_name,
                         work_write_path + '/' + file_name)
            else:
                # if this is a sandbox file, we might need the header row; it's far easier to get this now than from hdfs
                header = get_header(file_name)
            # register that the raw file was written
            instance_guid = reg.register_raw(metadata, file_name, file_type,
                                             row_count)
        else:
            # if the file does exist, it's treated as a duplicate
            logger.info("Duplicate file")
            file_type = "duplicate"

            # set up duplicate write path
            write_path = reg.dup_file_path(
                metadata)  # + '/' + metadata['file']['key']

            # check to see if it's a duplicate of an existing duplicate
            if hdfs.path.exists(write_path + '/' + file_name):
                # delete existing duplicate and write the new one.
                logging.info("duplicate file already exists, overwriting")
                hdfs.rmr(write_path + '/' + file_name)
                hdfs.put(landing_zone + '/' + file_name,
                         write_path + '/' + file_name)
                logger.info("writing duplicate file " + write_path + '/' +
                            file_name)
                reg.register_raw(metadata, file_name, file_type, row_count)

            else:
                # first time duplicates just get written
                hdfs.put(landing_zone + '/' + file_name,
                         write_path + '/' + file_name)
                logger.info("writing duplicate file " + write_path + '/' +
                            file_name)
                reg.register_raw(metadata, file_name, file_type, row_count)

    else:
        # no registry entry for this file, move it to spam
        file_type = "spam"

        # set up write path for spam
        write_path = reg.spam_file_path(metadata)
        logger.info("Moving " + file_name + " to " + write_path + '/' +
                    file_name)

        # check to see if it's a duplicate of an existing spam file
        if hdfs.path.exists(write_path + '/' + file_name):
            # delete existing spam and write the new one.
            logging.info("spam file already exists, overwriting")
            hdfs.rmr(write_path + '/' + file_name)
            hdfs.put(landing_zone + '/' + file_name,
                     write_path + '/' + file_name)
            logger.info("writing spam file " + write_path + '/' + file_name)
            reg.register_raw(metadata, file_name, file_type, row_count)
        else:
            # first time spam gets written as normal
            hdfs.put(landing_zone + '/' + file_name,
                     write_path + '/' + file_name)
            logger.info("writing spam file " + write_path + '/' + file_name)
            reg.register_raw(metadata, file_name, file_type, row_count)

    # confirm that source file and target file have the same size, regardless of spam, duplicate or normal
    if hdfs.path.exists(write_path + '/' + file_name) and \
            hdfs.path.getsize(write_path + '/' + file_name) == os.stat(landing_zone + '/' + file_name).st_size:
        # if the file sizes match, delete the source file
        os.remove(landing_zone + '/' + file_name)
        logger.info("Landing zone file removed " + landing_zone + '/' +
                    file_name)
    else:
        # if the file sizes do not match, delete the target file and rename the source file so it doesn't get reprocessed repeatedly
        logger.error(
            "Source and target file sizes didn't match, not deleting source.")
        hdfs.rmr(write_path + '/' + file_name)
        os.rename(landing_zone + '/' + file_name,
                  landing_zone + '/' + file_name + '.err')
        raise ValueError("Source and target file sizes don't match")

    # copy only is an option set up in case there's ever a reason not to process beyond moving the file to HDFS
    if 'copy_only' not in kwargs or not kwargs['copy_only']:
        if file_type == "raw":  # raw, meaning not spam or duplicate. No reason to validate those
            if stage != 'sandbox':
                # if its not a sandbox file proceed with full validation
                logger.info("Validate " + file_name)
                validator.main(file_name, instance_guid, metadata)
            elif stage == 'sandbox':
                # if it is a sandbox file, we need to mark it as such so validator only creates the table
                logger.info("Sandbox validate " + file_name)
                validator.main(file_name,
                               instance_guid,
                               metadata,
                               header=header,
                               stage=stage)

    # log that this PID is ending
    logger.info('Pid ending : ' + str(os.getpid()))
Example #27
# Create a set to exclude punctuations

exclude = set(string.punctuation)

# For each string in the text file, remove unwanted character
for i in txtFile:
    i = ''.join(ch for ch in i if ch not in exclude)
    newlist.append(i)

# Convert all words in the list to lower case
wordlist = [w.lower() for w in newlist]

dic = dict()

for word in wordlist:
    if word in dic:
        dic[word] += 1
    else:
        dic[word] = 1

dic

result = sorted(dic.items(), key=lambda x: x[1])

with open("sample.txt", "w") as outfile:
    json.dump(result, outfile)
file_path = "/home/user/sample.txt"
finalFile = "hdfs://localhost:9000/Test_002"
hdfs.put(file_path, finalFile)
Example #28
 def put(self, source, destination):
     hdfs.put(source, destination)
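
Example #28 is the thinnest possible wrapper, so it is a convenient place to note the two destination styles used throughout these examples: a path relative to the HDFS user's home directory and a fully qualified hdfs:// URI. The host, port, and file names below are illustrative, and both calls assume the destination directory already exists.

import pydoop.hdfs as hdfs

hdfs.put("report.csv", "reports/report.csv")                    # resolved under the HDFS user's home directory
hdfs.put("report.csv", "hdfs://namenode:9000/data/report.csv")  # fully qualified URI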
Example #29
import os

import glob

directoryPath = 'data/'
for file_name in glob.glob(directoryPath + '*.csv'):
    print(file_name)
    arr = file_name.split('/')
    fname = arr[1]
    b = hdfs.path.isdir("/data")

    if b == True:
        hdfs_client = hdfs.hdfs()
        data_list = hdfs_client.list_directory("/data")
        print(data_list)

        for item in data_list:
            print(item["name"])
            if fname in item["name"]:
                print("rm -->", item["name"])
                hdfs.rm(item["name"], recursive=True, user=None)

        print("---after rm ---")
        data_list = hdfs_client.list_directory("/data")
        print(data_list)

        hdfs.put(file_name, "/data")
        print("---after put ---")
        data_list = hdfs_client.list_directory("/data")
        print(data_list)
Example #30
def main(argv):

    logger = logging.getLogger("main")
    logger.setLevel(logging.DEBUG)

    with Timer() as total_time:

        parser = make_parser()
        args = parser.parse_args(argv)
        if args.dataset:
            print args.dataset
            create_dataset(logger, args.dataset)

        if args.script:
            piped_code_file = args.script
        else:
            piped_code_file = DEFAULT_SCRIPT

        if not os.path.exists(piped_code_file):
            raise IOError("script {0} not found !!!".format(piped_code_file))

        with open(piped_code_file) as f:
            pipes_code = pts.add_sys_path(f.read())

        dataset = [d for d in os.listdir("dataset") if d.endswith("MB")]
        dataset.sort(cmp=lambda x, y: cmp(
            int(x.replace("MB", "")), int(y.replace("MB", ""))
        ))

        logger.info(" Uploading dataset: { %s }", ', '.join(dataset))
        if not hadut.path_exists(os.path.join(DATASET_DIR)):
            logger.info("  dataset folder created")
            hdfs.mkdir(DATASET_DIR)

        for data_filename in dataset:
            source_path = os.path.join(DATASET_DIR, data_filename)
            dest_path = os.path.join(DATASET_DIR, data_filename)

            if not hadut.path_exists(os.path.join(DATASET_DIR, data_filename)):
                logger.info(" -> uploading %s...", source_path)
                hdfs.put(source_path, dest_path)

        update_conf(args)

        results = dict()
        for data_input in dataset:

            with Timer() as t:
                runner = hadut.PipesRunner(prefix=PREFIX, logger=logger)
                logger.info("Running the script %s with data input %s..",
                            piped_code_file, data_input)
                data_input_path = os.path.join(DATASET_DIR, data_input)
                runner.set_input(data_input_path, put=False)
                runner.set_exe(pipes_code)
                runner.run(properties=CONF, hadoop_conf_dir=HADOOP_CONF_DIR,
                           logger=logger)
                res = runner.collect_output()
                print data_input_path
                local_wc = pts.LocalWordCount(data_input_path)
                logging.info(local_wc.check(res))
                #print res
                #runner.clean()
            results[data_input] = (t.secs, t.msecs)

    print "\n\n RESULTs"
    print "=" * (len(piped_code_file) + 15)
    print " *  script: {0}".format(piped_code_file)
    print " *  mappers: {0}".format(CONF["mapred.map.tasks"])
    print " *  reducers: {0}".format(CONF["mapred.reduce.tasks"])
    print " *  dataset: [{0}]".format(",".join(dataset))
    print " *  times (input -> secs):"
    for data_input in dataset:
        print "    - {0} -> {1} secs.".format(
            data_input, results[data_input][0]
        )
    print "\n => Total execution time: {0}".format(total_time.secs)
    print "=" * (len(piped_code_file) + 15)
    print "\n"
Example #31
def out_to_dfs(file, dfs_path):
    print("Writing file to HDFS...")
    hdfs.put(file, dfs_path)
Example #32
def main():  # ============= =============  ============= =============
    # parse arguments
    parser = ArgumentParser(description=__description__)
    args = arg_parser(parser)

    if args.in_zipfname:
        in_zipfname = args.in_zipfname
    else:
        in_zipfname = 'data_test.zip'
    if args.outdir:
        outdir = args.outdir
    else:
        outdir = 'out'
    if args.outfname:
        outfname = args.outfname
    else:
        outfname = 'outfname'
    if args.dfs_folder:
        dfs_folder = args.dfs_folder
    else:
        dfs_folder = None

    if args.row_id:
        row_id_str = args.row_id
    else:
        row_id_str = "0"

    # log time ================================================================ ================
    t0 = time()

    print "outdir=", outdir

    # get input zip file name
    outfname = os.path.basename(in_zipfname)
    # get file name and its extension suffix
    (root, ext) = os.path.splitext(outfname)

    # set output file name
    outfname = root + ".gz"
    # get output file handle
    zout = create_zfile(outdir, outfname)

    # input file is .zip file with folders inside
    zin = zipfile.ZipFile(in_zipfname, "r")
    count = 0

    folder_list = []

    # open each file in .zip file, parse it  and save to .gz file
    #for filename in zin.namelist():
    for info in zin.infolist():
        # get filename from zin
        filename = info.filename
        #dd=info.date_time
        #print "f=",filename,", dt=",datetime.datetime(*dd)

        meta_list = []
        # get folder name
        folder, name = filename.split('/')
        #print "--",folder," --",filename

        # assume first level folders are labels
        # collect labels
        if not folder in folder_list:
            folder_list.append(folder)

        # transform here ======================
        if len(name) > 0:  # exclude folder name
            content = zin.read(filename)
            if len(content) <= 0:
                print "Content not found for [" + filename + "]"
            else:
                # count files
                count = count + 1
                # label
                meta_list.append(folder)
                # md5;  assume file name is md5
                bname = os.path.basename(filename)
                (namep, ext) = os.path.splitext(bname)
                meta_list.append(namep)
                # date of file
                meta_list.append(str(datetime.datetime(*info.date_time)))
                #print "meta_list=",meta_list
                #print "content=",len(content)," type=",type(content)
                #print "bname=",bname

                # for zip file; write to different files
                #zout.writestr(bname, format_content(meta_list, content))

                # write to .gz file;
                zout.write(format_content(meta_list, content))

                # allow 100 samples in .gz file; create the other file
                if count % 100 == 0:
                    zout.close()
                    outfname = root + "_" + str(count) + ".gz"
                    zout = create_zfile(outdir, outfname)

    zout.close()
    zin.close()
    #print "folder_list=",folder_list

    #upload to HDFS
    if dfs_folder:
        # clean up folder
        dfs_folder = os.path.join(HDFS_RETR_DIR, dfs_folder)
        print "dfs_folder=", dfs_folder
        try:
            hdfs.rmr(dfs_folder)
        except:
            e = sys.exc_info()[0]
            print "Warning: delete hdfs error: ", e
            pass

        try:
            hdfs.put(outdir, dfs_folder)
        except:
            e = sys.exc_info()[0]
            print "Error: Put files error.", e

    t1 = time()
    print 'running time: %f' % (t1 - t0)
    return 0
    '''
Example #33
def main(argv):

    logger = logging.getLogger("main")
    logger.setLevel(logging.DEBUG)

    with Timer() as total_time:

        parser = make_parser()
        args = parser.parse_args(argv)
        if args.dataset:
            print args.dataset
            create_dataset(logger, args.dataset)

        if args.script:
            piped_code_file = args.script
        else:
            piped_code_file = DEFAULT_SCRIPT

        if not os.path.exists(piped_code_file):
            raise IOError("script {0} not found !!!".format(piped_code_file))

        with open(piped_code_file) as f:
            pipes_code = pts.add_sys_path(f.read())

        dataset = [d for d in os.listdir("dataset") if d.endswith("MB")]
        dataset.sort(cmp=lambda x, y: cmp(int(x.replace("MB", "")),
                                          int(y.replace("MB", ""))))

        logger.info(" Uploading dataset: { %s }", ', '.join(dataset))
        if not hadut.path_exists(os.path.join(DATASET_DIR)):
            logger.info("  dataset folder created")
            hdfs.mkdir(DATASET_DIR)

        for data_filename in dataset:
            source_path = os.path.join(DATASET_DIR, data_filename)
            dest_path = os.path.join(DATASET_DIR, data_filename)

            if not hadut.path_exists(os.path.join(DATASET_DIR, data_filename)):
                logger.info(" -> uploading %s...", source_path)
                hdfs.put(source_path, dest_path)

        update_conf(args)

        results = dict()
        for data_input in dataset:

            with Timer() as t:
                runner = hadut.PipesRunner(prefix=PREFIX, logger=logger)
                logger.info("Running the script %s with data input %s..",
                            piped_code_file, data_input)
                data_input_path = os.path.join(DATASET_DIR, data_input)
                runner.set_input(data_input_path, put=False)
                runner.set_exe(pipes_code)
                runner.run(properties=CONF,
                           hadoop_conf_dir=HADOOP_CONF_DIR,
                           logger=logger)
                res = runner.collect_output()
                print data_input_path
                local_wc = pts.LocalWordCount(data_input_path)
                logging.info(local_wc.check(res))
                # print res
                # runner.clean()
            results[data_input] = (t.secs, t.msecs)

    print "\n\n RESULTs"
    print "=" * (len(piped_code_file) + 15)
    print " *  script: {0}".format(piped_code_file)
    print " *  mappers: {0}".format(CONF["mapred.map.tasks"])
    print " *  reducers: {0}".format(CONF["mapred.reduce.tasks"])
    print " *  dataset: [{0}]".format(",".join(dataset))
    print " *  times (input -> secs):"
    for data_input in dataset:
        print "    - {0} -> {1} secs.".format(data_input,
                                              results[data_input][0])
    print "\n => Total execution time: {0}".format(total_time.secs)
    print "=" * (len(piped_code_file) + 15)
    print "\n"
Example #34
def send_file(file):
    print("Saving to HDFS")

    dest = 'hdfs://localhost:9000/Task-002/python_output.txt'
    hdfs.put(file, dest)
    print("Saved to HDFS")
Example #35
    for subdir, dirs, files in os.walk(localInputDirPath):
        dirs.sort()
        files.sort()
        for file in files:
            filePath = os.path.join(subdir, file)
            if (filePath.endswith(('.jpg', '.tiff', '.tif', '.png', '.JPG',
                                   '.TIFF', '.TIF', '.PNG'))
                    and os.path.getsize(filePath) > 0):
                flattenedPath = subdir.replace("/", "_")
                if flattenedPath.startswith('_'):
                    flattenedPath = flattenedPath[1:]
                hdfsFileName = flattenedPath + file
                hdfs_path = hdfsOutputDirPath + hdfsFileName

                try:
                    hdfs.put(filePath, hdfs_path)
                    imageCount += 1
                    print '[' + str(
                        imageCount
                    ) + '] file: ' + hdfsFileName + ' ===> ' + hdfs_path + '   Size = ' + str(
                        os.path.getsize(filePath))
                    #os.remove(filePath)
                except IOError:
                    #os.remove(filePath)
                    continue

    print '======================================================================='
    print '= SUCCESS: All files successfully moved from local folder to HDFS      ='
    print '======================================================================='
Example #36
#coding:UTF-8
import pydoop.hdfs
import pydoop.hdfs as hdfs
from_path = '/tmp/cctv/abc.txt'
to_path = 'hdfs://localhost:22/tmp/outfile.txt'
hdfs.put(from_path, to_path)
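
A slightly more defensive variant of the one-liner in Example #36 (note that an HDFS namenode normally listens on a port such as 8020 or 9000; 22 is the SSH port). The host, port, and paths below are illustrative.

import os
import pydoop.hdfs as hdfs

from_path = '/tmp/cctv/abc.txt'
to_path = 'hdfs://localhost:9000/tmp/outfile.txt'

if not os.path.isfile(from_path):
    raise IOError("missing local file: %s" % from_path)
if not hdfs.path.isdir('hdfs://localhost:9000/tmp'):
    hdfs.mkdir('hdfs://localhost:9000/tmp')
hdfs.put(from_path, to_path)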
Example #37
if b == True:
    hdfs_client = hdfs.hdfs()
    data_list = hdfs_client.list_directory('/data')
    print(data_list)

    for item in data_list:
        print(item['name'])
        if '2020-12-28_generated_demo.csv' in item['name']:
            print('rm -->', item['name'])
            hdfs.rm(item['name'], recursive=True, user=None)

    print('---after rm ---')
    data_list = hdfs_client.list_directory('/data')
    print(data_list)

    print('---get test ---')
    lines = []
    with hdfs.open('hdfs://127.0.0.1:9000/data/source_demo.csv') as f:
        for line in f:
            # print(line, type(line))
            l = line.decode('utf-8')
            if '2020-11-15' in l:
                lines.append(l)
    print(lines)
    print('---end get----')

    hdfs.put('2021-02-09_generated_demo.csv', '/data')
    print('---after put ---')
    data_list = hdfs_client.list_directory('/data')
    print(data_list)
Example #38
import pydoop.hdfs as hdfs
import pydoop
from datetime import datetime

print datetime.now().time()

pydoop.hdfs.hdfs(host='default', port=0, user=None, groups=None)

hdfs.mkdir('NEWS ARTICLES')
# hdfs.put('/home/hduser1/PycharmProjects/Crawler/NEWS.csv', 'NEWS ARTICLES/NEWS.csv')

var = hdfs.mkdir('ALL UBL DATA')
print var
hdfs.mkdir('ALL HBL DATA')
hdfs.mkdir('ALL OGDCL DATA')
hdfs.mkdir('ALL ENGRO DATA')
hdfs.mkdir('ALL PSO DATA')
hdfs.mkdir('MISC')

hdfs.put('/home/hduser1/PycharmProjects/Crawler/Unwanted Stuff',
         'MISC/Unwanted')
hdfs.put('/home/hduser1/PycharmProjects/Crawler/kse 100', 'HISTORICAL/')

hdfs.hdfs.delete("/user/hduser1/HISTORICAL", recursive=True)
Example #39
import re
from pydoop import hdfs

counts = dict()

with open('/home/field/Desktop/Shakespeare.txt') as f:
    File = f.read().split()

    for items in File:

        items = items.lower()

        items = ''.join(re.findall('[a-zA-Z0-9@\s]+', items))

        if items in counts:
            counts[items] += 1
        else:
            counts[items] = 1

counts = sorted(counts.items(), key=lambda kv: (kv[1], kv[0]))

print(counts)

with open('listfile.txt', 'w') as f:
    for listitem in counts:
        f.write("  ".join(str(s) for s in listitem) + '\n')

hdfs_path = "hdfs://localhost:9000/SPtext/PythonResult.txt"
hdfs.put("listfile.txt", hdfs_path)
Example #40
def convertGRIBs(aws_key, aws_secret_key, numprocesses, myremainder, compressed_flag = True, output_to_S3_flag = False):

    tempdir = "/mnt3/ubuntu" # for r3.8xlarge instances, assumes this is linked to one of the SSDs
    #tempdir = "/tmp" # for other instances

    conn = S3Connection(aws_key, aws_secret_key)

    # source of the CFSRO-O data, as a set of grb2 files
    bucket = conn.get_bucket('agittens')
    keys = bucket.list(prefix='CSFR-O/grib2/ocnh06.gdas') # should manually enforce a sorting on these so you know the explicit map between the record number and a particular sample observation

    # make these vectors global because they're huge, so don't want to reallocate them 
    dimsperlevel = 360*720
    dims = dimsperlevel * 41
    vals = np.zeros((dims,))
    mask = np.zeros((dims,)) < 0

    # returns the set of gribs from an s3 key 
    tempgribfname = tempdir + '/temp{0}'.format(myremainder)
    def get_grib_from_key(inkey):
        with open(tempgribfname, 'w') as fin:
            inkey.get_file(fin)
        return pygrib.open(tempgribfname.format(myremainder))

    # index of the gribs within the grib file that correspond to SST and sub surface sea temperatures
    gribindices = range(1,41)
    gribindices.append(207)
    gribindices = list(reversed(gribindices))

    # for a given set of gribs, extracts the desired temperature observations and converts them to a vector of
    # observations and drops missing observations
    def converttovec(grbs):

        for index in range(41):
            maskedobs = grbs[gribindices[index]].data()[0]
            vals[index*dimsperlevel:(index+1)*dimsperlevel] = maskedobs.data.reshape((dimsperlevel,))
            mask[index*dimsperlevel:(index+1)*dimsperlevel] = maskedobs.mask.reshape((dimsperlevel,))
            
        return vals[mask == False]

    # prints a given status message with a timestamp
    def report(status):
        print datetime.now().time().isoformat() + ":\t" + status

    # convenience function so can write to a compressed or uncompressed file transparently
    if compressed_flag:
        myopen = gzip.open
    else:
        myopen = open
        
    error_fh = open('grib_conversion_error_log_{0}_of_{1}'.format(myremainder, numprocesses), 'w')

    recordDateMapping = {}
    mappingfname = "CFSROcsv/recordDateMapping/part-" + format(myremainder, "05")
    if compressed_flag:
        mappingfname += ".gz"

    for (recordnum, inkey) in enumerate(keys):
        # only process records assigned to you
        if (recordnum % numprocesses) != myremainder:
            continue
        recordDateMapping[recordnum] = inkey.name.split('.')[2]

        # choose the right name for the vector of observations and the vector of masks
        # depending on whether or not they're compressed
        valsfname = "CFSROcsv/vals/part-"+format(recordnum,"05")
        if compressed_flag:
            valsfname += ".gz"
            
        # avoid processing this set of observations if it has already been converted
        if output_to_S3_flag:
            possible_key = bucket.get_key(valsfname)
            if possible_key is not None:
                report("{0} already converted to csv, skipping record {1}".format(inkey.name, recordnum))
                continue
        else:
            if hdfs.path.isfile(valsfname):
                report("{0} already converted to csv, skipping record {1}".format(inkey.name, recordnum))
                continue
                
        # convert the observations and write them out to HDFS/S3 compressed/uncompressed
        try:
            grbs = get_grib_from_key(inkey)
            report("Retrieved {0} from S3".format(inkey.name))
        
            observations = converttovec(grbs)
            report("Converted {0} to a numpy array of observations".format(inkey.name))
        
            tempvalsfname = tempdir + '/tempvals{0}'.format(myremainder)
            with myopen(tempvalsfname, 'w') as valsfout:
                for index in range(0, observations.shape[0]):
                    valsfout.write("{0},{1},{2}\n".format(recordnum, index, observations[index]))
            report("Wrote numpy array to a local file")
            
            if output_to_S3_flag:
                valsoutkey = Key(bucket)
                valsoutkey.key = valsfname
                valsoutkey.set_contents_from_filename(tempvalsfname)
                report("Wrote {0} to {1} on S3".format(inkey.name.split('.')[2], valsfname))
            else:
                hdfs.put(tempvalsfname, valsfname)
                report("Wrote {0} to {1} on HDFS".format(inkey.name.split('.')[2], valsfname))
        except:
            report("Skipping record {0}! An error occurred processing {1}".format(recordnum, inkey.name))
            error_fh.write("Skipped {1}, record {0}\n".format(inkey.name, recordnum))
            
    try:
        os.remove(tempgribfname)
        os.remove(tempvalsfname)

        # write the record mapping out to file so we know which rows correspond to which date
        temprecordfname = tempdir + '/temp{0}recordmapping'.format(myremainder)
        with myopen(temprecordfname, 'w') as fout:
            for recordnum, keyname in recordDateMapping.iteritems():
                fout.write("{0},{1}\n".format(recordnum, keyname))
        report("Wrote the observation date to row number mapping for process {0} to local file".format(myremainder))
             
        if output_to_S3_flag:
            mappingoutkey = Key(bucket)
            mappingoutkey.key = mappingfname
            mappingoutkey.set_contents_from_filename(temprecordfname)
            report("Wrote record mapping for {0} to {1} on S3".format(myremainder, mappingfname))
        else:
            hdfs.put(temprecordfname, mappingfname)
            report("Wrote record mapping for {0} to {1} on HDFS".format(myremainder, mappingfname))

        os.remove(temprecordfname)
    except:
        report("Skipping writing the record mapping for {0}! An error occurred writing it out.".format(myremainder))
        error_fh.write("Skipping writing the record mapping for {0}! An error occurred writing it out.\n".format(myremainder))

    error_fh.close()