Example #1
class EffectiveUserTest(MiniClusterTestBase):
    ERR_MSG_TOUCH = "org.apache.hadoop.security.AccessControlException\nPermission denied: user=__foobar"
    ERR_MSG_STAT = "`/foobar2': No such file or directory"

    VALID_FILE = '/foobar'
    INVALID_FILE = '/foobar2'

    def setUp(self):
        self.custom_client = Client(self.cluster.host, self.cluster.port)
        self.custom_foobar_client = Client(host=self.cluster.host,
                                           port=self.cluster.port,
                                           effective_user='******')

    def test_touch(self):
        print tuple(self.custom_client.touchz([self.VALID_FILE]))
        try:
            tuple(self.custom_foobar_client.touchz([self.INVALID_FILE]))
        except Exception as e:
            self.assertTrue(e.message.startswith(self.ERR_MSG_TOUCH))

        self.custom_client.stat([self.VALID_FILE])
        try:
            self.custom_client.stat([self.INVALID_FILE])
        except Exception as e:
            self.assertEqual(e.message, self.ERR_MSG_STAT)
Example #2
 def signature(self):
     client = Client(self._host, self._port, effective_user=self._user, use_trash=False)
     stats = client.stat([self._partial])
     if stats['file_type'] == 'f':
         return "modification_time:{}".format(stats['modification_time'])
     else:
         return stats['file_type']
Example #3
def test():
    """
    """
    client = Client("192.168.99.100", 9000)
    for f in client.ls(['/files']):
        print f
        for line in client.cat([f.get('path')]):
            for l in line:
                print l
Example #4
    def run(self):
        c = Client(self.host, self.port)

        listing = c.ls([self.log_path], recurse=True)
        events = []
        for f in listing:
            path = f['path']

            if not path.endswith('.jhist'):
                continue

            ts = arrow.get(f['modification_time']/1000)

            if ts <= self.checktime:
                continue

            job_id = job_pattern.match(path.split('/')[-1]).group(0)

            if job_id in self.jobs and self.jobs[job_id] >= ts.timestamp*1000:
                log.debug('Skipping processed job: ' + job_id)
                continue

            config_path = path[:path.rfind('/')]+'/'+job_id+'_conf.xml'

            event = {
                'inviso.type': 'mr2',
                'job.id': job_id,
                'application.id': job_id.replace('job_', 'application_'),
                'job.type': 'mr2',
                'file.type': ['history', 'config'],
                'jobflow' : self.jobflow,
                'cluster.id': self.cluster_id,
                'cluster': self.cluster_name,
                'history.uri': 'hdfs://%s:%s%s' % (self.host,self.port,path),
                'config.uri':'hdfs://%s:%s%s' % (self.host,self.port,config_path),
                'host': self.host,
                'port': self.port,
                'timestamp': str(ts),
                'epoch': f['modification_time'],
                'mapreduce.version': 'mr2'
            }

            log.info('Publishing event: (%s) %s %s' % (event['cluster'], event['job.id'], ts))
            events.append(event)
        for chunk in [events[i:i + self.chunk_size] for i in xrange(0, len(events), self.chunk_size)]:
            self.publisher.publish(chunk)
Example #5
def delete_item(config, filepath='', localpath=''):

    if(config['BACKEND'] == 'hdfs'):
        client = Client(socket.gethostname(), config['HADOOP_RPC_PORT'], use_trash=False)
        del_gen = client.delete([filepath], recurse=True)
        for del_item in del_gen:
            pass
    elif(config['BACKEND'] == 'swift'):
        pass  # To be implemented

    # Deleting modules or datasets from local directories (will also suffice for nfs backend)
    if(os.path.isdir(localpath)):  # Check if it is a dataset
        shutil.rmtree(localpath)
    else:
        try:
            os.remove(localpath)
        except OSError:
            pass
Example #6
def crfalign(sc, inputFilename, outputDirectory, 
            limit=LIMIT, location='hdfs', outputFormat="text", partitions=None, deleteFirst=True):

    # crfConfigDir = os.path.join(os.path.dirname(__file__), "data/config")
    # def cpath(n):
    #     return os.path.join(crfConfigDir, n)

    # smEyeColor = HybridJaccard(ref_path=cpath("eyeColor_reference_wiki.txt"),
    #                            config_path=cpath("eyeColor_config.txt"))
    # smHairColor = HybridJaccard(ref_path=cpath("hairColor_reference_wiki.txt"),
    #                             config_path=cpath("hairColor_config.txt"))
    # print smEyeColor, smHairColor

    if location == "hdfs":
        if deleteFirst:
            namenode = "memex-nn1"
            port = 8020
            client = Client(namenode, port, use_trash=True)
            try:
                for deleted in client.delete([outputDirectory], recurse=True):
                    print deleted
            except FileNotFoundException as e:
                pass

    # hypothesis1: data fetched this way prompts the lzo compression error
    # hypothesis2: but it doesn't matter, error is just a warning
    rdd_crfl = sc.textFile(inputFilename)
    rdd_crfl.setName('rdd_crfl')

    if limit:
        rdd_crfl = sc.parallelize(rdd_crfl.take(limit))
    if partitions:
        rdd_crfl = rdd_crfl.repartition(partitions)

    rdd_final = rdd_crfl
    print outputFormat
    if outputFormat == "sequence":
        rdd_final.saveAsSequenceFile(outputDirectory)
    elif outputFormat == "text":
        print "saving to %s" % outputDirectory
        rdd_final.saveAsTextFile(outputDirectory)
    else:
        raise RuntimeError("Unrecognized output format: %s" % outputFormat)
Example #7
 def __init__(self, workflow, **kwargs):
     super(HDFSTextLoader, self).__init__(workflow, **kwargs)
     self.file_name = kwargs["file"]
     self.chunk_lines_number = kwargs.get("chunk", 1000)
     client_kwargs = dict(kwargs)
     del client_kwargs["file"]
     if "chunk" in kwargs:
         del client_kwargs["chunk"]
     self.hdfs_client = Client(**client_kwargs)
     self.output = [""] * self.chunk_lines_number
     self.finished = Bool()
Example #8
def getTrainedModel(hdfsServer, modelFile):
    hdfsPort = int(os.environ.get('HDFS_NAME_PORT', 8020))
    modelSavePath = "/user/" + os.getenv('LOGNAME') + "/data/model/" + modelFile + '/'

    # Load the saved model data
    hdfs_client = Client(hdfsServer, hdfsPort)
    filesInfo = hdfs_client.ls([modelSavePath])

    # Copy HDFS files to local temp directory
    # First clean up and recreate the temp folder
    copyDir = tempfile.gettempdir() + "/" + modelFile
    shutil.rmtree(copyDir, ignore_errors=True)
    os.makedirs(copyDir)
    res = hdfs_client.copyToLocal([f['path'] for f in filesInfo], copyDir)
    for r in res:
        if not r['result']:
            print "Error: %s" % r

    modelFilePath = copyDir + '/' + modelFile
    print "Load model from  %s" % modelFilePath
    return joblib.load(modelFilePath)
Example #9
def getObjsBackend(objs, backend, config):

    if(backend == 'hdfs'):

        client = Client(socket.gethostname(), config['HADOOP_RPC_PORT'], use_trash=False)

        for obj in objs:
            try:
                copy_gen = client.copyToLocal([obj[0]], obj[1])
                for copy_item in copy_gen:
                    pass
            except Exception as e:
                print(e)
    elif(backend == 'swift'):

        options = {'os_auth_url': os.environ['OS_AUTH_URL'],
                   'os_username': os.environ['OS_USERNAME'],
                   'os_password': os.environ['OS_PASSWORD'],
                   'os_tenant_id': os.environ['OS_TENANT_ID'],
                   'os_tenant_name': os.environ['OS_TENANT_NAME']}
        swiftService = SwiftService(options=options)

        for obj in objs:

            # Create the containers which are used in this application for Object Storage
            if(obj[0] == 'sqlite.db'):
                swiftService.post(container='containerFiles')
                swiftService.post(container='containerFeatures')
                swiftService.post(container='containerModules')

            out_file = obj[1]  # Get the output file location from runner
            localoptions = {'out_file': out_file}
            objects = []
            objects.append(obj[0])
            swiftDownload = swiftService.download(container='containerModules', objects=objects, options=localoptions)

            for downloaded in swiftDownload:
                if("error" in downloaded.keys()):
                    raise RuntimeError(downloaded['error'])
                # print(downloaded)

    elif(backend == 'nfs'):  # Every file is already in respective local dirs
        pass
Example #10
 def get_conn(self):
     '''
     Returns a snakebite HDFSClient object.
     '''
     connections = self.get_connections(self.hdfs_conn_id)
     client = None
     if len(connections) == 1:
         client = Client(connections[0].host, connections[0].port)
     elif len(connections) > 1:
         nn = [Namenode(conn.host, conn.port) for conn in connections]
         client = HAClient(nn)
     else:
         raise HDFSHookException("conn_id doesn't exist in the repository")
     return client
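
A minimal usage sketch for the hook method above. The HDFSHook class name and the 'hdfs_default' connection id are assumptions here, since the snippet only shows the get_conn method:

# Hedged sketch: HDFSHook and 'hdfs_default' are assumed, not shown in the snippet.
hook = HDFSHook(hdfs_conn_id='hdfs_default')
client = hook.get_conn()
for entry in client.ls(['/']):
    print(entry['path'])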
Example #11
    def get_conn(self) -> Any:
        """Returns a snakebite HDFSClient object."""
        # When using HAClient, proxy_user must be the same, so it is ok to
        # always take the first.
        effective_user = self.proxy_user
        autoconfig = self.autoconfig
        use_sasl = conf.get('core', 'security') == 'kerberos'

        try:
            connections = self.get_connections(self.hdfs_conn_id)

            if not effective_user:
                effective_user = connections[0].login
            if not autoconfig:
                autoconfig = connections[0].extra_dejson.get(
                    'autoconfig', False)
            hdfs_namenode_principal = connections[0].extra_dejson.get(
                'hdfs_namenode_principal')
        except AirflowException:
            if not autoconfig:
                raise

        if autoconfig:
            # will read config info from $HADOOP_HOME conf files
            client = AutoConfigClient(effective_user=effective_user,
                                      use_sasl=use_sasl)
        elif len(connections) == 1:
            client = Client(
                connections[0].host,
                connections[0].port,
                effective_user=effective_user,
                use_sasl=use_sasl,
                hdfs_namenode_principal=hdfs_namenode_principal,
            )
        elif len(connections) > 1:
            name_node = [
                Namenode(conn.host, conn.port) for conn in connections
            ]
            client = HAClient(
                name_node,
                effective_user=effective_user,
                use_sasl=use_sasl,
                hdfs_namenode_principal=hdfs_namenode_principal,
            )
        else:
            raise HDFSHookException(
                "conn_id doesn't exist in the repository and autoconfig is not specified"
            )

        return client
Example #12
def process_worker(queue):
    client = Client("trevally.amer.nevint.com",
                    9000,
                    use_trash=False,
                    effective_user='******')

    while True:
        afile = queue.get()
        print afile
        try:
            process(client, afile)
        except Exception as e:
            print e
        finally:
            queue.task_done()
Example #13
class HDFSTextLoader(Unit, TriviallyDistributable):
    def __init__(self, workflow, **kwargs):
        super(HDFSTextLoader, self).__init__(workflow, **kwargs)
        self.file_name = kwargs["file"]
        self.chunk_lines_number = kwargs.get("chunk", 1000)
        client_kwargs = dict(kwargs)
        del client_kwargs["file"]
        if "chunk" in kwargs:
            del client_kwargs["chunk"]
        self.hdfs_client = Client(**client_kwargs)
        self.output = [""] * self.chunk_lines_number
        self.finished = Bool()

    def initialize(self):
        self.debug("Opened %s", self.hdfs_client.stat([self.file_name]))
        self._generator = self.hdfs_client.text([self.file_name])

    def run(self):
        assert not self.finished
        try:
            for i in range(self.chunk_lines_number):
                self.output[i] = next(self._generator)
        except StopIteration:
            self.finished <<= True
Example #14
    def __init__(self,
                 height=28,
                 width=28,
                 channels=1,
                 batch_size=32,
                 images_uri='/',
                 shuffle=True):
        'Initialization'
        self.height = height
        self.width = width
        self.channels = channels
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.images_uri = images_uri

        o = urlparse(self.images_uri)
        if o.scheme == 'hdfs':
            self.images_path = o.path
            self.client = Client(
                o.hostname, o.port
            )  # images_uri: 'hdfs://10.0.40.19:9600/daloflow/dataset32x32/'
        else:
            self.images_path = images_uri
            self.client = None
Example #15
def main(queue):
    client = Client(host, port, use_trash=False, effective_user='******')

    def find_minutes(path, level, result):
        for x in client.ls([path]):
            if level < 5:
                find_minutes(x['path'], level + 1, result)
            else:
                result.append(x['path'])

    min_list = []
    find_minutes('/data/hub/vehicle/MKZ-Grey/2017/08/31', 4, min_list)
    for each in min_list:
        print each
        queue.put(each)
Example #16
class HDFSStat(object):

    cluster = 'hostname'
    port = 8020
    default_path = '/user/hive/warehouse'

    @staticmethod
    def build_path(table):
        nm = table.split('.')[0]
        tb = table.split('.')[1]
        return HDFSStat.default_path + '/' + nm + '.db/' + tb

    def __init__(self):
        self.client = Client(HDFSStat.cluster, HDFSStat.port, use_trash=False)

    def latest_partition(self, table_name, table_path=None):
        t_path = HDFSStat.build_path(table_name) if table_path is None else table_path
        latest_dir = list(self.client.ls([t_path])).pop()
        return path.basename(latest_dir['path']).split('=')[1]

    def poke_partition(self, table_name, partition_name, partition, table_path=None):
        t_path = HDFSStat.build_path(table_name) if table_path is None else table_path
        partition_path = t_path + '/' + partition_name + '=' + partition
        return self.client.test(partition_path, exists=True, directory=True, zero_length=False)
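
A brief usage sketch for HDFSStat; the table name and partition column are placeholders, and the class-level cluster/port are assumed to point at a reachable namenode:

# Hedged sketch: 'analytics.events' and the 'ds' partition column are placeholders.
stat = HDFSStat()
latest = stat.latest_partition('analytics.events')
print('latest partition: ' + latest)
print(stat.poke_partition('analytics.events', 'ds', latest))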
Example #17
class HDFSTextLoader(Unit, TriviallyDistributable):
    def __init__(self, workflow, **kwargs):
        super(HDFSTextLoader, self).__init__(workflow, **kwargs)
        self.file_name = kwargs["file"]
        self.chunk_lines_number = kwargs.get("chunk", 1000)
        client_kwargs = dict(kwargs)
        del client_kwargs["file"]
        if "chunk" in kwargs:
            del client_kwargs["chunk"]
        self.hdfs_client = Client(**client_kwargs)
        self.output = [""] * self.chunk_lines_number
        self.finished = Bool()

    def initialize(self):
        self.debug("Opened %s", self.hdfs_client.stat([self.file_name]))
        self._generator = self.hdfs_client.text([self.file_name])

    def run(self):
        assert not self.finished
        try:
            for i in range(self.chunk_lines_number):
                self.output[i] = next(self._generator)
        except StopIteration:
            self.finished <<= True
Example #18
    def __init__(self, path, name_node, hive_server,
                 user="******", hive_db="default", password=None, nn_port=8020, hive_port=10000):

        # HDFS Connection
        self._client = Client(name_node, nn_port)

        self._db = hive_db

        # Hive Connection
        self._hive = pyhs2.connect(host=hive_server,
                                   port=hive_port,
                                   authMechanism="PLAIN",
                                   database=hive_db,
                                   user=user,
                                   password=password)
        self._path = path
Example #19
	def __init__(self,topic,user,server,port,web_port,base,hdfs_tmp):
		self.topic = topic
		self.username = user
		self.server = server
		self.port = port
		self.base = base
		self.path = ["%s/%s" % (base,topic)]
		self.hdfs_tmp = hdfs_tmp

		try:
			self.client=Client(server,port,effective_user=user) 
			self.hdfsclient = hdfs.client.InsecureClient(
				"http://%s:%d" % (server, web_port), user=user)
			self.daylist = self.check()
		except:
			print "Base path %s does not contain valid structure" % (base)
			raise
Example #20
    def get_conn(self):
        '''
        Returns a snakebite HDFSClient object.
        '''
        use_sasl = False
        if conf.get('core', 'security') == 'kerberos':
            use_sasl = True

        connections = self.get_connections(self.hdfs_conn_id)
        client = None
        if len(connections) == 1:
            client = Client(connections[0].host,
                            connections[0].port,
                            use_sasl=use_sasl)
        elif len(connections) > 1:
            nn = [Namenode(conn.host, conn.port) for conn in connections]
            client = HAClient(nn, use_sasl=use_sasl)
        else:
            raise HDFSHookException("conn_id doesn't exist in the repository")
        return client
Example #21
 def get_bite(self):
     """
     If Luigi has forked, we have a different PID, and need to reconnect.
     """
     if self.pid != os.getpid() or not self._bite:
         client_kwargs = dict(filter(lambda k_v: k_v[1] is not None and k_v[1] != '', {
             'hadoop_version': self.config.getint("hdfs", "client_version", None),
             'effective_user': self.config.get("hdfs", "effective_user", None)
         }.iteritems()))
         if self.config.getboolean("hdfs", "snakebite_autoconfig", False):
             """
             This is fully backwards compatible with the vanilla Client and can be used for a non HA cluster as well.
             This client tries to read ``${HADOOP_PATH}/conf/hdfs-site.xml`` to get the address of the namenode.
             The behaviour is the same as Client.
             """
             from snakebite.client import AutoConfigClient
             self._bite = AutoConfigClient(**client_kwargs)
         else:
             from snakebite.client import Client
             self._bite = Client(self.config.get("hdfs", "namenode_host"), self.config.getint("hdfs", "namenode_port"), **client_kwargs)
     return self._bite
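
The autoconfig branch described in the docstring can also be exercised directly; a minimal sketch, assuming a Hadoop client configuration (hdfs-site.xml) is present on the machine:

# Hedged sketch: requires HADOOP_CONF_DIR/hdfs-site.xml to point at a real cluster.
from snakebite.client import AutoConfigClient
client = AutoConfigClient()
print(list(client.ls(['/'])))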
Example #22
    def get_conn(self):
        '''
        Returns a snakebite HDFSClient object.
        '''
        connections = self.get_connections(self.hdfs_conn_id)

        use_sasl = False
        if configuration.get('core', 'security') == 'kerberos':
            use_sasl = True

        client = None
        # When using HAClient, proxy_user must be the same, so it is ok to always take the first
        effective_user = self.proxy_user or connections[0].login
        if len(connections) == 1:
            autoconfig = connections[0].extra_dejson.get('autoconfig', False)
            if autoconfig:
                client = AutoConfigClient(effective_user=effective_user,
                                          use_sasl=use_sasl)
            else:
                hdfs_namenode_principal = connections[0].extra_dejson.get(
                    'hdfs_namenode_principal')
                client = Client(
                    connections[0].host,
                    connections[0].port,
                    effective_user=effective_user,
                    use_sasl=use_sasl,
                    hdfs_namenode_principal=hdfs_namenode_principal)
        elif len(connections) > 1:
            hdfs_namenode_principal = connections[0].extra_dejson.get(
                'hdfs_namenode_principal')
            nn = [Namenode(conn.host, conn.port) for conn in connections]
            client = HAClient(nn,
                              effective_user=effective_user,
                              use_sasl=use_sasl,
                              hdfs_namenode_principal=hdfs_namenode_principal)
        else:
            raise HDFSHookException("conn_id doesn't exist in the repository")

        return client
Example #23
def compose_hdfs_commands(year, month, day, args, config):
    # set up the hdfs client to be used in order to check the files
    namenode = config.get("HDFS", "namenode")
    client = Client(namenode.hostname, namenode.port, use_trash=False)

    # hdfs sync  path for the tenant

    hdfs_user = config.get("HDFS", "user")
    tenant = args.tenant
    hdfs_sync = config.get("HDFS", "path_sync")
    hdfs_sync = hdfs_sync.fill(namenode=namenode.geturl(),
                               hdfs_user=hdfs_user,
                               tenant=tenant).geturl()

    # dictionary holding all the commands with their respective arguments' name
    hdfs_commands = dict()

    # file location of metric profile (local or hdfs)
    hdfs_commands["--sync.mps"] = date_rollback(
        hdfs_sync + "/" + args.report + "/" + "metric_profile_" + "{{date}}" +
        ".avro", year, month, day, config, client)

    # file location of operations profile (local or hdfs)
    hdfs_commands["--sync.ops"] = hdfs_check_path(
        hdfs_sync + "/" + args.tenant + "_ops.json", client)

    # file location of aggregations profile (local or hdfs)
    hdfs_commands["--sync.apr"] = hdfs_check_path(
        hdfs_sync + "/" + args.tenant + "_" + args.report + "_ap.json", client)

    #  file location of endpoint group topology file (local or hdfs)
    hdfs_commands["-sync.egp"] = date_rollback(
        hdfs_sync + "/" + args.report + "/" + "group_endpoints_" + "{{date}}" +
        ".avro", year, month, day, config, client)

    return hdfs_commands
Example #24
def compose_hdfs_commands(year, month, day, args, config):

    # set up the hdfs client to be used in order to check the files
    namenode = config.get("HDFS", "namenode")
    client = Client(namenode.hostname, namenode.port, use_trash=False)

    # hdfs sync  path for the tenant

    hdfs_user = config.get("HDFS", "user")
    tenant = args.tenant
    hdfs_sync = config.get("HDFS", "path_sync")
    hdfs_sync = hdfs_sync.fill(namenode=namenode.geturl(), hdfs_user=hdfs_user, tenant=tenant).geturl()

    hdfs_metric = config.get("HDFS", "path_metric")

    hdfs_metric = hdfs_metric.fill(namenode=namenode.geturl(), hdfs_user=hdfs_user, tenant=tenant).geturl()

    # dictionary holding all the commands with their respective arguments' name
    hdfs_commands = dict()

    # file location of previous day's metric data (local or hdfs)
    hdfs_commands["--pdata"] = hdfs_check_path(
        hdfs_metric + "/" + str(datetime.date(year, month, day) - datetime.timedelta(1)), client)

    # file location of target day's metric data (local or hdfs)
    hdfs_commands["--mdata"] = hdfs_check_path(hdfs_metric + "/" + args.date, client)

    # file location of report configuration json file (local or hdfs)
    hdfs_commands["--conf"] = hdfs_check_path(hdfs_sync + "/" + args.tenant+"_"+args.report+"_cfg.json", client)

    # file location of metric profile (local or hdfs)
    hdfs_commands["--mps"] = date_rollback(
        hdfs_sync + "/" + args.report + "/" + "metric_profile_" + "{{date}}" + ".avro", year, month, day, config,
        client)

    # file location of operations profile (local or hdfs)
    hdfs_commands["--ops"] = hdfs_check_path(hdfs_sync+"/"+args.tenant+"_ops.json",  client)

    # file location of aggregations profile (local or hdfs)
    hdfs_commands["--apr"] = hdfs_check_path(hdfs_sync+"/"+args.tenant+"_"+args.report+"_ap.json", client)

    if args.thresholds:
        # file location of thresholds rules file (local or hdfs)
        hdfs_commands["--thr"] = hdfs_check_path(
            os.path.join(hdfs_sync, "".join([args.tenant, "_", args.report, "_thresholds.json"])), client)

    #  file location of endpoint group topology file (local or hdfs)
    hdfs_commands["-egp"] = date_rollback(
        hdfs_sync + "/" + args.report + "/" + "group_endpoints_" + "{{date}}" + ".avro", year, month, day, config,
        client)

    # file location of group of groups topology file (local or hdfs)
    hdfs_commands["-ggp"] = date_rollback(hdfs_sync + "/" + args.report + "/" + "group_groups_" + "{{date}}" + ".avro",
                                          year, month, day, config, client)

    # file location of weights file (local or hdfs)
    hdfs_commands["--weights"] = date_rollback(hdfs_sync + "/" + args.report + "/weights_" + "{{date}}" + ".avro", year,
                                               month, day, config, client)

    # file location of downtimes file (local or hdfs)
    hdfs_commands["--downtimes"] = hdfs_check_path(
        hdfs_sync + "/" + args.report + "/downtimes_" + str(datetime.date(year, month, day)) + ".avro", client)

    # file location of recomputations file (local or hdfs)
    # first check if there is a recomputations file for the given date
    # recomputation lies in the hdfs in the form of
    # /sync/recomp_TENANTNAME_ReportName_2018-08-02.json
    if client.test(urlparse(hdfs_sync+"/recomp_"+args.tenant+"_"+args.report+"_"+args.date+".json").path, exists=True):
        hdfs_commands["--rec"] = hdfs_sync+"/recomp_"+args.tenant+"_"+args.report+"_"+args.date+".json"
    else:
        hdfs_commands["--rec"] = hdfs_check_path(hdfs_sync+"/recomp.json", client)

    return hdfs_commands
Example #25
from snakebite.client import Client
from constants import NAMENODE_PORT

client = Client('localhost', NAMENODE_PORT)
for p in client.mkdir(['/foo/bar', '/input'], create_parent=True):
    print p
Example #26
class HdfsReader:
    """
    HdfsReader class

    Connects to an hdfs endpoint (namenode) and checks argo profile files stored there
    Uses a specific base path for determining argo file destinations
    """
    def __init__(self, namenode, port, base_path):
        """
        Initializes HdfsReader, which is used to check/read profile files from hdfs
        Args:
            namenode: str. hdfs namenode host
            port: int. hdfs namenode port
            base_path: str. base path to the destination used for argo
        """
        self.client = Client(namenode, port)
        self.base_path = base_path

    def gen_profile_path(self, tenant, report, profile_type):
        """
        Generates a valid hdfs path to a specific profile
        Args:
            tenant: str. tenant to be used
            report: str. report to be used
            profile_type: str. profile_type (operations|reports|aggregations|thresholds)

        Returns:
            str: hdfs path

        """
        templates = dict()
        templates.update({
            'operations': '{0}_ops.json',
            'aggregations': '{0}_{1}_ap.json',
            'reports': '{0}_{1}_cfg.json',
            'thresholds': '{0}_{1}_thresholds.json',
            'recomputations': 'recomp.json'
        })

        sync_path = self.base_path.replace("{{tenant}}", tenant)
        filename = templates[profile_type].format(tenant, report)
        return os.path.join(sync_path, filename)

    def cat(self, tenant, report, profile_type):
        """
        Returns the contents of a profile stored in hdfs
        Args:
            tenant: str. tenant name
            report: str. report name
            profile_type: str. profile type (operations|reports|aggregations|thresholds)

        Returns:
            tuple: (dict, bool) with the profile contents (or None) and a flag for whether the file was found
        """
        path = self.gen_profile_path(tenant, report, profile_type)
        try:
            txt = self.client.cat([path])
            j = json.loads(txt.next().next())
            return j, True
        except FileNotFoundException:
            return None, False

    def rem(self, tenant, report, profile_type):
        """
        Removes a profile file that already exists in hdfs (in order to be replaced)
        Args:
            tenant: str. tenant name
            report: str. report name
            profile_type: str. profile type (operations|reports|aggregations|thresholds)

        Returns:
            bool: True if the file was deleted, False if it did not exist
        """
        path = self.gen_profile_path(tenant, report, profile_type)

        try:
            self.client.delete([path]).next()
            return True
        except FileNotFoundException:
            return False
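
A minimal usage sketch for HdfsReader; the namenode host, port, base path, tenant, and report names are placeholders:

# Hedged sketch: all connection details and names are placeholders.
reader = HdfsReader('namenode.example.org', 8020, '/user/argo/sync/{{tenant}}')
profile, found = reader.cat('TENANT_A', 'Critical', 'operations')
if found:
    print(profile)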
Example #27
    # pipe '|' is forbidden in wiki titles and would make a good delimiter
    out_str = page.encode('utf-8').strip(
    ) + "|" + date_str + pageview_str + daily_trend_str + weekly_trend_str + monthly_trend_str
    return out_str


# define spark context
conf = (SparkConf().setAppName("Wiki Page Views Trends").set(
    "spark.hadoop.validateOutputSpecs", "false"))
sc = SparkContext(conf=conf)

# set custom connection
if (run_mode == "hdfs" or out_mode == "hdfs"):
    # spotify's snakebite as hdfs client
    hdfs_client = Client(cfg.get("hdfs", "hdfs_master_hostname"),
                         9000,
                         use_trash=False)

if (run_mode == "swift" or out_mode == "swift"):
    swiftConf = sc._jsc.hadoopConfiguration()
    for key, value in SWIFT_DEFAULT_CONFIGS.items():
        swiftConf.set(key, value)

    swift_client = swift.Connection(user=swift_user,
                                    key=swift_key,
                                    authurl=swift_authurl)

# read list of files
src_files = []

if run_mode == "hdfs":
Example #28
# Create kafka client
print "Create kafka client to: %s" % args.kafka
kafka = KafkaClient(args.kafka + ':9092')
producer = SimpleProducer(kafka)

# Read testing data from hdfs
hdfsServer = args.hdfs
hdfsPort = int(os.environ.get('HDFS_NAME_PORT', 8020))
hdfsHost = "hdfs://" + hdfsServer + ":" + str(hdfsPort)

topic = args.topic

from snakebite.client import Client
print "Reading input from HDFS: server=%s, port=%d" % (hdfsServer, hdfsPort)
client = Client(hdfsServer, hdfsPort)
data_file = client.text(["/user/" + os.getenv('LOGNAME') + "/data/X_test.txt"]).next()
label_file = client.text(["/user/" + os.getenv('LOGNAME') + "/data/y_test.txt"]).next()

import random
import time
import itertools

samples = data_file.splitlines()
labels = label_file.splitlines()
test_data = zip(samples, labels)
random.shuffle(test_data)  # Shuffle it

def getActivityName(a):
    a = int(a)
    if a in range(1,7):
Example #29
from snakebite.client import Client

client = Client('localhost', 9000)
for x in client.ls(['/']):
    print x
Example #30
 def exists(self):
     client = Client(self._host, self._port, effective_user=self._user, use_trash=False)
     return client.test(self._partial, exists=True)
Example #31
class HDFS_topic(object):
    def __init__(self, topic, user, server, port, web_port, base, hdfs_tmp):
        self.topic = topic
        self.username = user
        self.server = server
        self.port = port
        self.base = base
        self.path = ["%s/%s" % (base, topic)]
        self.hdfs_tmp = hdfs_tmp

        try:
            self.client = Client(server, port, effective_user=user)
            self.hdfsclient=hdfs.client.InsecureClient(\
                          "http://%s:%d" % (server,web_port),user=user)
            self.daylist = self.check()
        except:
            print "Base path %s does not contain valid structure" % (base)
            raise

    #
    # Check basic hdfs access and that directory format is appropriate
    # also builds datelist structure
    #
    def check(self):
        self.content = self.client.ls(self.path)
        ret = []
        for item in self.content:
            (head, tail) = os.path.split(item['path'])
            try:
                parse(tail, yearfirst=True, dayfirst=True)
                if item['file_type'] == 'd':
                    ret.append(tail)
                else:
                    print("WARNING: %s is not a directory, skipping\n" %
                          (item['path']))
            except:
                print("WARNING: %s is not in date format, skipping\n" % (tail))

        if len(ret) > 0:
            ret.sort(key=lambda x: datetime.strptime(x, "%Y-%m-%d"))
            return ret
        else:
            return False

    #
    # Given a date, check whether it is in the dirlist and return the matching dir entry
    #
    def day_in_topic(self, date):
        for item in self.daylist:
            if parse(date) == parse(item):
                return item
        return False

    #
    # Checks and validates the date_from and date_to arguments
    #
    def check_date_range(self, date_from, date_to):
        if date_from:
            try:
                parse(date_from)
            except:
                raise ValueError("FATAL: start date (%s) invalid date format" %
                                 (date_from))

            if (parse(date_from) < parse(self.daylist[0])) or (
                    parse(date_from) > parse(self.daylist[-1])):
                raise ValueError(
                    "FATAL: start date (%s) not in range (%s ---> %s)" %
                    (date_from, self.daylist[0], self.daylist[-1]))
            else:
                ret_from = parse(date_from).strftime("%Y-%m-%d")
                while not self.day_in_topic(ret_from):
                    print "WARNING: start date %s not in topic %s, trying next day" % (
                        ret_from, self.topic)
                    ret_from = datetime.strftime(
                        (parse(ret_from) + timedelta(days=1)), "%Y-%m-%d")

                ret_from = self.day_in_topic(ret_from)

        else:
            ret_from = self.daylist[0]

        if date_to:
            try:
                parse(date_to)
            except:
                raise ValueError("FATAL: end date (%s) invalid date format" %
                                 (date_to))

            if (parse(date_to) < parse(self.daylist[0])) or (
                    parse(date_to) > parse(self.daylist[-1])):
                raise ValueError(
                    "FATAL: end date (%s) not in range (%s ---> %s)" %
                    (date_to, self.daylist[0], self.daylist[-1]))
            else:
                ret_to = parse(date_to).strftime("%Y-%m-%d")
        else:
            ret_to = self.daylist[-1]

        if (parse(ret_from) > parse(ret_to)):
            raise ValueError(
                "FATAL: start date (%s) must be <= end date (%s)" %
                (ret_from, ret_to))

        return (ret_from, ret_to)

    #
    #  Traverses the list of valid directories and merges each day
    #
    def merge(self, date_from="", date_to=""):
        day = ""
        try:
            (day, date_to) = self.check_date_range(date_from, date_to)
        except Exception as err:
            raise ValueError(err)

        print "INFO: Trying to merge %s from %s to %s\n" % (self.topic, day,
                                                            date_to)

        while (parse(day) <= parse(date_to)):
            if self.day_in_topic(day):
                self.merge_day(day)
            else:
                print "WARNING: %s is not on %s, skipping\n" % (day, self.path)

            day = datetime.strftime((parse(day) + timedelta(days=1)),
                                    "%Y-%m-%d")
            while not self.day_in_topic(day) and parse(day) <= parse(date_to):
                print "WARNING: %s not found in %s, trying next day" % (
                    day, self.topic)
                day = datetime.strftime((parse(day) + timedelta(days=1)),
                                        "%Y-%m-%d")

            day = self.day_in_topic(day)
            if not day:
                return

        return True

    #
    # Given a date, if there are files that are not .snappy, download and remove them, then getmerge and upload everything
    #
    def merge_day(self, date):
        print "INFO: processing ", date
        daytmp = "%s/snappymerge-%s-tmp" % (self.hdfs_tmp, date)
        daypath = ["%s/%s/%s/" % (self.base, self.topic, date)]
        #mergedfile="./%s-merged.snappy" % (date)
        mergedfile = "./%s-merged.snappy" % (datetime.strftime(
            datetime.now(), "%Y-%d-%m.%f"))
        day_files = [x['path'] for x in self.client.ls(daypath)]
        print "INFO: DAYPATH: ", daypath
        try:
            os.remove(mergedfile)
        except:
            pass

        if len([x for x in day_files if x.endswith('.snappy')]) <= 1:
            print "WARNING: %s does not have enough files to getmerge, skipping" % (
                date)
            return

        non_snappy = [x for x in day_files if not x.endswith('.snappy')]
        if non_snappy:
            print "WARNING: %s contains a non snappy file (%s), moving *snappy to %s to getmerge there\n" % (
                daypath, non_snappy[0], daytmp)
            self.merge_with_move(daypath[0], daytmp, day_files, mergedfile)
        else:
            print "INFO: MERGING ", daypath[0]
            result = self.client.getmerge(daypath[0], mergedfile)
            print[x for x in result if not x['result']]

            print "INFO: DELETING original files in ", daypath[0]
            for file in day_files:
                print "INFO: Deleting original file ", file
                self.hdfsclient.delete(file)

            print "INFO: UPLOADING merged (%s) to %s" % (mergedfile,
                                                         daypath[0])
            self.hdfsclient.upload(daypath[0], mergedfile, overwrite=True)
            os.remove(mergedfile)

        return


    #
    # When there are files that do not have the .snappy suffix, merge_with_move first moves everything to an hdfs temp dir, merges there, and uploads
    #

    def merge_with_move(self, day_path, day_tmp, dayfiles, merged_file):
        self.hdfsclient.makedirs(day_tmp)

        print "INFO: MOVING files to ", day_tmp
        snap = [x for x in dayfiles if x.endswith(".snappy")]
        result = self.client.rename(snap, day_tmp)
        print[x['path'] for x in result if not x['result']]

        print "INFO: MERGING files in ", day_tmp
        result = self.client.getmerge(day_tmp, merged_file)
        print[x['path'] for x in result if not x['result']]

        print "INFO: UPLOADING merged (%s) to %s" % (merged_file, day_path)
        self.hdfsclient.upload(day_path, merged_file, overwrite=True)
        os.remove(merged_file)

        print "INFO: Deleting files on ", day_tmp
        self.hdfsclient.delete(day_tmp, recursive=True)
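
A short usage sketch for HDFS_topic; every connection parameter, path, and date below is a placeholder:

# Hedged sketch: host, ports, user, paths and dates are placeholders.
topic = HDFS_topic('events', 'etl', 'namenode.example.org', 8020, 50070,
                   '/data/kafka', '/tmp/hdfs-merge')
topic.merge(date_from='2018-01-01', date_to='2018-01-31')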
Example #32
import os
import argparse
import subprocess

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib  # on newer scikit-learn: import joblib
from snakebite.client import Client

parser = argparse.ArgumentParser()
parser.add_argument("--hdfs", help="HDFS FS name", default = 'localhost')
parser.add_argument("--model", help="Name of model file", default = 'belt.model')
args = parser.parse_args()


hdfsServer = args.hdfs
hdfsPort = int(os.environ.get('HDFS_NAME_PORT', 8020))
hdfsHost = "hdfs://" + hdfsServer + ":" + str(hdfsPort)
modelSavePath = "/user/" + os.getenv('LOGNAME') + "/data/model/" + args.model + "/"
print "hdfs=%s, savePath=%s, hdfsHost=%s" % (hdfsServer, modelSavePath, hdfsHost)

hdfs_client = Client(hdfsServer, hdfsPort)

X_train_file = hdfs_client.text(["/user/" + os.getenv('LOGNAME') + "/data/X_train.txt"]).next()
y_train_file = hdfs_client.text(["/user/" + os.getenv('LOGNAME') + "/data/y_train.txt"]).next()

X_train = np.genfromtxt(str.splitlines(X_train_file))
y_train = np.genfromtxt(str.splitlines(y_train_file))

clf = LogisticRegression()
clf = clf.fit(X_train, y_train)

files = joblib.dump(clf, "belt.model")

subprocess.check_call(['hdfs', 'dfs', '-rm', '-r', '-f', modelSavePath], shell=False)
subprocess.check_call(['hdfs', 'dfs', '-mkdir', '-p', modelSavePath], shell=False)
Example #33
#!/usr/bin/env python

import os
from snakebite.client import Client


client = Client("trevally.amer.nevint.com", 9000, use_trash=False, effective_user='******')


#for res in client.mkdir(['/user/hadoop/test/move/file'],create_parent=True, mode=755):
#    print res

for res in client.rename(['/user/hadoop/test.tar'],'/user/hadoop/test3.tar'):
    print res

Example #34
def is_exist(dirPath, master = public.SPARK_MASTER, port = public.SPARK_MASTER_PORT):
    client = Client(master, port, use_trash=False)
    return client.test(dirPath, exists=True, directory=True)
Example #35
from snakebite.client import Client

client = Client('localhost', 9000)
for l in client.text(['/input/input.txt']):
    print l
Example #36
 def remove(self):
     client = Client(self._host, self._port, effective_user=self._user, use_trash=False)
     it = client.delete([self._partial], recurse=True)
     for elmt in it:
         pass
Example #37
def get_locations(filename, name_host, name_port, data_root='/data/dfs/dn'):
    client = Client(name_host, name_port, use_trash=False)
    files = list(client.ls([filename]))
    return [pair for file in files for pair in find(file, client, data_root)]
Example #38
 def test_request(self):
     from snakebite.client import Client
     client = Client("10.0.137.24", 8022, use_trash=False)
     for x in client.ls(['/user']):
         print x
Example #39
 def health_check():
     c = Client("namenode", 8020)
     print "Checking for %s directory..." % sys.argv[1]
     for top_level in c.ls([sys.argv[1]]):
         print "DIR CHILD=%s" % top_level['path']
     print "Ok!"
Example #40
def main(args):
    xml = minidom.parse(path.join(os.environ["HADOOP_HOME"],
                                  "etc", "hadoop", "hdfs-site.xml"))

    element = [ x for x in xml.getElementsByTagName("property")
                if (x.getElementsByTagName("name")[0]
                     .childNodes[0]
                     .nodeValue == "dfs.namenode.http-address") ][0]

    namenode = (element.getElementsByTagName("value")[0]
                       .childNodes[0]
                       .nodeValue.split(":")[0])

    fs = HDFS(namenode, 8020)

    path_prefix = "/amplab/text"
    for size in args.sizes:
        timings = {}

        MPI.COMM_WORLD.Barrier()
        if c_rank == 0: tic()

        file_list = None
        if c_rank == 0:
            file_list = [
                entry["path"] for entry in fs.ls([
                    path.join(path_prefix, size, "uservisits")])]
            file_list = [file_list[i::c_size] for i in range(c_size)]

        file_list = MPI.COMM_WORLD.scatter(file_list, root=0)

        MPI.COMM_WORLD.Barrier()
        if c_rank == 0: timings["open-and-register"] = toc()

        MPI.COMM_WORLD.Barrier()
        if c_rank == 0: tic()

        os_results = reduce_data(row_iterator(file_list, fs), 4, "os")

        MPI.COMM_WORLD.Barrier()
        if c_rank == 0: timings["q-stats-by-os"] = toc()
        if c_rank == 0: os_results.index = os_results.pop("os")

        MPI.COMM_WORLD.Barrier()
        if c_rank == 0: tic()

        browser_results = reduce_data(row_iterator(file_list, fs), 6, "browser")

        MPI.COMM_WORLD.Barrier()
        if c_rank == 0: timings["q-stats-by-browser"] = toc()
        if c_rank == 0: browser_results.index = browser_results.pop("browser")

        if c_rank == 0:
            top_dir = path.join("results", size, "mpi", str(args.nodes))
            mkdir_p(top_dir)
            with open(path.join(top_dir, "timings"), "w") as f:
                for entry in timings.items():
                    f.write("%s, %.18e\n" % entry)
                f.flush()

            browser_results.to_pickle(path.join(top_dir, "browser"))
            os_results.to_pickle(path.join(top_dir, "os"))

    return 0
Example #41
class DataGenerator(object):

    'Generates data for Keras'
    '''
    Initialization function of the class
    '''
    def __init__(self,
                 height=28,
                 width=28,
                 channels=1,
                 batch_size=32,
                 cache_mode='',
                 images_uri='/',
                 shuffle=True):
        'Initialization'
        self.debug = False
        self.height = height
        self.width = width
        self.channels = channels
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.cache_mode = cache_mode
        self.images_uri = images_uri
        o = urlparse(self.images_uri)
        if o.scheme == 'hdfs':
            self.images_path = o.path
            self.client = Client(
                o.hostname, o.port
            )  # images_uri: 'hdfs://10.0.40.19:9600/daloflow/dataset32x32/'
        else:
            self.images_path = images_uri
            self.client = None

    '''
    Set debug mode True/False
    '''

    def set_debug(self, debug_mode):
        'Do not show or show messages'
        self.debug = debug_mode
        if self.debug == True:
            print(' * Debug mode:  ' + str(self.debug))
            print(' * Height:      ' + str(self.height))
            print(' * Width:       ' + str(self.width))
            print(' * Channels:    ' + str(self.channels))
            print(' * Batch_size:  ' + str(self.batch_size))
            print(' * Shuffle:     ' + str(self.shuffle))
            print(' * Cache mode:  ' + self.cache_mode)
            print(' * Image uri:   ' + self.images_uri)

    '''
    Goes through the dataset and outputs one batch at a time.
    '''

    def generate(self, labels, list_IDs, yield_labels=True):
        'Generates batches of samples'

        # Infinite loop
        while 1:
            # Generate random order of exploration of dataset (to make each epoch different)
            indexes = self.__get_exploration_order(list_IDs)

            # Generate batches
            imax = int(len(indexes) / self.batch_size)  # number of batches

            for i in range(imax):
                # Find list of IDs for one batch
                list_IDs_temp = [
                    list_IDs[k] for k in indexes[i * self.batch_size:(i + 1) *
                                                 self.batch_size]
                ]

                # Train, validation
                X, y = self.__data_generation(labels, list_IDs_temp,
                                              yield_labels)

                yield X, y

    '''
    Generates a random order of exploration for a given set of list_IDs.
    If activated, this feature will shuffle the order in which the examples
    are fed to the classifier so that batches between epochs do not look alike.
    Doing so will eventually make our model more robust.
    '''

    def __get_exploration_order(self, list_IDs):
        'Generates order of exploration'

        # Find exploration order
        indexes = np.arange(len(list_IDs))

        if self.shuffle == True:
            np.random.shuffle(indexes)

        return indexes

    '''
    Get data: local
    '''

    def __get_data_local(self, image_file_name):
        'Get data from local file system path'
        pixels = None

        try:
            with open(image_file_name, 'rb') as image_file:
                pixels = np.fromstring(zlib.decompress(image_file.read()),
                                       dtype=np.uint8,
                                       sep='').reshape(self.height, self.width,
                                                       self.channels)
        except:
            if self.debug == True:
                print('Exception ' + str(sys.exc_info()[0]) + ' on file ' +
                      image_file_name)

        return pixels

    '''
    Get data: remote
    '''

    def __get_data_remote(self, image_file_name):
        'Get data from HDFS'
        pixels = None
        if self.client is None:
            return pixels

        try:
            t = '/tmp/image.dat.' + str(os.getpid())
            if os.path.exists(t):
                os.remove(t)
            for f in self.client.copyToLocal([image_file_name], t):
                if f['result'] == True:
                    with open(t, 'rb') as image_file:
                        pixels = np.fromstring(
                            zlib.decompress(image_file.read()),
                            dtype=np.uint8,
                            sep='').reshape(self.height, self.width,
                                            self.channels)
                    os.remove(t)
                else:
                    print('File ' + f['path'] + ' NOT copied because "' +
                          str(f['error']) + '", sorry !')
        except:
            if self.debug == True:
                print('Exception ' + str(sys.exc_info()[0]) + ' on file ' +
                      image_file_name)

        return pixels

    '''
    Get data: local or remote
    '''

    def __get_data(self, image_file_name):
        'Get data: local or remote'
        pixels = None
        #print(' * image file name: ' + image_file_name)

        if self.cache_mode == 'hdfs2local' or self.cache_mode == 'hdfs2local-full':
            pixels = self.__get_data_local(image_file_name)
        elif self.cache_mode == 'nocache':
            pixels = self.__get_data_remote(image_file_name)
        elif self.cache_mode == 'hdfs2local-partial':
            pixels = self.__get_data_local(image_file_name)
            if pixels is None:
                pixels = self.__get_data_remote(image_file_name)
        else:
            print('ERROR: unknown "' + self.cache_mode + '" cache mode')

        return pixels

    '''
    Outputs batches of data and only needs to know about the list of IDs included
    in batches as well as their corresponding labels.
    '''

    def __data_generation(self, labels, list_IDs_temp, yield_labels):
        'Generates data of batch_size samples'  # X : (n_samples, v_size, v_size, v_size, n_channels)

        # Initialization
        X = np.empty((self.batch_size, self.height, self.width, self.channels),
                     dtype='float32')
        y = np.empty((self.batch_size), dtype='float32')

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Decompress image into pixel NumPy tensor
            image_file_name = self.images_path + '/'.join(
                ID.split('/')[1:]) + '.tar.gz'

            # Read image
            pixels = self.__get_data(image_file_name)

            # Store volume
            #pixels = np.rollaxis(pixels, 0, 3) # from 'channels_first' to 'channels_last'
            X[i, :, :, :] = pixels

            # get y value
            y_value = labels[ID]
            y[i] = y_value

        # return X and Y (train, validation)
        return X, y

    '''
    Please note that Keras only accepts labels written in a binary form
    (in a 6-label problem, the third label is written [0 0 1 0 0 0]),
    which is why we need the sparsify function to perform this task,
    should y be a list of numerical values.
    '''

    def sparsify1(self, y):
        'Returns labels in binary NumPy array'
        return np.array([[1 if y[i] == j else 0 for j in range(10)]
                         for i in range(y.shape[0])])
Example #42
class Loader:
    """
    The idea of the loader is to provide a convenient interface to create a new table
    based on some input files
    """

    def __init__(self, path, name_node, hive_server,
                 user="******", hive_db="default", password=None, nn_port=8020, hive_port=10000):

        # HDFS Connection
        self._client = Client(name_node, nn_port)

        self._db = hive_db

        # Hive Connection
        self._hive = pyhs2.connect(host=hive_server,
                                   port=hive_port,
                                   authMechanism="PLAIN",
                                   database=hive_db,
                                   user=user,
                                   password=password)
        self._path = path


    def load(self):
        # Check data to see which kind it is
        files = self._client.ls([self._path])

        files = [f for f in files if f['file_type'] == 'f']
        if len(files) == 0:
            raise Exception("Cannot load empty directory")

        # Pick the first file and assume that it has the same content as the others
        data = self.head(files[0]['path'])
        res = self.check_separator(data)
        if res is None:
            # We can't load the data, so better abort here
            print("can't load data, cannot find a separator")
            return

        sep = res[0]
        num_cols = res[1]

        # Build table statement
        table_statement, table_name = self._create_table(self._path, sep, num_cols)
        cursor = self._hive.cursor()
        cursor.execute(table_statement)

        return self._db, table_name


    def _create_table(self, path, sep, count):
        buf = """CREATE EXTERNAL TABLE pyxplorer_data (
    %s
    )ROW FORMAT DELIMITED FIELDS TERMINATED BY '%s'
    STORED AS TEXTFILE LOCATION '%s'
    """ % (",".join(["col_%d string" % x for x in range(count)]), sep, path)
        return buf, "pyxplorer_data"

    def check_separator(self, data):
        """
        This method evaluates a list of separators on the input data to check which one
        is correct. This is done by first splitting the input by newline and then
        checking if the split by separator is equal for each input row except the last
        that might be incomplete due to the limited input data

        :param data: input data to check
        :return:
        """

        sep_list = [r'\t', r';', r',', r'\|', r'\s+']

        data_copy = data
        for sep in sep_list:
            # Check if the count matches each line
            splitted = data_copy.split("\n")
            parts = [len(re.split(sep, line)) for line in splitted]

            # If we did not split anything continue
            if sum(parts) == len(splitted):
                continue

            diff = 0

            for i in range(len(parts[1:-1])):
                diff += abs(parts[i] - parts[i + 1])

            if diff == 0:
                return sep, parts[0]

        # If we reach this point we did not find a separator
        return None


    def head(self, file_path):
        """
        Only read the first packets that arrive, capping at roughly 1 MB

        :return: up to ~1 MB of the first block of the file
        """
        processor = lambda path, node, tail_only=True, append=False: self._handle_head(
            path, node)

        # Find items and go
        for item in self._client._find_items([file_path], processor,
                                             include_toplevel=True,
                                             include_children=False, recurse=False):
            if item:
                return item

    def _handle_head(self, path, node, upper=1024 * 1024):
        data = ''
        for load in self._client._read_file(path, node, tail_only=False,
                                            check_crc=False):
            data += load
            if (len(data) > upper):
                return data

        return data
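
A minimal usage sketch for the Loader class; the hosts, user, and input path are placeholders:

# Hedged sketch: hosts, user and the input path are placeholders.
loader = Loader('/data/raw/events', 'namenode.example.org', 'hive.example.org',
                user='etl', hive_db='staging')
result = loader.load()
if result:
    print('created table %s.%s' % result)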
Example #43
from snakebite.client import Client
from constants import *

client = Client('localhost', NAMENODE_PORT)

for p in client.delete(['/foo/bar','/input'], recurse=True):
    print p
Example #44
import sys, string, getpass, time, datetime
import happybase
from snakebite.client import Client
import pprint
import urllib, json, ast, zlib, os

hdfs = Client("ip-172-31-17-255")
#for x in hdfs.ls(['/']):
#	print x


hbase = happybase.Connection('localhost')
hbase_settings_table = hbase.table('settings')


##get urls and add new ones if necessary
#TODO:uncomment below
# available_symbols_web = urllib.urlopen("http://api.bitcoincharts.com/v1/markets.json")
# available_symbols = json.loads(available_symbols_web.read())
# csv_settings_urls = hbase_settings_table.row('bitcoin_csv', columns=['urls'])
# known_symbols = [ key.split(':')[1] for key,val in csv_settings_urls.items() ]
# load_dict = {}
# for symbol in available_symbols:
# 	if (symbol['symbol'] not in known_symbols):
# 		load_dict['urls:' + symbol['symbol']] = str({'status':'',
# 												'symbol':symbol['symbol'],
# 												'url':'http://api.bitcoincharts.com/v1/trades.csv?symbol=' + symbol['symbol']})
#hbase_settings_table.put('bitcoin_csv', load_dict)

def get_csv_file(hadoop_path, symbol, url):
	csv_data = urllib.urlopen(url)
Example #45
def main(opts, args):
    hadoop_host = HADOOP_HOST
    hadoop_user_dir = None
    if opts.hdfs:
        print("hdfs enter")
        if opts.host:
            hadoop_host = opts.host
        hadoop_user_dir = opts.hdfs

    uni_gram_cnt = 0
    bi_gram_cnt = 0
    tri_gram_cnt = 0
    four_gram_cnt = 0
    five_gram_cnt = 0

    result_buffer = []
    source_input = None
    if not hadoop_user_dir:
        if len(args) > 2:
            # read from the file given on the command line
            source_input = open(sys.argv[1])
        else:
            source_input = sys.stdin

        for line in source_input:
            result_buffer.append(line)

            items = line.split()
            items_cnt = len(items)
            if items_cnt == 3: # 1-grams
                uni_gram_cnt +=1
            elif items_cnt == 4: #2-grams
                bi_gram_cnt += 1
            elif items_cnt == 5: #3-grams
                tri_gram_cnt += 1
            elif items_cnt == 6:
                four_gram_cnt += 1
            elif items_cnt == 7:
                five_gram_cnt +=1

    else:
        print "connect to haddoop"
        hadoop_client = Client(hadoop_host, 8020, use_trash=False)
        for g in hadoop_client.cat([os.path.join(hadoop_user_dir, "*.txt")]):
            for line in g:
                result_buffer.append(line)

                items = line.split()
                items_cnt = len(items)
                if items_cnt == 3: # 1-grams
                    uni_gram_cnt +=1
                elif items_cnt == 4: #2-grams
                    bi_gram_cnt += 1
                elif items_cnt == 5: #3-grams
                    tri_gram_cnt += 1
                elif items_cnt == 6:
                    four_gram_cnt += 1
                elif items_cnt == 7:
                    five_gram_cnt +=1


    print('\\data\\')
    if uni_gram_cnt != 0:
        print("ngram 1=%s" % uni_gram_cnt)

    if bi_gram_cnt != 0:
        print("ngram 2=%s" % bi_gram_cnt)

    if tri_gram_cnt != 0:
        print("ngram 3=%s" % tri_gram_cnt)

    if four_gram_cnt != 0:
        print("ngram 4=%s" % four_gram_cnt)

    if five_gram_cnt != 0:
        print("ngram 5=%s" % five_gram_cnt)

    result_iter = iter(result_buffer)
    print
    print_ngram(result_iter, 1, uni_gram_cnt)
    print
    print_ngram(result_iter, 2, bi_gram_cnt)
    print
    print_ngram(result_iter, 3, tri_gram_cnt)
    print
    print("\\end\\")
Ejemplo n.º 46
0
class HDFS_topic(object):
	def __init__(self,topic,user,server,port,web_port,base,hdfs_tmp):
		self.topic = topic
		self.username = user
		self.server = server
		self.port = port
		self.base = base
		self.path = ["%s/%s" % (base,topic)]
		self.hdfs_tmp = hdfs_tmp

		try:
			self.client = Client(server, port, effective_user=user)
			self.hdfsclient = hdfs.client.InsecureClient(
				"http://%s:%d" % (server, web_port), user=user)
			self.daylist = self.check()
		except:
			print "Base path %s does not contain valid structure" % (base)
			raise
	
	#
	# Check basic hdfs access and that the directory format is appropriate;
	# also builds the datelist structure
	#
	def check(self):
		self.content=self.client.ls(self.path)
		ret=[]
		for item in self.content:
			(head,tail) = os.path.split(item['path'])
			try:
				parse(tail,yearfirst=True,dayfirst=True)
				if item['file_type'] == 'd':
					ret.append(tail)
				else:
					print("WARNING: %s is not a directory, skipping\n" % (item['path']))
			except:
				print("WARNING: %s is not in date format, skipping\n"  % (tail))

		if len(ret) > 0:
			ret.sort(key=lambda x: datetime.strptime(x,"%Y-%m-%d"))
			return ret
		else:
			return False

	#
	# Given a date, check whether it is in the daylist and return the matching dir entry
	#
	def day_in_topic(self, date):
		for item in self.daylist:
			if parse(date) == parse(item):
				return item
		return False


	#
	# Checks and validates the date_from and date_to arguments
	#	
	def check_date_range(self,date_from,date_to):
		if date_from:
			try:
				parse(date_from)
			except:
				raise ValueError("FATAL: start date (%s) invalid date format" % (date_from) )
		
			if ( parse(date_from)  < parse(self.daylist[0])  ) or ( parse(date_from)  > parse(self.daylist[-1]) ):
				raise ValueError("FATAL: start date (%s) not in range (%s ---> %s)" % (date_from,self.daylist[0],self.daylist[-1]))
			else:
				ret_from=parse(date_from).strftime("%Y-%m-%d") 
				while not self.day_in_topic(ret_from):
					print "WARNING: start date %s not in topic %s, trying next day" % (ret_from,self.topic)
					ret_from=datetime.strftime((parse(ret_from)+timedelta(days=1)), "%Y-%m-%d" )
					
				ret_from=self.day_in_topic(ret_from)

				
		else:
				ret_from=self.daylist[0]

		if date_to:
			try:
				parse(date_to)
			except:
				raise ValueError("FATAL: end date (%s) invalid date format" % (date_to) )

			if ( parse(date_to)  < parse(self.daylist[0])  ) or ( parse(date_to)  > parse(self.daylist[-1]) ):
				raise ValueError("FATAL: end date (%s) not in range (%s ---> %s)" % (date_to,self.daylist[0],self.daylist[-1]))
			else:
				ret_to=parse(date_to).strftime("%Y-%m-%d")
		else:
				ret_to=self.daylist[-1]
		
		if (parse(ret_from) > parse(ret_to) ):
			raise ValueError("FATAL: start date (%s) must be <= end date (%s)" % (ret_from,ret_to))


		return (ret_from,ret_to)
		
	
	#
	#  Traverses the list of valid directories and merges each day
	#
	def merge(self,date_from="",date_to=""):
		day=""
		try:
			(day,date_to)=self.check_date_range(date_from,date_to)
		except Exception as err:
			raise ValueError(err)

		print "INFO: Trying to merge %s from %s to %s\n" % (self.topic,day, date_to)

		while (parse(day) <= parse(date_to)):
			if  self.day_in_topic(day):
				self.merge_day(day)
			else:
				print "WARNING: %s is not on %s, skipping\n" % (day,self.path)

			day=datetime.strftime((parse(day)+timedelta(days=1)), "%Y-%m-%d" )
			while not self.day_in_topic(day) and parse(day) <= parse(date_to):
				print "WARNING: %s not found in %s, trying next day" % (day,self.topic)
				day=datetime.strftime((parse(day)+timedelta(days=1)), "%Y-%m-%d" )

			day=self.day_in_topic(day)
			if not day:
				return	
				
		return True

	#
	# Given a date: if the day's directory only contains .snappy files, getmerge them, delete the originals and upload the merged file; otherwise move the .snappy files to an HDFS temp dir and merge there
	#
	def merge_day(self,date):
		print "INFO: processing ", date
		daytmp="%s/snappymerge-%s-tmp" % (self.hdfs_tmp,date)
		daypath=["%s/%s/%s/" % (self.base, self.topic,date)]
		#mergedfile="./%s-merged.snappy" % (date)
		mergedfile="./%s-merged.snappy" % (datetime.strftime(datetime.now(),"%Y-%d-%m.%f"))
		day_files=[x['path'] for x in self.client.ls(daypath)]
		print "INFO: DAYPATH: ", daypath
		try:
			os.remove(mergedfile)
		except:
			pass

		
		if len([ x for x in day_files if x.endswith('.snappy') ]) <= 1:
			print "WARNING: %s does not have enough files to getmerge, skipping" % (date)
			return

		non_snappy = [f for f in day_files if not f.endswith('.snappy')]
		if non_snappy:
			print "WARNING: %s contains non-snappy files (e.g. %s), moving *.snappy to %s and running getmerge there\n" % (daypath, non_snappy[0], daytmp)
			self.merge_with_move(daypath[0], daytmp, day_files, mergedfile)
		else:
			print "INFO: MERGING ", daypath[0]
			result=self.client.getmerge(daypath[0],mergedfile)
			print [x for x in result if not x['result']]
		
			print "INFO: DELETING original files in ", daypath[0]
			for file in day_files:
				print "INFO: Deleting original file ", file
				self.hdfsclient.delete(file)

			print "INFO: UPLOADING merged (%s) to %s" % (mergedfile,daypath[0])
			self.hdfsclient.upload(daypath[0],mergedfile,overwrite=True)
			os.remove(mergedfile)

		return

#
# When there are files that do not have the .snappy suffix, merge_with_move first moves every .snappy file to an HDFS temp dir, merges there, and then uploads the result
#
	def merge_with_move(self,day_path,day_tmp,dayfiles,merged_file):
		self.hdfsclient.makedirs(day_tmp)


		print "INFO: MOVING files to ", day_tmp
		snap = [x for x in dayfiles if x.endswith(".snappy")]
		result=self.client.rename(snap,day_tmp)
		print [ x['path'] for x in result if not x['result']]

		print "INFO: MERGING files in ", day_tmp
		result=self.client.getmerge(day_tmp,merged_file)
		print [x['path'] for x in result if not x['result']]

		print "INFO: UPLOADING merged (%s) to %s"  % (merged_file,day_path)
		self.hdfsclient.upload(day_path,merged_file,overwrite=True)
		os.remove(merged_file)

		print "INFO: Deleting files on ", day_tmp
		self.hdfsclient.delete(day_tmp,recursive=True)


				
if __name__ == '__main__' :
	import argparse

	count=0

	parser = argparse.ArgumentParser(description="Merge daily historical snappy files into one to save hdfs space")
	parser.add_argument('topic', help="Topic name relative to --base")
	parser.add_argument('--hdfs_user', help="HDFS user name (default: current user)",default=None)
	parser.add_argument('--hdfs_server', help="HDFS server name or ip (default: aquhmstsys022001.c022.digitalriverws.net)",default="aquhmstsys022001.c022.digitalriverws.net")
	parser.add_argument('--hdfs_port', help="HDFS server port number (default:8020)", type=int, default=8020)
	parser.add_argument('--hdfs_tmp', help="HDFS temporary dir to store files to be merged (default:/user/hduser/tmp)", default="/user/hduser/tmp")
	parser.add_argument('--web_port', help="HDFS server WEB port number (default:50070)", type=int, default=50070)
	parser.add_argument('--base', help="Alternate hdfs base path for topic (default:/user/aqueduct/flume)",default="/user/aqueduct/flume")
	parser.add_argument('--start', help="Start Date inclusive  (default: from beginning)")
	parser.add_argument('--end', help="End Date inclusive (default: to end)")


	args = parser.parse_args()
	topic=HDFS_topic(topic=args.topic,user=args.hdfs_user,server=args.hdfs_server,port=args.hdfs_port,\
                     hdfs_tmp=args.hdfs_tmp,web_port=args.web_port,base=args.base)
	try:
		topic.merge(args.start,args.end)
	except Exception as err:
		print err
		exit(1)
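
Besides the argparse entry point above, HDFS_topic can be driven programmatically. A minimal sketch, assuming a reachable NameNode; the hostname, user and date range below are illustrative only:

topic = HDFS_topic(topic="clickstream", user="hduser",
                   server="namenode.example.com", port=8020, web_port=50070,
                   base="/user/aqueduct/flume", hdfs_tmp="/user/hduser/tmp")
try:
    # merge one month of daily directories, inclusive on both ends
    topic.merge(date_from="2019-01-01", date_to="2019-01-31")
except ValueError as err:
    print err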
Ejemplo n.º 47
0
from snakebite.client import Client

client = Client('localhost', 54310)
for l in client.text(['/input/input.txt']):
    print l
Ejemplo n.º 48
0
# using requests and snakebite to access a Hadoop instance on a Windows VM and put files into an HDFS folder (the upload itself shells out to hdfs dfs -put)
import os
import requests
import json
from snakebite.client import Client
#hadoop connection
client1 = Client('localhost', 19000)
#Batch Twitter API connection
endpoint = "https://api.twitter.com/1.1/tweets/search/fullarchive/HistoricalTweets.json"
headers = {
    "Authorization": "Bearer xxxxxx",
    "Content-Type": "application/json"
}
# change your query here:
data = '{"query":"(AI OR Artificial Intelligence OR Machine Learning)", "fromDate": "201602020000", "toDate": "201902240000" , "maxResults":10}'
response = requests.post(endpoint, data=data, headers=headers).json()
file = json.dumps(response, indent=2)
# put downloaded data into local disk temporarily
with open('data.txt', 'w') as outfile:
    json.dump(file, outfile)
#for p in client1.mkdir(['/twitter_data_hist']):
#    print(p)
#for x in client1.ls(['/']):
#    print(x)
# this sends the file to hadoop
cmd = "hdfs dfs -put C:\\Users\\user\\Desktop\\data_603_Twitter_API\\data.txt /twitter_data_hist/twitterhistpy.json"
os.system(cmd)
# this removes the file from local directory
os.remove('data.txt')
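
snakebite's RPC client is geared towards reads and metadata operations and does not provide a data-write call comparable to hdfs dfs -put, which is why the upload above shells out to the CLI. A small follow-up sketch, reusing client1 from above, to confirm the file landed (the directory is the one targeted by the put command):

# list the target directory and print what is there
for entry in client1.ls(['/twitter_data_hist']):
    print entry['path']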
Ejemplo n.º 49
0
 def setUp(self):
     version = os.environ.get("HADOOP_PROTOCOL_VER", 7)
     self.cluster = self.__class__.cluster
     self.client = Client(self.cluster.host, self.cluster.port,
                          int(version))
Ejemplo n.º 50
0
def get_locations(filename, name_host, name_port, **kwargs):
    client = Client(name_host, name_port, use_trash=False)
    files = list(client.ls([filename]))
    return [pair for file in files for pair in find(file, client, **kwargs)]
Ejemplo n.º 51
0
 def setUp(self):
     self.custom_client = Client(self.cluster.host, self.cluster.port)
     self.custom_foobar_client = Client(host=self.cluster.host,
                                        port=self.cluster.port,
                                        effective_user='******')
Ejemplo n.º 52
0
import os
from StringIO import StringIO  # used below to wrap the downloaded file contents
from snakebite.client import Client

# provide the NameNode IPC (inter-process communication) port
INTERNET_PROCESS_CIOMMUNICATION_PORT = "..."

# provide the Name Node of Hadoop
NAME_NODE = "..."

# and get the client of HDFS
CLIENT_HDFS = Client(NAME_NODE, INTERNET_PROCESS_CIOMMUNICATION_PORT)


def read_hdfs_file(file_path_and_name):
    """Reads an hdfs file
    :param file_path_and_name: the path and name of the file to read
    """

    # 1. gets the hdfs file contents and wraps them in a file-like object
    file_obj = None
    for file_contents in CLIENT_HDFS.text([file_path_and_name]):
        file_unicode = file_contents.decode('unicode-escape')
        file_obj = StringIO(file_unicode)

    # 2. read and operate on top:
    for line in file_obj.readlines():
        # ...
        # do operations on the file
        pass
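
A brief usage sketch for the helper above, assuming CLIENT_HDFS points at a reachable NameNode; the path is purely illustrative:

# read a hypothetical metadata file and let read_hdfs_file iterate over its lines
read_hdfs_file('/data/meta/info.txt')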
Ejemplo n.º 53
0
from snakebite.client import Client

client = Client('localhost', 9000)
for x in client.ls(['/']):
    print x
Ejemplo n.º 54
0
    def __init__(self, sc, doclist,
                 ngram_range = [1,1], vocab = None, stop_words = None, nmin = None, nmax = None,
                 num_partitions = None, features_max = None, tokenizer = alpha_tokenizer,
                 hashing = False, load_path = None, hdfs_namenode = None) :

        self._sc = sc
        self._ngram_range = ngram_range
        self._vocab = vocab
        self._stop_words = stop_words
        self._nmin = nmin
        self._nmax = nmax
        self._num_partitions = num_partitions
        self._doclist = doclist
        self._features_max = features_max if features_max is not None else 2**31
        self._tokenizer = tokenizer

        # initialize the RDDs
        self._doc_rdd = None
        self._ngram_rdd = None
        self._vocab_rdd = None
        self._docvec_rdd = None
        self._vocab_map_rdd = None

        # dictionary of RDDs 
        self.rdds = {}

        # initialize other properties
        self._nfeatures = None
        self._hashing = hashing

        # make the vocabulary a set if it isn't one already
        if type(vocab) is not set and vocab is not None: 
            try: 
                self._vocab = set(vocab)
            except TypeError : 
                raise TypeError("Vocabulary must be an iterable like a list, set, etc.")


        if load_path is not None : 
            if load_path[:4] != 'hdfs' : 
                for rdd_name in os.listdir(load_path) :
                    if rdd_name[-3:] == 'rdd' : 
                        self.rdds[rdd_name] = sc.pickleFile(load_path + '/' + rdd_name)
            
            # we're dealing with HDFS
            else : 
                try : 
                    from snakebite.client import Client
                except ImportError : 
                    raise ImportError("package snakebite is required for working with HDFS: pip install snakebite")
                
                if hdfs_namenode is None : 
                    # get the hadoop configuration files from user's environment and extract namenode 
                    import xml.etree.ElementTree
                    hadoop_conf = '%s/core-site.xml' % os.environ['HADOOP_CONF_DIR']
                    tree = xml.etree.ElementTree.parse(hadoop_conf)
                    for prop in tree.findall('property') : 
                        if prop.find('name').text == 'fs.defaultFS' : 
                            dummy, hdfs_namenode, hdfs_port = prop.find('value').text.split(':')
                            hdfs_namenode = hdfs_namenode[2:]                            
                            break
 
                client = Client(hdfs_namenode, int(hdfs_port))
                for rdd_path_dict in client.ls([load_path[7:]]) : 
                    rdd_name = rdd_path_dict['path'].split('/')[-1]
                    if rdd_name[-3:] == 'rdd': 
                        self.rdds[rdd_name] = sc.pickleFile(load_path + '/' + rdd_name)
                    
            print 'Loaded %d RDDs: '%(len(self.rdds))
            for rdd in self.rdds.keys() :
                print rdd

                    
        # make the vital properties dictionary for pickling
        self.properties = {'ngram_range': ngram_range, 
                           'stop_words': stop_words,
                           'nmin': nmin, 
                           'nmax': nmax,
                           'num_partitions': num_partitions,
                           'doclist': doclist,
                           'features_max': features_max,
                           'hashing': hashing,
                           }
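
The namenode lookup buried in the constructor above can also be lifted into a standalone helper. A minimal sketch of the same fs.defaultFS parsing; the helper name is illustrative and it assumes HADOOP_CONF_DIR is set in the environment:

import os
import xml.etree.ElementTree

def default_fs_from_conf():
    # parse $HADOOP_CONF_DIR/core-site.xml and return (namenode_host, port)
    hadoop_conf = os.path.join(os.environ['HADOOP_CONF_DIR'], 'core-site.xml')
    tree = xml.etree.ElementTree.parse(hadoop_conf)
    for prop in tree.findall('property'):
        if prop.find('name').text == 'fs.defaultFS':
            scheme, host, port = prop.find('value').text.split(':')
            return host.lstrip('/'), int(port)
    return None

The returned pair can then be handed to Client(), mirroring what the constructor does when hdfs_namenode is not supplied.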
Ejemplo n.º 55
0
from snakebite.client import Client

client = Client('localhost', 9000)
for f in client.copyToLocal(['/input/input.txt'], '/tmp'):
   print f
Ejemplo n.º 56
0
from snakebite.client import Client

client = Client('localhost', 54310)
for p in client.mkdir(['/foo/bar', '/input'], create_parent=True):
    print p
Ejemplo n.º 57
0
def crfalign(sc, inputFilename, outputDirectory, 
            limit=LIMIT, location='hdfs', outputFormat="text", partitions=None, deleteFirst=True):

    crfConfigDir = os.path.join(os.path.dirname(__file__), "data/config")
    def cpath(n):
        return os.path.join(crfConfigDir, n)

    smEyeColor = HybridJaccard(ref_path=cpath("eyeColor_reference_wiki.txt"),
                               config_path=cpath("eyeColor_config.txt"))
    smHairColor = HybridJaccard(ref_path=cpath("hairColor_reference_wiki.txt"),
                                config_path=cpath("hairColor_config.txt"))
    print smEyeColor, smHairColor

    if location == "hdfs":
        if deleteFirst:
            namenode = "memex-nn1"
            port = 8020
            client = Client(namenode, port, use_trash=True)
            try:
                for deleted in client.delete([outputDirectory], recurse=True):
                    print deleted
            except FileNotFoundException as e:
                pass

    # hypothesis1: data fetched this way prompts the lzo compression error
    # hypothesis2: but it doesn't matter, error is just a warning
    if partitions:
        if limit:
            # load the input first, then truncate to `limit` records and repartition
            rdd_crfl = sc.textFile(inputFilename)
            rdd_crfl = sc.parallelize(rdd_crfl.take(limit))
            rdd_crfl = rdd_crfl.repartition(partitions)
        else:
            print inputFilename
            rdd_crfl = sc.textFile(inputFilename, minPartitions=partitions)
    else:
        rdd_crfl = sc.textFile(inputFilename)
    rdd_crfl.setName('rdd_crfl')
    # rdd_crfl.persist()
    print "beginning: %s partitions" % rdd_crfl.getNumPartitions()

    # "value-only" RDD, not a pair RDD
    # but we have the URI in the -3 position
    # and the index in the -2 position
    rdd_withuri = rdd_crfl.map(lambda x: reconstructTuple(x))

    # Note: groupByKey returns iterable, not data; so no point in printing
    rdd_grouped = rdd_withuri.groupByKey()
    # sort the vectors by index (within key groups)
    rdd_sorted = rdd_grouped.mapValues(lambda x: [l[1:] for l in sorted(x, key=lambda r: int(r[0]))])
    # find all contiguous spans of marked-up tokens
    # returns 0 or more dicts per URI key
    rdd_spans = rdd_sorted.mapValues(lambda x: computeSpans(x, indexed=True))
    # flatten to (URI, single dict) on each line
    rdd_flat = rdd_spans.flatMapValues(lambda x: list(x))
    # rdd_flat = rdd_flat.coalesce(rdd_flat.getNumPartitions() / 3)
    # # map any eyeColor spans using smEyeColor, hairType spans using smHairColor
    # rdd_aligned = rdd_flat.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEyeColor, "hairType": smHairColor}))
    rdd_aligned = rdd_flat.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEyeColor.findBestMatch, "hairType": smHairColor.findBestMatch}))
    # rdd_aligned = rdd_flat.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": fakeFindBestMatch, "hairType": fakeFindBestMatch}))
    # rdd_aligned = rdd_flat.mapValues(lambda x: alignToControlledVocab(x, {}))
    # rdd_aligned = rdd_spans

    # rdd_final = rdd_crfl
    rdd_final = rdd_aligned
    print outputFormat
    if outputFormat == "sequence":
        rdd_final.saveAsSequenceFile(outputDirectory)
    elif outputFormat == "text":
        print "saving to %s" % outputDirectory
        rdd_final.saveAsTextFile(outputDirectory)
    else:
        raise RuntimeError("Unrecognized output format: %s" % outputFormat)