class EffectiveUserTest(MiniClusterTestBase):
    ERR_MSG_TOUCH = "org.apache.hadoop.security.AccessControlException\nPermission denied: user=__foobar"
    ERR_MSG_STAT = "`/foobar2': No such file or directory"
    VALID_FILE = '/foobar'
    INVALID_FILE = '/foobar2'

    def setUp(self):
        self.custom_client = Client(self.cluster.host, self.cluster.port)
        self.custom_foobar_client = Client(host=self.cluster.host,
                                           port=self.cluster.port,
                                           effective_user='******')

    def test_touch(self):
        print tuple(self.custom_client.touchz([self.VALID_FILE]))
        try:
            tuple(self.custom_foobar_client.touchz([self.INVALID_FILE]))
        except Exception, e:
            self.assertTrue(e.message.startswith(self.ERR_MSG_TOUCH))
        self.custom_client.stat([self.VALID_FILE])
        try:
            self.custom_client.stat([self.INVALID_FILE])
        except Exception, e:
            self.assertEquals(e.message, self.ERR_MSG_STAT)
def signature(self):
    client = Client(self._host, self._port, effective_user=self._user, use_trash=False)
    stats = client.stat([self._partial])
    if stats['file_type'] == 'f':
        return "modification_time:{}".format(stats['modification_time'])
    else:
        return stats['file_type']
def test():
    """ """
    client = Client("192.168.99.100", 9000)
    for f in client.ls(['/files']):
        print f
        for line in client.cat([f.get('path')]):
            for l in line:
                print l
def run(self):
    c = Client(self.host, self.port)
    listing = c.ls([self.log_path], recurse=True)
    events = []

    for f in listing:
        path = f['path']
        if not path.endswith('.jhist'):
            continue

        ts = arrow.get(f['modification_time'] / 1000)
        if ts <= self.checktime:
            continue

        job_id = job_pattern.match(path.split('/')[-1]).group(0)
        if job_id in self.jobs and self.jobs[job_id] >= ts.timestamp * 1000:
            log.debug('Skipping processed job: ' + job_id)
            continue

        config_path = path[:path.rfind('/')] + '/' + job_id + '_conf.xml'
        event = {
            'inviso.type': 'mr2',
            'job.id': job_id,
            'application.id': job_id.replace('job_', 'application_'),
            'job.type': 'mr2',
            'file.type': ['history', 'config'],
            'jobflow': self.jobflow,
            'cluster.id': self.cluster_id,
            'cluster': self.cluster_name,
            'history.uri': 'hdfs://%s:%s%s' % (self.host, self.port, path),
            'config.uri': 'hdfs://%s:%s%s' % (self.host, self.port, config_path),
            'host': self.host,
            'port': self.port,
            'timestamp': str(ts),
            'epoch': f['modification_time'],
            'mapreduce.version': 'mr2'
        }

        log.info('Publishing event: (%s) %s %s' % (event['cluster'], event['job.id'], ts))
        events.append(event)

    for chunk in [events[i:i + self.chunk_size] for i in xrange(0, len(events), self.chunk_size)]:
        self.publisher.publish(chunk)
def delete_item(config, filepath='', localpath=''):
    if(config['BACKEND'] == 'hdfs'):
        client = Client(socket.gethostname(), config['HADOOP_RPC_PORT'], use_trash=False)
        del_gen = client.delete([filepath], recurse=True)
        for del_item in del_gen:
            pass
    elif(config['BACKEND'] == 'swift'):
        pass  # To be implemented

    # Deleting modules or datasets from local directories (will also suffice for nfs backend)
    if(os.path.isdir(localpath)):  # Check if it is a dataset
        shutil.rmtree(localpath)
    else:
        try:
            os.remove(localpath)
        except OSError:
            pass
def crfalign(sc, inputFilename, outputDirectory, limit=LIMIT, location='hdfs',
             outputFormat="text", partitions=None, deleteFirst=True):
    # crfConfigDir = os.path.join(os.path.dirname(__file__), "data/config")
    # def cpath(n):
    #     return os.path.join(crfConfigDir, n)
    # smEyeColor = HybridJaccard(ref_path=cpath("eyeColor_reference_wiki.txt"),
    #                            config_path=cpath("eyeColor_config.txt"))
    # smHairColor = HybridJaccard(ref_path=cpath("hairColor_reference_wiki.txt"),
    #                             config_path=cpath("hairColor_config.txt"))
    # print smEyeColor, smHairColor

    if location == "hdfs":
        if deleteFirst:
            namenode = "memex-nn1"
            port = 8020
            client = Client(namenode, 8020, use_trash=True)
            try:
                for deleted in client.delete([outputDirectory], recurse=True):
                    print deleted
            except FileNotFoundException as e:
                pass

    # hypothesis1: data fetched this way prompts the lzo compression error
    # hypothesis2: but it doesn't matter, error is just a warning
    rdd_crfl = sc.textFile(inputFilename)
    rdd_crfl.setName('rdd_crfl')

    if limit:
        rdd_crfl = sc.parallelize(rdd_crfl.take(limit))
    if partitions:
        rdd_crfl = rdd_crfl.repartition(partitions)

    rdd_final = rdd_crfl
    print outputFormat
    if outputFormat == "sequence":
        rdd_final.saveAsSequenceFile(outputDirectory)
    elif outputFormat == "text":
        print "saving to %s" % outputDirectory
        rdd_final.saveAsTextFile(outputDirectory)
    else:
        raise RuntimeError("Unrecognized output format: %s" % outputFormat)
def __init__(self, workflow, **kwargs):
    super(HDFSTextLoader, self).__init__(workflow, **kwargs)
    self.file_name = kwargs["file"]
    self.chunk_lines_number = kwargs.get("chunk", 1000)
    client_kwargs = dict(kwargs)
    del client_kwargs["file"]
    if "chunk" in kwargs:
        del client_kwargs["chunk"]
    self.hdfs_client = Client(**client_kwargs)
    self.output = [""] * self.chunk_lines_number
    self.finished = Bool()
def getTrainedModel(hdfsServer, modelFile):
    hdfsPort = int(os.environ.get('HDFS_NAME_PORT', 8020))
    modelSavePath = "/user/" + os.getenv('LOGNAME') + "/data/model/" + modelFile + '/'

    # Load the saved model data
    hdfs_client = Client(hdfsServer, hdfsPort)
    filesInfo = hdfs_client.ls([modelSavePath])

    # Copy HDFS files to local temp directory
    # First clean up and recreate the temp folder
    copyDir = tempfile.gettempdir() + "/" + modelFile
    shutil.rmtree(copyDir, ignore_errors=True)
    os.makedirs(copyDir)

    res = hdfs_client.copyToLocal([f['path'] for f in filesInfo], copyDir)
    for r in res:
        if not r['result']:
            print "Error: %s" % r

    modelFilePath = copyDir + '/' + modelFile
    print "Load model from %s" % modelFilePath
    return joblib.load(modelFilePath)
def getObjsBackend(objs, backend, config):
    if(backend == 'hdfs'):
        client = Client(socket.gethostname(), config['HADOOP_RPC_PORT'], use_trash=False)
        for obj in objs:
            try:
                copy_gen = client.copyToLocal([obj[0]], obj[1])
                for copy_item in copy_gen:
                    pass
            except Exception as e:
                print(e)
    elif(backend == 'swift'):
        options = {'os_auth_url': os.environ['OS_AUTH_URL'],
                   'os_username': os.environ['OS_USERNAME'],
                   'os_password': os.environ['OS_PASSWORD'],
                   'os_tenant_id': os.environ['OS_TENANT_ID'],
                   'os_tenant_name': os.environ['OS_TENANT_NAME']}
        swiftService = SwiftService(options=options)

        for obj in objs:
            # Create the containers which are used in this application for Object Storage
            if(obj[0] == 'sqlite.db'):
                swiftService.post(container='containerFiles')
                swiftService.post(container='containerFeatures')
                swiftService.post(container='containerModules')

            out_file = obj[1]  # Get the output file location from runner
            localoptions = {'out_file': out_file}
            objects = []
            objects.append(obj[0])
            swiftDownload = swiftService.download(container='containerModules',
                                                  objects=objects, options=localoptions)
            for downloaded in swiftDownload:
                if("error" in downloaded.keys()):
                    raise RuntimeError(downloaded['error'])
                # print(downloaded)
    elif(backend == 'nfs'):
        # Every file is already in respective local dirs
        pass
def get_conn(self):
    '''
    Returns a snakebite HDFSClient object.
    '''
    connections = self.get_connections(self.hdfs_conn_id)
    client = None
    if len(connections) == 1:
        client = Client(connections[0].host, connections[0].port)
    elif len(connections) > 1:
        nn = [Namenode(conn.host, conn.port) for conn in connections]
        client = HAClient(nn)
    else:
        raise HDFSHookException("conn_id doesn't exist in the repository")
    return client
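# A minimal usage sketch for a hook exposing the get_conn() above. Only
# get_conn() is shown in this collection, so the HDFSHook class name and
# the 'hdfs_default' connection id are assumptions for illustration.
hook = HDFSHook(hdfs_conn_id='hdfs_default')
client = hook.get_conn()  # snakebite Client or HAClient, depending on connections
for entry in client.ls(['/tmp']):
    print entry['path']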
def get_conn(self) -> Any:
    """Returns a snakebite HDFSClient object."""
    # When using HAClient, proxy_user must be the same, so is ok to always
    # take the first.
    effective_user = self.proxy_user
    autoconfig = self.autoconfig
    use_sasl = conf.get('core', 'security') == 'kerberos'

    try:
        connections = self.get_connections(self.hdfs_conn_id)

        if not effective_user:
            effective_user = connections[0].login
        if not autoconfig:
            autoconfig = connections[0].extra_dejson.get('autoconfig', False)
        hdfs_namenode_principal = connections[0].extra_dejson.get('hdfs_namenode_principal')
    except AirflowException:
        if not autoconfig:
            raise

    if autoconfig:
        # will read config info from $HADOOP_HOME conf files
        client = AutoConfigClient(effective_user=effective_user, use_sasl=use_sasl)
    elif len(connections) == 1:
        client = Client(
            connections[0].host,
            connections[0].port,
            effective_user=effective_user,
            use_sasl=use_sasl,
            hdfs_namenode_principal=hdfs_namenode_principal,
        )
    elif len(connections) > 1:
        name_node = [Namenode(conn.host, conn.port) for conn in connections]
        client = HAClient(
            name_node,
            effective_user=effective_user,
            use_sasl=use_sasl,
            hdfs_namenode_principal=hdfs_namenode_principal,
        )
    else:
        raise HDFSHookException(
            "conn_id doesn't exist in the repository and autoconfig is not specified"
        )

    return client
def process_worker(queue):
    client = Client("trevally.amer.nevint.com", 9000, use_trash=False, effective_user='******')
    while True:
        afile = queue.get()
        print afile
        try:
            process(client, afile)
        except Exception as e:
            print e
        finally:
            queue.task_done()
class HDFSTextLoader(Unit, TriviallyDistributable):
    def __init__(self, workflow, **kwargs):
        super(HDFSTextLoader, self).__init__(workflow, **kwargs)
        self.file_name = kwargs["file"]
        self.chunk_lines_number = kwargs.get("chunk", 1000)
        client_kwargs = dict(kwargs)
        del client_kwargs["file"]
        if "chunk" in kwargs:
            del client_kwargs["chunk"]
        self.hdfs_client = Client(**client_kwargs)
        self.output = [""] * self.chunk_lines_number
        self.finished = Bool()

    def initialize(self):
        self.debug("Opened %s", self.hdfs_client.stat([self.file_name]))
        self._generator = self.hdfs_client.text([self.file_name])

    def run(self):
        assert not self.finished
        try:
            for i in range(self.chunk_lines_number):
                self.output[i] = next(self._generator)
        except StopIteration:
            self.finished <<= True
def __init__(self, height=28, width=28, channels=1, batch_size=32, images_uri='/', shuffle=True):
    'Initialization'
    self.height = height
    self.width = width
    self.channels = channels
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.images_uri = images_uri

    o = urlparse(self.images_uri)
    if o.scheme == 'hdfs':
        self.images_path = o.path
        self.client = Client(o.hostname, o.port)  # images_uri: 'hdfs://10.0.40.19:9600/daloflow/dataset32x32/'
    else:
        self.images_path = images_uri
        self.client = None
def main(queue):
    client = Client(host, port, use_trash=False, effective_user='******')

    def find_minutes(path, level, result):
        for x in client.ls([path]):
            if level < 5:
                find_minutes(x['path'], level + 1, result)
            else:
                result.append(x['path'])

    min_list = []
    find_minutes('/data/hub/vehicle/MKZ-Grey/2017/08/31', 4, min_list)
    for each in min_list:
        print each
        queue.put(each)
class HDFSStat(object):

    cluster = 'hostname'
    port = 8020
    default_path = '/user/hive/warehouse'

    @staticmethod
    def build_path(table):
        nm = table.split('.')[0]
        tb = table.split('.')[1]
        # Refer to the class attribute explicitly; a bare `default_path`
        # would raise NameError inside a staticmethod.
        return HDFSStat.default_path + '/' + nm + '.db/' + tb

    def __init__(self):
        self.client = Client(HDFSStat.cluster, HDFSStat.port, use_trash=False)

    def latest_partition(self, table_name, table_path=None):
        t_path = HDFSStat.build_path(table_name) if table_path is None else table_path
        latest_dir = list(self.client.ls([t_path])).pop()
        return path.basename(latest_dir['path']).split('=')[1]

    def poke_partition(self, table_name, partition_name, partition, table_path=None):
        t_path = HDFSStat.build_path(table_name) if table_path is None else table_path
        partition_path = t_path + '/' + partition_name + '=' + partition
        return self.client.test(partition_path, exists=True, directory=True, zero_length=False)
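# A small usage sketch for HDFSStat above, assuming a Hive-managed warehouse
# layout; the database, table, and partition names below are placeholders,
# not values taken from the original source.
stat = HDFSStat()
# latest partition value under /user/hive/warehouse/mydb.db/mytable, e.g. 'ds=2017-01-01' -> '2017-01-01'
print stat.latest_partition('mydb.mytable')
# check that a specific partition directory exists and is non-empty
print stat.poke_partition('mydb.mytable', 'ds', '2017-01-01')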
def __init__(self, path, name_node, hive_server, user="******", hive_db="default",
             password=None, nn_port=8020, hive_port=10000):
    # HDFS Connection
    self._client = Client(name_node, nn_port)
    self._db = hive_db
    # Hive Connection
    self._hive = pyhs2.connect(host=hive_server,
                               port=hive_port,
                               authMechanism="PLAIN",
                               database=hive_db,
                               user=user,
                               password=password)
    self._path = path
def __init__(self, topic, user, server, port, web_port, base, hdfs_tmp):
    self.topic = topic
    self.username = user
    self.server = server
    self.port = port
    self.base = base
    self.path = ["%s/%s" % (base, topic)]
    self.hdfs_tmp = hdfs_tmp
    try:
        self.client = Client(server, port, effective_user=user)
        self.hdfsclient = hdfs.client.InsecureClient(
            "http://%s:%d" % (server, web_port), user=user)
        self.daylist = self.check()
    except:
        print "Base path %s does not contain valid structure" % (base)
        raise
def get_conn(self):
    '''
    Returns a snakebite HDFSClient object.
    '''
    use_sasl = False
    if conf.get('core', 'security') == 'kerberos':
        use_sasl = True

    connections = self.get_connections(self.hdfs_conn_id)
    client = None
    if len(connections) == 1:
        client = Client(connections[0].host, connections[0].port, use_sasl=use_sasl)
    elif len(connections) > 1:
        nn = [Namenode(conn.host, conn.port) for conn in connections]
        client = HAClient(nn, use_sasl=use_sasl)
    else:
        raise HDFSHookException("conn_id doesn't exist in the repository")
    return client
def get_bite(self):
    """
    If Luigi has forked, we have a different PID, and need to reconnect.
    """
    if self.pid != os.getpid() or not self._bite:
        client_kwargs = dict(filter(lambda k_v: k_v[1] is not None and k_v[1] != '', {
            'hadoop_version': self.config.getint("hdfs", "client_version", None),
            'effective_user': self.config.get("hdfs", "effective_user", None)
        }.iteritems()))
        if self.config.getboolean("hdfs", "snakebite_autoconfig", False):
            """
            This is fully backwards compatible with the vanilla Client and can be used for a non HA cluster as well.
            This client tries to read ``${HADOOP_PATH}/conf/hdfs-site.xml`` to get the address of the namenode.
            The behaviour is the same as Client.
            """
            from snakebite.client import AutoConfigClient
            self._bite = AutoConfigClient(**client_kwargs)
        else:
            from snakebite.client import Client
            self._bite = Client(self.config.get("hdfs", "namenode_host"),
                                self.config.getint("hdfs", "namenode_port"),
                                **client_kwargs)
    return self._bite
def get_conn(self):
    '''
    Returns a snakebite HDFSClient object.
    '''
    connections = self.get_connections(self.hdfs_conn_id)

    use_sasl = False
    if configuration.get('core', 'security') == 'kerberos':
        use_sasl = True

    client = None

    ''' When using HAClient, proxy_user must be the same, so is ok to always take the first '''
    effective_user = self.proxy_user or connections[0].login
    if len(connections) == 1:
        autoconfig = connections[0].extra_dejson.get('autoconfig', False)
        if autoconfig:
            client = AutoConfigClient(effective_user=effective_user, use_sasl=use_sasl)
        else:
            hdfs_namenode_principal = connections[0].extra_dejson.get('hdfs_namenode_principal')
            client = Client(connections[0].host, connections[0].port,
                            effective_user=effective_user,
                            use_sasl=use_sasl,
                            hdfs_namenode_principal=hdfs_namenode_principal)
    elif len(connections) > 1:
        hdfs_namenode_principal = connections[0].extra_dejson.get('hdfs_namenode_principal')
        nn = [Namenode(conn.host, conn.port) for conn in connections]
        client = HAClient(nn, effective_user=effective_user, use_sasl=use_sasl,
                          hdfs_namenode_principal=hdfs_namenode_principal)
    else:
        raise HDFSHookException("conn_id doesn't exist in the repository")
    return client
def compose_hdfs_commands(year, month, day, args, config):
    # set up the hdfs client to be used in order to check the files
    namenode = config.get("HDFS", "namenode")
    client = Client(namenode.hostname, namenode.port, use_trash=False)

    # hdfs sync path for the tenant
    hdfs_user = config.get("HDFS", "user")
    tenant = args.tenant
    hdfs_sync = config.get("HDFS", "path_sync")
    hdfs_sync = hdfs_sync.fill(namenode=namenode.geturl(), hdfs_user=hdfs_user, tenant=tenant).geturl()

    # dictionary holding all the commands with their respective arguments' name
    hdfs_commands = dict()

    # file location of metric profile (local or hdfs)
    hdfs_commands["--sync.mps"] = date_rollback(
        hdfs_sync + "/" + args.report + "/" + "metric_profile_" + "{{date}}" + ".avro",
        year, month, day, config, client)

    # file location of operations profile (local or hdfs)
    hdfs_commands["--sync.ops"] = hdfs_check_path(hdfs_sync + "/" + args.tenant + "_ops.json", client)

    # file location of aggregations profile (local or hdfs)
    hdfs_commands["--sync.apr"] = hdfs_check_path(
        hdfs_sync + "/" + args.tenant + "_" + args.report + "_ap.json", client)

    # file location of endpoint group topology file (local or hdfs)
    hdfs_commands["-sync.egp"] = date_rollback(
        hdfs_sync + "/" + args.report + "/" + "group_endpoints_" + "{{date}}" + ".avro",
        year, month, day, config, client)

    return hdfs_commands
def compose_hdfs_commands(year, month, day, args, config):
    # set up the hdfs client to be used in order to check the files
    namenode = config.get("HDFS", "namenode")
    client = Client(namenode.hostname, namenode.port, use_trash=False)

    # hdfs sync path for the tenant
    hdfs_user = config.get("HDFS", "user")
    tenant = args.tenant
    hdfs_sync = config.get("HDFS", "path_sync")
    hdfs_sync = hdfs_sync.fill(namenode=namenode.geturl(), hdfs_user=hdfs_user, tenant=tenant).geturl()

    hdfs_metric = config.get("HDFS", "path_metric")
    hdfs_metric = hdfs_metric.fill(namenode=namenode.geturl(), hdfs_user=hdfs_user, tenant=tenant).geturl()

    # dictionary holding all the commands with their respective arguments' name
    hdfs_commands = dict()

    # file location of previous day's metric data (local or hdfs)
    hdfs_commands["--pdata"] = hdfs_check_path(
        hdfs_metric + "/" + str(datetime.date(year, month, day) - datetime.timedelta(1)), client)

    # file location of target day's metric data (local or hdfs)
    hdfs_commands["--mdata"] = hdfs_check_path(hdfs_metric + "/" + args.date, client)

    # file location of report configuration json file (local or hdfs)
    hdfs_commands["--conf"] = hdfs_check_path(
        hdfs_sync + "/" + args.tenant + "_" + args.report + "_cfg.json", client)

    # file location of metric profile (local or hdfs)
    hdfs_commands["--mps"] = date_rollback(
        hdfs_sync + "/" + args.report + "/" + "metric_profile_" + "{{date}}" + ".avro",
        year, month, day, config, client)

    # file location of operations profile (local or hdfs)
    hdfs_commands["--ops"] = hdfs_check_path(hdfs_sync + "/" + args.tenant + "_ops.json", client)

    # file location of aggregations profile (local or hdfs)
    hdfs_commands["--apr"] = hdfs_check_path(
        hdfs_sync + "/" + args.tenant + "_" + args.report + "_ap.json", client)

    if args.thresholds:
        # file location of thresholds rules file (local or hdfs)
        hdfs_commands["--thr"] = hdfs_check_path(
            os.path.join(hdfs_sync, "".join([args.tenant, "_", args.report, "_thresholds.json"])), client)

    # file location of endpoint group topology file (local or hdfs)
    hdfs_commands["-egp"] = date_rollback(
        hdfs_sync + "/" + args.report + "/" + "group_endpoints_" + "{{date}}" + ".avro",
        year, month, day, config, client)

    # file location of group of groups topology file (local or hdfs)
    hdfs_commands["-ggp"] = date_rollback(
        hdfs_sync + "/" + args.report + "/" + "group_groups_" + "{{date}}" + ".avro",
        year, month, day, config, client)

    # file location of weights file (local or hdfs)
    hdfs_commands["--weights"] = date_rollback(
        hdfs_sync + "/" + args.report + "/weights_" + "{{date}}" + ".avro",
        year, month, day, config, client)

    # file location of downtimes file (local or hdfs)
    hdfs_commands["--downtimes"] = hdfs_check_path(
        hdfs_sync + "/" + args.report + "/downtimes_" + str(datetime.date(year, month, day)) + ".avro", client)

    # file location of recomputations file (local or hdfs)
    # first check if there is a recomputations file for the given date
    # recomputation lies in the hdfs in the form of
    # /sync/recomp_TENANTNAME_ReportName_2018-08-02.json
    if client.test(urlparse(hdfs_sync + "/recomp_" + args.tenant + "_" + args.report + "_" + args.date + ".json").path, exists=True):
        hdfs_commands["--rec"] = hdfs_sync + "/recomp_" + args.tenant + "_" + args.report + "_" + args.date + ".json"
    else:
        hdfs_commands["--rec"] = hdfs_check_path(hdfs_sync + "/recomp.json", client)

    return hdfs_commands
from snakebite.client import Client
from constants import NAMENODE_PORT

client = Client('localhost', NAMENODE_PORT)
for p in client.mkdir(['/foo/bar', '/input'], create_parent=True):
    print p
class HdfsReader:
    """
    HdfsReader class

    Connects to an hdfs endpoint (namenode) and checks argo profile files stored there
    Uses a specific base path for determining argo file destinations
    """

    def __init__(self, namenode, port, base_path):
        """
        Initialized HdfsReader which is used to check/read profile files from hdfs

        Args:
            namenode: str. hdfs namenode host
            port: int. hdfs namenode port
            base_path: str. base path to destination used for argo
        """
        self.client = Client(namenode, port)
        self.base_path = base_path

    def gen_profile_path(self, tenant, report, profile_type):
        """
        Generates a valid hdfs path to a specific profile

        Args:
            tenant: str. tenant to be used
            report: str. report to be used
            profile_type: str. profile_type (operations|reports|aggregations|thresholds)

        Returns:
            str: hdfs path
        """
        templates = dict()
        templates.update({
            'operations': '{0}_ops.json',
            'aggregations': '{0}_{1}_ap.json',
            'reports': '{0}_{1}_cfg.json',
            'thresholds': '{0}_{1}_thresholds.json',
            'recomputations': 'recomp.json'
        })

        sync_path = self.base_path.replace("{{tenant}}", tenant)
        filename = templates[profile_type].format(tenant, report)
        return os.path.join(sync_path, filename)

    def cat(self, tenant, report, profile_type):
        """
        Returns the contents of a profile stored in hdfs

        Args:
            tenant: str. tenant name
            report: str. report name
            profile_type: str. profile type (operations|reports|aggregations|thresholds)

        Returns:
        """
        path = self.gen_profile_path(tenant, report, profile_type)
        try:
            txt = self.client.cat([path])
            j = json.loads(txt.next().next())
            return j, True
        except FileNotFoundException:
            return None, False

    def rem(self, tenant, report, profile_type):
        """
        Removes a profile file that already exists in hdfs (in order to be replaced)

        Args:
            tenant: str. tenant name
            report: str. report name
            profile_type: str. profile type (operations|reports|aggregations|thresholds)

        Returns:
        """
        path = self.gen_profile_path(tenant, report, profile_type)
        try:
            self.client.delete([path]).next()
            return True
        except FileNotFoundException:
            return False
    # pipe '|' is forbidden in wiki titles and would make a good delimiter
    out_str = page.encode('utf-8').strip() + "|" + date_str + pageview_str + \
        daily_trend_str + weekly_trend_str + monthly_trend_str
    return out_str

# define spark context
conf = (SparkConf().setAppName("Wiki Page Views Trends")
        .set("spark.hadoop.validateOutputSpecs", "false"))
sc = SparkContext(conf=conf)

# set custom connection
if (run_mode == "hdfs" or out_mode == "hdfs"):
    # spotify's snakebite as hdfs client
    hdfs_client = Client(cfg.get("hdfs", "hdfs_master_hostname"), 9000, use_trash=False)

if (run_mode == "swift" or out_mode == "swift"):
    swiftConf = sc._jsc.hadoopConfiguration()
    for key, value in SWIFT_DEFAULT_CONFIGS.items():
        swiftConf.set(key, value)
    swift_client = swift.Connection(user=swift_user, key=swift_key, authurl=swift_authurl)

# read list of files
src_files = []
if run_mode == "hdfs":
# Create kafka client
print "Create kafka client to: %s" % args.kafka
kafka = KafkaClient(args.kafka + ':9092')
producer = SimpleProducer(kafka)

# Read testing data from hdfs
hdfsServer = args.hdfs
hdfsPort = int(os.environ.get('HDFS_NAME_PORT', 8020))
hdfsHost = "hdfs://" + hdfsServer + ":" + str(hdfsPort)
topic = args.topic

from snakebite.client import Client
print "Reading input from HDFS: server=%s, port=%d" % (hdfsServer, hdfsPort)
client = Client(hdfsServer, hdfsPort)
data_file = client.text(["/user/" + os.getenv('LOGNAME') + "/data/X_test.txt"]).next()
label_file = client.text(["/user/" + os.getenv('LOGNAME') + "/data/y_test.txt"]).next()

samples = data_file.splitlines()
labels = label_file.splitlines()
test_data = zip(samples, labels)
random.shuffle(test_data)  # Shuffle it

import random
import time
import itertools

def getActivityName(a):
    a = int(a)
    if a in range(1, 7):
from snakebite.client import Client

client = Client('localhost', 9000)
for x in client.ls(['/']):
    print x
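# A hedged companion sketch to the listing above, pointed at the same local
# namenode on port 9000; it assumes the snakebite Client also exposes du()
# and df(), which recent snakebite releases provide.
from snakebite.client import Client

client = Client('localhost', 9000)
print client.df()               # filesystem capacity/usage summary
for entry in client.du(['/']):  # per-path usage, returned lazily
    print entry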
def exists(self):
    client = Client(self._host, self._port, effective_user=self._user, use_trash=False)
    return client.test(self._partial, exists=True)
class HDFS_topic(object): def __init__(self, topic, user, server, port, web_port, base, hdfs_tmp): self.topic = topic self.username = user self.server = server self.port = port self.base = base self.path = ["%s/%s" % (base, topic)] self.hdfs_tmp = hdfs_tmp try: self.client = Client(server, port, effective_user=user) self.hdfsclient=hdfs.client.InsecureClient(\ "http://%s:%d" % (server,web_port),user=user) self.daylist = self.check() except: print "Base path %s does not contain valid structure" % (base) raise # # Check basic hdfs access and that directory format is appropiate # also builds datelist structure # def check(self): self.content = self.client.ls(self.path) ret = [] for item in self.content: (head, tail) = os.path.split(item['path']) try: parse(tail, yearfirst=True, dayfirst=True) if item['file_type'] == 'd': ret.append(tail) else: print("WARNING: %s is not a directory, skipping\n" % (item['path'])) except: print("WARNING: %s is not in date format, skipping\n" % (tail)) if len(ret) > 0: ret.sort(key=lambda x: datetime.strptime(x, "%Y-%m-%d")) return ret else: return false # # Give a date, check if that date is on the dirlist and return matching dir entry # def day_in_topic(self, date): for item in self.daylist: if parse(date) == parse(item): return item return False # # Check and validates date_from and date_to arguments # def check_date_range(self, date_from, date_to): if date_from: try: parse(date_from) except: raise ValueError("FATAL: start date (%s) invalid date format" % (date_from)) if (parse(date_from) < parse(self.daylist[0])) or ( parse(date_from) > parse(self.daylist[-1])): raise ValueError( "FATAL: start date (%s) not in range (%s ---> %s)" % (date_from, self.daylist[0], self.daylist[-1])) else: ret_from = parse(date_from).strftime("%Y-%m-%d") while not self.day_in_topic(ret_from): print "WARNING: start date %s not in topic %s, trying next day" % ( ret_from, self.topic) ret_from = datetime.strftime( (parse(ret_from) + timedelta(days=1)), "%Y-%m-%d") ret_from = self.day_in_topic(ret_from) else: ret_from = self.daylist[0] if date_to: try: parse(date_to) except: raise ValueError("FATAL: end date (%s) invalid date format" % (date_to)) if (parse(date_to) < parse(self.daylist[0])) or ( parse(date_to) > parse(self.daylist[-1])): raise ValueError( "FATAL: end date (%s) not in range (%s ---> %s)" % (date_to, self.daylist[0], self.daylist[-1])) else: ret_to = parse(date_to).strftime("%Y-%m-%d") else: ret_to = self.daylist[-1] if (parse(ret_from) > parse(ret_to)): raise ValueError( "FATAL: start date (%s) must be <= end date (%s)" % (ret_from, ret_to)) return (ret_from, ret_to) # # Traverses the list of valid directories and merges each day # def merge(self, date_from="", date_to=""): day = "" try: (day, date_to) = self.check_date_range(date_from, date_to) except Exception as err: raise ValueError(err) print "INFO: Trying to merge %s from %s to %s\n" % (self.topic, day, date_to) while (parse(day) <= parse(date_to)): if self.day_in_topic(day): self.merge_day(day) else: print "WARNING: %s is not on %s, skipping\n" % (day, self.path) day = datetime.strftime((parse(day) + timedelta(days=1)), "%Y-%m-%d") while not self.day_in_topic(day) and parse(day) <= parse(date_to): print "WARNING: %s not found in %s, trying next day" % ( day, self.topic) day = datetime.strftime((parse(day) + timedelta(days=1)), "%Y-%m-%d") day = self.day_in_topic(day) if not day: return return True # # Given a date, if there are files that are not .snappy download and remove them, then getmerge, and upload 
everything # def merge_day(self, date): print "INFO: processing ", date daytmp = "%s/snappymerge-%s-tmp" % (self.hdfs_tmp, date) daypath = ["%s/%s/%s/" % (self.base, self.topic, date)] #mergedfile="./%s-merged.snappy" % (date) mergedfile = "./%s-merged.snappy" % (datetime.strftime( datetime.now(), "%Y-%d-%m.%f")) day_files = [x['path'] for x in self.client.ls(daypath)] print "INFO: DAYPATH: ", daypath try: os.remove(mergedfile) except: pass if len([x for x in day_files if x.endswith('.snappy')]) <= 1: print "WARNING: %s does not have enough files to getmerge, skipping" % ( date) return if [file for file in day_files if not file.endswith('.snappy')]: print "WARNING: %s contains a non snappy file (%s), moving *snappy to %s getmerge there\n" % ( daypath, file, daytmp) self.merge_with_move(daypath[0], daytmp, day_files, mergedfile) else: print "INFO: MERGING ", daypath[0] result = self.client.getmerge(daypath[0], mergedfile) print[x for x in result if not x['result']] print "INFO: DELETING original files in ", daypath[0] for file in day_files: print "INFO: Deleting original file ", file self.hdfsclient.delete(file) print "INFO: UPLOADING merged (%s) to %s" % (mergedfile, daypath[0]) self.hdfsclient.upload(daypath[0], mergedfile, overwrite=True) os.remove(mergedfile) return # # When there are files that do not contain .snappy suffix, merge with move, first moves everyting to an hdfs temp dir, merges there, and uploads # def merge_with_move(self, day_path, day_tmp, dayfiles, merged_file): self.hdfsclient.makedirs(day_tmp) print "INFO: MOVING files to ", day_tmp snap = [x for x in dayfiles if x.endswith(".snappy")] result = self.client.rename(snap, day_tmp) print[x['path'] for x in result if not x['result']] print "INFO: MERGING files in ", day_tmp result = self.client.getmerge(day_tmp, merged_file) print[x['path'] for x in result if not x['result']] print "INFO: UPLOADING merged (%s) to %s" % (merged_file, day_path) self.hdfsclient.upload(day_path, merged_file, overwrite=True) os.remove(merged_file) print "INFO: Deleting files on ", day_tmp self.hdfsclient.delete(day_tmp, recursive=True)
import argparse
import subprocess

parser = argparse.ArgumentParser()
parser.add_argument("--hdfs", help="HDFS FS name", default='localhost')
parser.add_argument("--model", help="Name of model file", default='belt.model')
args = parser.parse_args()

hdfsServer = args.hdfs
hdfsPort = int(os.environ.get('HDFS_NAME_PORT', 8020))
hdfsHost = "hdfs://" + hdfsServer + ":" + str(hdfsPort)
modelSavePath = "/user/" + os.getenv('LOGNAME') + "/data/model/" + args.model + "/"
print "hdfs=%s, savePath=%s, hdfsHost=%s" % (hdfsServer, modelSavePath, hdfsHost)

hdfs_client = Client(hdfsServer, hdfsPort)
X_train_file = hdfs_client.text(["/user/" + os.getenv('LOGNAME') + "/data/X_train.txt"]).next()
y_train_file = hdfs_client.text(["/user/" + os.getenv('LOGNAME') + "/data/y_train.txt"]).next()

X_train = np.genfromtxt(str.splitlines(X_train_file))
y_train = np.genfromtxt(str.splitlines(y_train_file))

clf = LogisticRegression()
clf = clf.fit(X_train, y_train)
files = joblib.dump(clf, "belt.model")

subprocess.check_call(['hdfs', 'dfs', '-rm', '-r', '-f', modelSavePath], shell=False)
subprocess.check_call(['hdfs', 'dfs', '-mkdir', '-p', modelSavePath], shell=False)
#!/usr/bin/env python
import os
from snakebite.client import Client

client = Client("trevally.amer.nevint.com", 9000, use_trash=False, effective_user='******')

#for res in client.mkdir(['/user/hadoop/test/move/file'], create_parent=True, mode=755):
#    print res

for res in client.rename(['/user/hadoop/test.tar'], '/user/hadoop/test3.tar'):
    print res
def is_exist(dirPath, master=public.SPARK_MASTER, port=public.SPARK_MASTER_PORT):
    client = Client(master, port, use_trash=False)
    return client.test(dirPath, exists=True, directory=True)
from snakebite.client import Client

client = Client('localhost', 9000)
for l in client.text(['/input/input.txt']):
    print l
def remove(self):
    client = Client(self._host, self._port, effective_user=self._user, use_trash=False)
    it = client.delete([self._partial], recurse=True)
    for elmt in it:
        pass
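# The exists, remove, and signature methods in this collection all read
# self._host, self._port, self._user, and self._partial. The enclosing class
# is not shown, so the sketch below is hypothetical: the class and argument
# names are assumptions, offered only to make the methods self-contained.
class HdfsTarget(object):
    def __init__(self, host, port, user, path):
        self._host = host      # namenode host
        self._port = port      # namenode RPC port (commonly 8020)
        self._user = user      # effective HDFS user
        self._partial = path   # path this target refers to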
def get_locations(filename, name_host, name_port, data_root='/data/dfs/dn'):
    client = Client(name_host, name_port, use_trash=False)
    files = list(client.ls([filename]))
    return [pair for file in files for pair in find(file, client, data_root)]
def test_request(self):
    from snakebite.client import Client
    client = Client("10.0.137.24", 8022, use_trash=False)
    for x in client.ls(['/user']):
        print x
def health_check():
    c = Client("namenode", 8020)
    print "Checking for %s directory..." % sys.argv[1]
    for top_level in c.ls([sys.argv[1]]):
        print "DIR CHILD=%s" % top_level['path']
    print "Ok!"
def main(args): xml = minidom.parse(path.join(os.environ["HADOOP_HOME"], "etc", "hadoop", "hdfs-site.xml")) element = [ x for x in xml.getElementsByTagName("property") if (x.getElementsByTagName("name")[0] .childNodes[0] .nodeValue == "dfs.namenode.http-address") ][0] namenode = (element.getElementsByTagName("value")[0] .childNodes[0] .nodeValue.split(":")[0]) fs = HDFS(namenode, 8020) path_prefix = "/amplab/text" for size in args.sizes: timings = {} MPI.COMM_WORLD.Barrier() if c_rank == 0: tic() file_list = None if c_rank == 0: file_list = [ entry["path"] for entry in fs.ls([ path.join(path_prefix, size, "uservisits")])] file_list = [file_list[i::c_size] for i in range(c_size)] file_list = MPI.COMM_WORLD.scatter(file_list, root=0) MPI.COMM_WORLD.Barrier() if c_rank == 0: timings["open-and-register"] = toc() MPI.COMM_WORLD.Barrier() if c_rank == 0: tic() os_results = reduce_data(row_iterator(file_list, fs), 4, "os") MPI.COMM_WORLD.Barrier() if c_rank == 0: timings["q-stats-by-os"] = toc() if c_rank == 0: os_results.index = os_results.pop("os") MPI.COMM_WORLD.Barrier() if c_rank == 0: tic() browser_results = reduce_data(row_iterator(file_list, fs), 6, "browser") MPI.COMM_WORLD.Barrier() if c_rank == 0: timings["q-stats-by-browser"] = toc() if c_rank == 0: browser_results.index = browser_results.pop("browser") if c_rank == 0: top_dir = path.join("results", size, "mpi", str(args.nodes)) mkdir_p(top_dir) with open(path.join(top_dir, "timings"), "w") as f: for entry in timings.items(): f.write("%s, %.18e\n" % entry) f.flush() browser_results.to_pickle(path.join(top_dir, "browser")) os_results.to_pickle(path.join(top_dir, "os")) return 0
class DataGenerator(object): 'Generates data for Keras' ''' Initialization function of the class ''' def __init__(self, height=28, width=28, channels=1, batch_size=32, cache_mode='', images_uri='/', shuffle=True): 'Initialization' self.debug = False self.height = height self.width = width self.channels = channels self.batch_size = batch_size self.shuffle = shuffle self.cache_mode = cache_mode self.images_uri = images_uri o = urlparse(self.images_uri) if o.scheme == 'hdfs': self.images_path = o.path self.client = Client( o.hostname, o.port ) # images_uri: 'hdfs://10.0.40.19:9600/daloflow/dataset32x32/' else: self.images_path = images_uri self.client = None ''' Set debug mode True/False ''' def set_debug(self, debug_mode): 'Do not show or show messages' self.debug = debug_mode if self.debug == True: print(' * Debug mode: ' + self.debug) print(' * Height: ' + self.height) print(' * Width: ' + self.width) print(' * Channels: ' + self.channels) print(' * Batch_size: ' + self.batch_size) print(' * Shuffle: ' + self.shuffle) print(' * Cache mode: ' + self.cache_mode) print(' * Image uri: ' + self.images_uri) ''' Goes through the dataset and outputs one batch at a time. ''' def generate(self, labels, list_IDs, yield_labels=True): 'Generates batches of samples' # Infinite loop while 1: # Generate random order of exploration of dataset (to make each epoch different) indexes = self.__get_exploration_order(list_IDs) # Generate batches imax = int(len(indexes) / self.batch_size) # number of batches for i in range(imax): # Find list of IDs for one batch list_IDs_temp = [ list_IDs[k] for k in indexes[i * self.batch_size:(i + 1) * self.batch_size] ] # Train, validation X, y = self.__data_generation(labels, list_IDs_temp, yield_labels) yield X, y ''' Generates a random order of exploration for a given set of list_IDs. If activated, this feature will shuffle the order in which the examples are fed to the classifier so that batches between epochs do not look alike. Doing so will eventually make our model more robust. ''' def __get_exploration_order(self, list_IDs): 'Generates order of exploration' # Find exploration order indexes = np.arange(len(list_IDs)) if self.shuffle == True: np.random.shuffle(indexes) return indexes ''' Get data: local ''' def __get_data_local(self, image_file_name): 'Get data from local file system path' pixels = None try: with open(image_file_name, 'rb') as image_file: pixels = np.fromstring(zlib.decompress(image_file.read()), dtype=np.uint8, sep='').reshape(self.height, self.width, self.channels) except: if self.debug == True: print('Exception ' + str(sys.exc_info()[0]) + ' on file ' + image_file_name) return pixels ''' Get data: remote ''' def __get_data_remote(self, image_file_name): 'Get data from HDFS' pixels = None if self.client == None: return pixels try: t = '/tmp/image.dat.' 
+ str(os.getpid()) if os.path.exists(t): os.remove(t) for f in self.client.copyToLocal([image_file_name], t): if f['result'] == True: with open(t, 'rb') as image_file: pixels = np.fromstring( zlib.decompress(image_file.read()), dtype=np.uint8, sep='').reshape(self.height, self.width, self.channels) os.remove(t) else: print('File ' + f['path'] + ' NOT copied because "' + str(f['error']) + '", sorry !') except: if self.debug == True: print('Exception ' + str(sys.exc_info()[0]) + ' on file ' + image_file_name) return pixels ''' Get data: local or remote ''' def __get_data(self, image_file_name): 'Get data: local or remote' pixels = None #print(' * image file name: ' + image_file_name) if self.cache_mode == 'hdfs2local' or self.cache_mode == 'hdfs2local-full': pixels = self.__get_data_local(image_file_name) elif self.cache_mode == 'nocache': pixels = self.__get_data_remote(image_file_name) elif self.cache_mode == 'hdfs2local-partial': pixels = self.__get_data_local(image_file_name) if pixels == None: pixels = self.__get_data_remote(image_file_name) else: print('ERROR: unknown "' + self.cache_mode + '" cache mode') return pixels ''' Outputs batches of data and only needs to know about the list of IDs included in batches as well as their corresponding labels. ''' def __data_generation(self, labels, list_IDs_temp, yield_labels): 'Generates data of batch_size samples' # X : (n_samples, v_size, v_size, v_size, n_channels) # Initialization X = np.empty((self.batch_size, self.height, self.width, self.channels), dtype='float32') y = np.empty((self.batch_size), dtype='float32') # Generate data for i, ID in enumerate(list_IDs_temp): # Decompress image into pixel NumPy tensor image_file_name = self.images_path + '/'.join( ID.split('/')[1:]) + '.tar.gz' # Read image pixels = self.__get_data(image_file_name) # Store volume #pixels = np.rollaxis(pixels, 0, 3) # from 'channels_first' to 'channels_last' X[i, :, :, :] = pixels # get y value y_value = labels[ID] y[i] = y_value # return X and Y (train, validation) return X, y ''' Please note that Keras only accepts labels written in a binary form (in a 6-label problem, the third label is writtten [0 0 1 0 0 0]), which is why we need the sparsify function to perform this task, should y be a list of numerical values. ''' def sparsify1(self, y): 'Returns labels in binary NumPy array' return np.array([[1 if y[i] == j else 0 for j in range(10)] for i in range(y.shape[0])])
class Loader: """ The idea of the loader is to provide a convenient interface to create a new table based on some input files """ def __init__(self, path, name_node, hive_server, user="******", hive_db="default", password=None, nn_port=8020, hive_port=10000): # HDFS Connection self._client = Client(name_node, nn_port) self._db = hive_db # Hive Connection self._hive = pyhs2.connect(host=hive_server, port=hive_port, authMechanism="PLAIN", database=hive_db, user=user, password=password) self._path = path def load(self): # Check data to see which kind it is files = self._client.ls([self._path]) files = [f for f in files if f['file_type'] == 'f'] if len(files) == 0: raise Exception("Cannot load empty directory") # Pick the first file and assume that it has the same content as the others data = self.head(files[0]['path']) res = self.check_separator(data) if res == None: # We cant load the data and better abort here print("cant load data, cannot find a separator") return sep = res[0] num_cols = res[1] # Build table statement table_statement, table_name = self._create_table(self._path, sep, num_cols) cursor = self._hive.cursor() cursor.execute(table_statement) return self._db, table_name def _create_table(self, path, sep, count): buf = """CREATE EXTERNAL TABLE pyxplorer_data ( %s )ROW FORMAT DELIMITED FIELDS TERMINATED BY '%s' STORED AS TEXTFILE LOCATION '%s' """ % (",".join(["col_%d string" % x for x in range(count)]), sep, path) return buf, "pyxplorer_data" def check_separator(self, data): """ THis method evaluates a list of separators on the input data to check which one is correct. This is done by first splitting the input by newline and then checking if the split by separator is equal for each input row except the last that might be incomplete due to the limited input data :param data: input data to check :return: """ sep_list = [r'\t', r';', r',', r'|', r'\s+'] data_copy = data for sep in sep_list: # Check if the count matches each line splitted = data_copy.split("\n") parts = [len(re.split(sep, line)) for line in splitted] # If we did not split anything continue if sum(parts) == len(splitted): continue diff = 0 for i in range(len(parts[1:-1])): diff += abs(parts[i] - parts[i + 1]) if diff == 0: return sep, parts[0] # If we reach this point we did not find a separator return None def head(self, file_path): """ Onlye read the first packets that come, try to max out at 1024kb :return: up to 1024b of the first block of the file """ processor = lambda path, node, tail_only=True, append=False: self._handle_head( path, node) # Find items and go for item in self._client._find_items([file_path], processor, include_toplevel=True, include_children=False, recurse=False): if item: return item def _handle_head(self, path, node, upper=1024 * 1024): data = '' for load in self._client._read_file(path, node, tail_only=False, check_crc=False): data += load if (len(data) > upper): return data return data
from snakebite.client import Client
from constants import *

client = Client('localhost', NAMENODE_PORT)
for p in client.delete(['/foo/bar', '/input'], recurse=True):
    print p
import sys, string, getpass, time, datetime
import happybase
from snakebite.client import Client
import pprint
import urllib, json, ast, zlib, os

hdfs = Client("ip-172-31-17-255")
#for x in hdfs.ls(['/']):
#    print x

hbase = happybase.Connection('localhost')
hbase_settings_table = hbase.table('settings')

##get urls and add new ones if necessary
#TODO:uncomment below
# available_symbols_web = urllib.urlopen("http://api.bitcoincharts.com/v1/markets.json")
# available_symbols = json.loads(available_symbols_web.read())
# csv_settings_urls = hbase_settings_table.row('bitcoin_csv', columns=['urls'])
# known_symbols = [ key.split(':')[1] for key,val in csv_settings_urls.items() ]
# load_dict = {}
# for symbol in available_symbols:
#     if (symbol['symbol'] not in known_symbols):
#         load_dict['urls:' + symbol['symbol']] = str({'status':'',
#             'symbol':symbol['symbol'],
#             'url':'http://api.bitcoincharts.com/v1/trades.csv?symbol=' + symbol['symbol']})
#hbase_settings_table.put('bitcoin_csv', load_dict)

def get_csv_file(hadoop_path, symbol, url):
    csv_data = urllib.urlopen(url)
def main(opts, args): hadoop_host = HADOOP_HOST hadoop_user_dir = None if opts.hdfs: print("hdfs enter") if opts.host: hadoop_host = opts.host hadoop_user_dir = opts.hdfs uni_gram_cnt = 0 bi_gram_cnt = 0 tri_gram_cnt = 0 four_gram_cnt = 0 five_gram_cnt = 0 result_buffer = [] source_input = None if not hadoop_user_dir: if len(args) > 2: source_input = sys.argv[1] else: source_input = sys.stdin for line in source_input: result_buffer.append(line) items = line.split() items_cnt = len(items) if items_cnt == 3: # 1-grams uni_gram_cnt +=1 elif items_cnt == 4: #2-grams bi_gram_cnt += 1 elif items_cnt == 5: #3-grams tri_gram_cnt += 1 elif items_cnt == 6: four_gram_cnt += 1 elif items_cnt == 7: five_gram_cnt +=1 else: print "connect to haddoop" hadoop_client = Client(hadoop_host, 8020, use_trash=False) for g in hadoop_client.cat([os.path.join(hadoop_user_dir, "*.txt")]): for line in g: result_buffer.append(line) items = line.split() items_cnt = len(items) if items_cnt == 3: # 1-grams uni_gram_cnt +=1 elif items_cnt == 4: #2-grams bi_gram_cnt += 1 elif items_cnt == 5: #3-grams tri_gram_cnt += 1 elif items_cnt == 6: four_gram_cnt += 1 elif items_cnt == 7: five_gram_cnt +=1 print('\\data\\') if uni_gram_cnt != 0: print("ngram 1=%s" % uni_gram_cnt) if bi_gram_cnt != 0: print("ngram 2=%s" % bi_gram_cnt) if tri_gram_cnt != 0: print("ngram 3=%s" % tri_gram_cnt) if four_gram_cnt != 0: print("ngram 4=%s" % four_gram_cnt) if five_gram_cnt != 0: print("ngram 5=%s" % five_gram_cnt) result_iter = iter(result_buffer) print print_ngram(result_iter, 1, uni_gram_cnt) print print_ngram(result_iter, 2, bi_gram_cnt) print print_ngram(result_iter, 3, tri_gram_cnt) print print("\\end\\")
class HDFS_topic(object): def __init__(self,topic,user,server,port,web_port,base,hdfs_tmp): self.topic = topic self.username = user self.server = server self.port = port self.base = base self.path = ["%s/%s" % (base,topic)] self.hdfs_tmp = hdfs_tmp try: self.client=Client(server,port,effective_user=user) self.hdfsclient=hdfs.client.InsecureClient(\ "http://%s:%d" % (server,web_port),user=user) self.daylist=self.check() except: print "Base path %s does not contain valid structure" % (base) raise # # Check basic hdfs access and that directory format is appropiate # also builds datelist structure # def check(self): self.content=self.client.ls(self.path) ret=[] for item in self.content: (head,tail) = os.path.split(item['path']) try: parse(tail,yearfirst=True,dayfirst=True) if item['file_type'] == 'd': ret.append(tail) else: print("WARNING: %s is not a directory, skipping\n" % (item['path'])) except: print("WARNING: %s is not in date format, skipping\n" % (tail)) if len(ret) > 0: ret.sort(key=lambda x: datetime.strptime(x,"%Y-%m-%d")) return ret else: return false # # Give a date, check if that date is on the dirlist and return matching dir entry # def day_in_topic(self, date): for item in self.daylist: if parse(date) == parse(item): return item return False # # Check and validates date_from and date_to arguments # def check_date_range(self,date_from,date_to): if date_from: try: parse(date_from) except: raise ValueError("FATAL: start date (%s) invalid date format" % (date_from) ) if ( parse(date_from) < parse(self.daylist[0]) ) or ( parse(date_from) > parse(self.daylist[-1]) ): raise ValueError("FATAL: start date (%s) not in range (%s ---> %s)" % (date_from,self.daylist[0],self.daylist[-1])) else: ret_from=parse(date_from).strftime("%Y-%m-%d") while not self.day_in_topic(ret_from): print "WARNING: start date %s not in topic %s, trying next day" % (ret_from,self.topic) ret_from=datetime.strftime((parse(ret_from)+timedelta(days=1)), "%Y-%m-%d" ) ret_from=self.day_in_topic(ret_from) else: ret_from=self.daylist[0] if date_to: try: parse(date_to) except: raise ValueError("FATAL: end date (%s) invalid date format" % (date_to) ) if ( parse(date_to) < parse(self.daylist[0]) ) or ( parse(date_to) > parse(self.daylist[-1]) ): raise ValueError("FATAL: end date (%s) not in range (%s ---> %s)" % (date_to,self.daylist[0],self.daylist[-1])) else: ret_to=parse(date_to).strftime("%Y-%m-%d") else: ret_to=self.daylist[-1] if (parse(ret_from) > parse(ret_to) ): raise ValueError("FATAL: start date (%s) must be <= end date (%s)" % (ret_from,ret_to)) return (ret_from,ret_to) # # Traverses the list of valid directories and merges each day # def merge(self,date_from="",date_to=""): day="" try: (day,date_to)=self.check_date_range(date_from,date_to) except Exception as err: raise ValueError(err) print "INFO: Trying to merge %s from %s to %s\n" % (self.topic,day, date_to) while (parse(day) <= parse(date_to)): if self.day_in_topic(day): self.merge_day(day) else: print "WARNING: %s is not on %s, skipping\n" % (day,self.path) day=datetime.strftime((parse(day)+timedelta(days=1)), "%Y-%m-%d" ) while not self.day_in_topic(day) and parse(day) <= parse(date_to): print "WARNING: %s not found in %s, trying next day" % (day,self.topic) day=datetime.strftime((parse(day)+timedelta(days=1)), "%Y-%m-%d" ) day=self.day_in_topic(day) if not day: return return True # # Given a date, if there are files that are not .snappy download and remove them, then getmerge, and upload everything # def merge_day(self,date): print "INFO: processing ", 
date daytmp="%s/snappymerge-%s-tmp" % (self.hdfs_tmp,date) daypath=["%s/%s/%s/" % (self.base, self.topic,date)] #mergedfile="./%s-merged.snappy" % (date) mergedfile="./%s-merged.snappy" % (datetime.strftime(datetime.now(),"%Y-%d-%m.%f")) day_files=[x['path'] for x in self.client.ls(daypath)] print "INFO: DAYPATH: ", daypath try: os.remove(mergedfile) except: pass if len([ x for x in day_files if x.endswith('.snappy') ]) <= 1: print "WARNING: %s does not have enough files to getmerge, skipping" % (date) return if [ file for file in day_files if not file.endswith('.snappy') ]: print "WARNING: %s contains a non snappy file (%s), moving *snappy to %s getmerge there\n" % (daypath,file,daytmp) self.merge_with_move(daypath[0],daytmp,day_files,mergedfile) else: print "INFO: MERGING ", daypath[0] result=self.client.getmerge(daypath[0],mergedfile) print [x for x in result if not x['result']] print "INFO: DELETING original files in ", daypath[0] for file in day_files: print "INFO: Deleting original file ", file self.hdfsclient.delete(file) print "INFO: UPLOADING merged (%s) to %s" % (mergedfile,daypath[0]) self.hdfsclient.upload(daypath[0],mergedfile,overwrite=True) os.remove(mergedfile) return # # When there are files that do not contain .snappy suffix, merge with move, first moves everyting to an hdfs temp dir, merges there, and uploads # def merge_with_move(self,day_path,day_tmp,dayfiles,merged_file): self.hdfsclient.makedirs(day_tmp) print "INFO: MOVING files to ", day_tmp snap = [x for x in dayfiles if x.endswith(".snappy")] result=self.client.rename(snap,day_tmp) print [ x['path'] for x in result if not x['result']] print "INFO: MERGING files in ", day_tmp result=self.client.getmerge(day_tmp,merged_file) print [x['path'] for x in result if not x['result']] print "INFO: UPLOADING merged (%s) to %s" % (merged_file,day_path) self.hdfsclient.upload(day_path,merged_file,overwrite=True) os.remove(merged_file) print "INFO: Deleting files on ", day_tmp self.hdfsclient.delete(day_tmp,recursive=True) if __name__ == '__main__' : import argparse count=0 parser = argparse.ArgumentParser(description="Merge daily historical snappy files into one to save hdfs space") parser.add_argument('topic', help="Topic name relative to --base") parser.add_argument('--hdfs_user', help="HDFS user name (default: current user)",default=None) parser.add_argument('--hdfs_server', help="HDFS server name or ip (default: aquhmstsys022001.c022.digitalriverws.net)",default="aquhmstsys022001.c022.digitalriverws.net") parser.add_argument('--hdfs_port', help="HDFS server port number (default:8020)", type=int, default=8020) parser.add_argument('--hdfs_tmp', help="HDFS temporary dir to store files to be merged (default:/user/hduser/tmp)", default="/user/hduser/tmp") parser.add_argument('--web_port', help="HDFS server WEB port number (default:50070)", type=int, default=50070) parser.add_argument('--base', help="Alternate hdfs base path for topic (default:/user/aqueduct/flume)",default="/user/aqueduct/flume") parser.add_argument('--start', help="Start Date inclusive (default: from beginning)") parser.add_argument('--end', help="End Date inclusive (default: to end)") args = parser.parse_args() topic=HDFS_topic(topic=args.topic,user=args.hdfs_user,server=args.hdfs_server,port=args.hdfs_port,\ hdfs_tmp=args.hdfs_tmp,web_port=args.web_port,base=args.base) try: topic.merge(args.start,args.end) except Exception as err: print err exit
from snakebite.client import Client

client = Client('localhost', 54310)
for l in client.text(['/input/input.txt']):
    print l
#using requests and snakebite to access a Hadoop Windows Instance VM and put files into an HDFS folder.
import os
import requests
import json
from snakebite.client import Client

#hadoop connection
client1 = Client('localhost', 19000)

#Batch Twitter API connection
endpoint = "https://api.twitter.com/1.1/tweets/search/fullarchive/HistoricalTweets.json"
headers = {
    "Authorization": "Bearer xxxxxx",
    "Content-Type": "application/json"
}

# change your query here:
data = '{"query":"(AI OR Artificial Intelligence OR Machine Learning)", "fromDate": "201602020000", "toDate": "201902240000" , "maxResults":10}'

response = requests.post(endpoint, data=data, headers=headers).json()
file = json.dumps(response, indent=2)

# put downloaded data into local disk temporarily
with open('data.txt', 'w') as outfile:
    json.dump(file, outfile)

#for p in client1.mkdir(['/twitter_data_hist']):
#    print(p)
#for x in client1.ls(['/']):
#    print(x)

# this sends the file to hadoop
cmd = "hdfs dfs -put C:\\Users\\user\\Desktop\\data_603_Twitter_API\\data.txt /twitter_data_hist/twitterhistpy.json"
os.system(cmd)

# this removes the file from local directory
os.remove('data.txt')
def setUp(self):
    version = os.environ.get("HADOOP_PROTOCOL_VER", 7)
    self.cluster = self.__class__.cluster
    self.client = Client(self.cluster.host, self.cluster.port, int(version))
def get_locations(filename, name_host, name_port, **kwargs):
    client = Client(name_host, name_port, use_trash=False)
    files = list(client.ls([filename]))
    return [pair for file in files for pair in find(file, client, **kwargs)]
def setUp(self):
    self.custom_client = Client(self.cluster.host, self.cluster.port)
    self.custom_foobar_client = Client(host=self.cluster.host,
                                       port=self.cluster.port,
                                       effective_user='******')
import os
from StringIO import StringIO
from snakebite.client import Client

# provide the Internet Process Communication Port
INTERNET_PROCESS_COMMUNICATION_PORT = "..."
# provide the Name Node of Hadoop
NAME_NODE = "..."
# and get the client of HDFS
CLIENT_HDFS = Client(NAME_NODE, INTERNET_PROCESS_COMMUNICATION_PORT)

def read_hdfs_file(file_path_and_name):
    """Reads an hdfs file

    :param file_path_and_name: the path and the file to read
    """
    # 1. gets the hdfs file object
    for file_contents in CLIENT_HDFS.text([file_path_and_name]):
        file_unicode = file_contents.decode('unicode-escape')
        file_obj = StringIO(file_unicode)
        # 2. read and operate on top:
        file_lines = file_obj.readlines()
        for line in file_lines:
            # ...
            # do operations on the file
            pass
def __init__(self, sc, doclist, ngram_range = [1,1], vocab = None, stop_words = None, nmin = None, nmax = None, num_partitions = None, features_max = None, tokenizer = alpha_tokenizer, hashing = False, load_path = None, hdfs_namenode = None) : self._sc = sc self._ngram_range = ngram_range self._vocab = vocab self._stop_words = stop_words self._nmin = nmin self._nmax = nmax self._num_partitions = num_partitions self._doclist = doclist self._features_max = features_max if features_max is not None else 2**31 self._tokenizer = tokenizer # initialie the RDDs self._doc_rdd = None self._ngram_rdd = None self._vocab_rdd = None self._docvec_rdd = None self._vocab_map_rdd = None # dictionary of RDDs self.rdds = {} # initialize other properties self._nfeatures = None self._hashing = hashing # make the vocabulary a set if it isn't one already if type(vocab) is not set and vocab is not None: try: self._vocab = set(vocab) except TypeError : raise TypeError("Vocabulary must be an iterable like a list, set, etc.") if load_path is not None : if load_path[:4] != 'hdfs' : for rdd_name in os.listdir(load_path) : if rdd_name[-3:] == 'rdd' : self.rdds[rdd_name] = sc.pickleFile(load_path + '/' + rdd_name) # we're dealing with HDFS else : try : from snakebite.client import Client except ImportError : raise ImportError("package snakebite is required for working with HDFS: pip install snakebite") if hdfs_namenode is None : # get the hadoop configuration files from user's environment and extract namenode import xml hadoop_conf = '%s/core-site.xml'%os.environ['HADOOP_CONF_DIR'] tree = xml.etree.ElementTree.parse(hadoop_conf) for prop in tree.findall('property') : if prop.find('name').text == 'fs.defaultFS' : dummy, hdfs_namenode, hdfs_port = prop.find('value').text.split(':') hdfs_namenode = hdfs_namenode[2:] break client = Client(hdfs_namenode, int(hdfs_port)) for rdd_path_dict in client.ls([load_path[7:]]) : rdd_name = rdd_path_dict['path'].split('/')[-1] if rdd_name[-3:] == 'rdd': self.rdds[rdd_name] = sc.pickleFile(load_path + '/' + rdd_name) print 'Loaded %d RDDs: '%(len(self.rdds)) for rdd in self.rdds.keys() : print rdd # make the vital properties dictionary for pickling self.properties = {'ngram_range': ngram_range, 'stop_words': stop_words, 'nmin': nmin, 'nmax': nmax, 'num_partitions': num_partitions, 'doclist': doclist, 'features_max': features_max, 'hashing': hashing, }
from snakebite.client import Client

client = Client('localhost', 9000)
for f in client.copyToLocal(['/input/input.txt'], '/tmp'):
    print f
from snakebite.client import Client

client = Client('localhost', 54310)
for p in client.mkdir(['/foo/bar', '/input'], create_parent=True):
    print p
def crfalign(sc, inputFilename, outputDirectory, limit=LIMIT, location='hdfs', outputFormat="text", partitions=None, deleteFirst=True): crfConfigDir = os.path.join(os.path.dirname(__file__), "data/config") def cpath(n): return os.path.join(crfConfigDir, n) smEyeColor = HybridJaccard(ref_path=cpath("eyeColor_reference_wiki.txt"), config_path=cpath("eyeColor_config.txt")) smHairColor = HybridJaccard(ref_path=cpath("hairColor_reference_wiki.txt"), config_path=cpath("hairColor_config.txt")) print smEyeColor, smHairColor if location == "hdfs": if deleteFirst: namenode = "memex-nn1" port = 8020 client = Client(namenode, 8020, use_trash=True) try: for deleted in client.delete([outputDirectory], recurse=True): print deleted except FileNotFoundException as e: pass # hypothesis1: data fetched this way prompts the lzo compression error # hypothesis2: but it doesn't matter, error is just a warning if partitions: if limit: rdd_crfl = sc.parallelize(rdd_crfl.take(limit)) rdd_crfl = rdd_crfl.repartition(partitions) else: print inputFilename rdd_crfl = sc.textFile(inputFilename, minPartitions=partitions) else: rdd_crfl = sc.textFile(inputFilename) rdd_crfl.setName('rdd_crfl') # rdd_crfl.persist() print "beginning: %s partitions" % rdd_crfl.getNumPartitions() # "value-only" RDD, not a pair RDD # but we have the URI in the -3 position # and the index in the -2 position rdd_withuri = rdd_crfl.map(lambda x: reconstructTuple(x)) # Note: groupByKey returns iterable, not data; so no point in printing rdd_grouped = rdd_withuri.groupByKey() # sort the vectors by index (within key groups) rdd_sorted = rdd_grouped.mapValues(lambda x: [l[1:] for l in sorted(x, key=lambda r: int(r[0]))]) # find all contiguous spans of marked-up tokens # returns 0 or more dicts per URI key rdd_spans = rdd_sorted.mapValues(lambda x: computeSpans(x, indexed=True)) # flatten to (URI, single dict) on each line rdd_flat = rdd_spans.flatMapValues(lambda x: list(x)) # rdd_flat = rdd_flat.coalesce(rdd_flat.getNumPartitions() / 3) # # map any eyeColor spans using smEyeColor, hairType spans using smHairColor # rdd_aligned = rdd_flat.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEyeColor, "hairType": smHairColor})) rdd_aligned = rdd_flat.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEyeColor.findBestMatch, "hairType": smHairColor.findBestMatch})) # rdd_aligned = rdd_flat.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": fakeFindBestMatch, "hairType": fakeFindBestMatch})) # rdd_aligned = rdd_flat.mapValues(lambda x: alignToControlledVocab(x, {})) # rdd_aligned = rdd_spans # rdd_final = rdd_crfl rdd_final = rdd_aligned print outputFormat if outputFormat == "sequence": rdd_final.saveAsSequenceFile(outputDirectory) elif outputFormat == "text": print "saving to %s" % outputDirectory rdd_final.saveAsTextFile(outputDirectory) else: raise RuntimeError("Unrecognized output format: %s" % outputFormat)