def __init__(self, input_uri, dst_uri):
    super(CopyPertFlow, self).__init__()
    #self.input = core.PertResource(self, input_uri, is_generated=False, check_exists=True)
    #self.AddInput('input', input)  # every flow must have an input... this is a dummy input that will trigger the generation of the proto record
    self.input_uri = input_uri
    ok, scheme, path, error = py_pert.ParseUri(self.input_uri)
    CHECK(ok, error)
    input_basename = os.path.basename(path)
    self.AddOutput('output',
                   core.PertResource(self, "%s/%s" % (dst_uri, input_basename)))
    return
def Run(self):
    print 'pid: %s' % os.getpid()
    print 'id(py_pert): %s' % id(py_pert)
    ok, scheme, path, error = py_pert.ParseUri(self.uri)
    CHECK(ok, error)
    print 'path: %s' % path
    print 'exists: %s' % py_pert.Exists(self.uri)
    if py_pert.Exists(self.uri):
        print 'num shards: %s' % py_pert.GetNumShards(self.uri)
    super(MyOperation, self).Run()
    return True
def __CreateValidUriCacheString(self, uris_to_cache):
    # TODO(kheath): If the hadoop pipes -files flag is passed a uri with fewer
    # than three slashes after the scheme, it fails. Rewrite uris here to work
    # around that bug. Remove this hack once the bug is fixed in hadoop.
    clean_cache_uris = []
    for uri in uris_to_cache:
        ok, scheme, path, error = py_pert.ParseUri(uri)
        CHECK(ok)
        CHECK_EQ(scheme, 'maprfs')
        CHECK(py_pert.Exists(uri), 'uri not there: %s' % uri)
        clean_cache_uris.append(py_pert.CanonicalizeUri(uri))
    uri_cache_string = ','.join(clean_cache_uris)
    return uri_cache_string
def UriToNfsPath(uri):
    """Converts a uri to the equivalent local path using the NFS local mount point."""
    ok, scheme, path, error = py_pert.ParseUri(uri)
    assert ok
    nfs_path = None
    if scheme == 'local':
        nfs_path = path
    elif scheme == 'maprfs':
        nfs_path = '%s/%s' % (nfs_base, path)
    else:
        LOG(FATAL, 'unexpected scheme: %s' % scheme)
    return nfs_path
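# Illustrative sketch only: `nfs_base` is assumed to be the cluster's NFS mount
# point (e.g. '/mapr/my.cluster.com', a hypothetical value). A maprfs uri then
# maps to a path under that mount, while a local uri passes through unchanged.
def _ExampleUriToNfsPath():
    # Hypothetical uris; the exact result depends on how py_pert.ParseUri splits the path.
    print UriToNfsPath('local:///tmp/scratch/foo.pert')   # roughly '/tmp/scratch/foo.pert'
    print UriToNfsPath('maprfs:///data/deluge/foo.pert')  # roughly nfs_base + '/data/deluge/foo.pert'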
def __init__(self, pipes_binary, input_path, output_path, num_map_jobs,
             num_reduce_jobs, output_sorted=True, parameters=None,
             uris_to_cache=None, libjars=None):
    self.pipes_binary = pipes_binary
    self.input_path = input_path
    self.output_path = output_path
    self.num_map_jobs = int(num_map_jobs)
    assert self.num_map_jobs
    self.num_reduce_jobs = num_reduce_jobs
    # Use None defaults to avoid sharing mutable default arguments across calls.
    self.parameters = parameters if parameters is not None else {}
    self.libjars = EnsureList(libjars if libjars is not None else [])
    # uris_to_cache can be either a single string or a list of strings;
    # convert to list format here.
    self.uris_to_cache = EnsureList(uris_to_cache if uris_to_cache is not None else [])
    for uri in self.uris_to_cache:
        ok, scheme, path, error = py_pert.ParseUri(uri)
        CHECK(ok)
        CHECK_EQ(scheme, 'maprfs')
    # Set snappy compression for the output if no codec was selected.
    if 'pert.recordwriter.compression_codec_name' not in self.parameters:
        self.parameters['pert.recordwriter.compression_codec_name'] = 'snappy'
    # Set the memcmp comparator if the output should be sorted.
    if output_sorted:
        if 'pert.recordwriter.comparator_name' in self.parameters:
            logging.fatal('you requested sorted output but already set a '
                          'comparator name: %s',
                          self.parameters['pert.recordwriter.comparator_name'])
        self.parameters['pert.recordwriter.comparator_name'] = 'memcmp'
    else:
        self.parameters['pert.recordwriter.comparator_name'] = 'none'
    return
def GetCachedTideImageGraph(image_graph_uri, tide_uri):
    # Build a cache key from each input uri plus its mtime and size, so the
    # pickled graph is rebuilt whenever either input file changes.
    hash_str = ''
    for uri in [image_graph_uri, tide_uri]:
        ok, scheme, path, error = py_pert.ParseUri(uri)
        CHECK(ok, error)
        hash_str += (uri + str(os.path.getmtime(path)) + str(os.path.getsize(path)))
    hash_val = hash(hash_str)
    cache_path = './tide_image_graph.%s.pickle' % (hash_val)
    tide_image_graph = None
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            tide_image_graph = pickle.load(f)
    else:
        tide_image_graph = TideImageGraph(image_graph_uri, tide_uri)
        with open(cache_path, 'wb') as f:
            pickle.dump(tide_image_graph, f, protocol=2)
    return tide_image_graph
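# Illustrative sketch only (hypothetical uris): repeated calls with unchanged
# input files reuse the pickle cache written above; a change in either file's
# mtime or size produces a new cache key and triggers a rebuild of the graph.
def _ExampleCachedTideImageGraph():
    graph = GetCachedTideImageGraph('local:///tmp/image_graph.pert',
                                    'local:///tmp/tide.pert')
    return graph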
def Run(self):
    print 'pid: %s' % os.getpid()
    print 'id(py_pert): %s' % id(py_pert)
    ok, scheme, path, error = py_pert.ParseUri(self.uri)
    CHECK(ok, error)
    print 'path: %s' % path
    print 'exists: %s' % py_pert.Exists(self.uri)
    if py_pert.Exists(self.uri):
        print 'num shards: %s' % py_pert.GetNumShards(self.uri)
    reader = py_pert.StringTableReader()
    print 'about to open reader'
    reader.Open(self.uri)
    print 'about to use reader'
    count = 0
    for k, v in reader:
        print k
        count += 1
        if count > 5:
            break
    return True
def MakeRunCommand(self):
    # Check preconditions.
    assert self.input_path
    assert self.output_path
    assert self.pipes_binary
    CHECK_NE(self.num_map_jobs, None)
    CHECK_NE(self.num_reduce_jobs, None)
    # TODO(kheath): pipes fails with no helpful error message when the scheme
    # prefix is used. Is this expected? Workaround for now is to strip off the
    # scheme prefix.
    scheme_free_input_paths = []
    for orig_input_path in self.input_path.split(','):
        ok, scheme, path, error = py_pert.ParseUri(orig_input_path)
        CHECK(ok, error)
        CHECK_EQ(scheme, "maprfs")
        scheme_free_input_paths.append(path)
    self.input_path = ','.join(scheme_free_input_paths)
    ok, scheme, self.output_path, error = py_pert.ParseUri(self.output_path)
    CHECK(ok, error)
    CHECK_EQ(scheme, "maprfs")
    remote_binary = '/data/deluge/bin/%s' % (self.pipes_binary)
    remote_binary_uri = 'maprfs://' + remote_binary
    CopyUri('local://' + self.pipes_binary, remote_binary_uri)
    # If the profiler is turned on and a profiler timeout is set, disable the
    # failure redundancy so we get profile results faster.
    if ('profiler' in self.parameters and self.parameters['profiler'] == 'on'
            and 'profiler_timeout_sec' in self.parameters):
        self.parameters['mapred.map.max.attempts'] = '1'
        self.parameters['mapred.reduce.max.attempts'] = '1'
    if 'mapred.task.timeout' not in self.parameters:
        self.parameters['mapred.task.timeout'] = '1200000'  # = 20 min
    self.parameters['mapred.map.tasks.speculative.execution'] = 'true'
    self.parameters['mapred.reduce.tasks.speculative.execution'] = 'true'
    self.parameters['mapred.compress.map.output'] = 'true'
    self.parameters['mapred.map.output.compression.codec'] = 'org.apache.hadoop.io.compress.SnappyCodec'
    # These are required for c++ code that uses maprfs or hdfs.
    # TODO(kheath): Setting CLASSPATH via $(hadoop classpath) used to work but is
    # broken now; it is not clear whether the 'hadoop classpath' call happens
    # remotely or locally. As a workaround, the classpath was generated manually
    # on a remote node and is hard-coded below for now. ;-)
    #self.parameters['mapred.map.child.env'] = 'CLASSPATH=$CLASSPATH:$(hadoop classpath)'
    #self.parameters['mapred.reduce.child.env'] = 'CLASSPATH=$CLASSPATH:$(hadoop classpath)'
    classpath_stuff = '/opt/mapr/hadoop/hadoop-0.20.2/bin/../conf:/usr/lib/jvm/java-6-sun/lib/tools.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/..:/opt/mapr/hadoop/hadoop-0.20.2/bin/../hadoop*core*.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/amazon-s3.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/asm-3.2.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/aspectjrt-1.6.5.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/aspectjtools-1.6.5.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/aws-java-sdk-1.3.26.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-cli-1.2.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-codec-1.5.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-configuration-1.8.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-daemon-1.0.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-el-1.0.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-httpclient-3.0.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-httpclient-3.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-lang-2.6.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-logging-1.0.4.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-logging-1.1.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-logging-api-1.0.4.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-math-2.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-net-1.4.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-net-3.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/core-3.1.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/emr-metrics-1.0.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/eval-0.5.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/gson-1.4.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/guava-13.0.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/hadoop-0.20.2-dev-capacity-scheduler.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/hadoop-0.20.2-dev-core.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/hadoop-0.20.2-dev-fairscheduler.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/hsqldb-1.8.0.10.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/httpclient-4.1.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/httpcore-4.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jackson-core-asl-1.5.2.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jackson-mapper-asl-1.5.2.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jasper-compiler-5.5.12.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jasper-runtime-5.5.12.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jersey-core-1.8.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jersey-json-1.8.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jersey-server-1.8.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jets3t-0.6.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jetty-6.1.14.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jetty-servlet-tester-6.1.14.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jetty-util-6.1.14.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/junit-4.5.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/kfs-0.2.2.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/log4j-1.2.15.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/logging-0.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/maprfs-0.20.2-2.1.2.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/maprfs-jni-0.20.2-2.1.2.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/mockito-all-1.8.2.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/mockito-all-1.8.5.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/mysql-connector-java-5.0.8-bin.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/oro-2.0.8.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/protobuf-java-2.4.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/servlet-api-2.5-6.1.14.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/slf4j-api-1.4.3.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/slf4j-log4j12-1.4.3.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/xmlenc-0.52.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/zookeeper-3.3.6.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jsp-2.1/jsp-2.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jsp-2.1/jsp-api-2.1.jar'
    self.parameters['mapred.map.child.env'] = 'CLASSPATH=$CLASSPATH:%s' % (classpath_stuff)
    self.parameters['mapred.reduce.child.env'] = 'CLASSPATH=$CLASSPATH:%s' % (classpath_stuff)
    binary_name = os.path.basename(self.pipes_binary)
    job_name = '%s_%d' % (binary_name, time.time())
    if self.uris_to_cache:
        cache_job = deluge_pb2.MaprDistributedCacheJob()
        cache_job.name = job_name
        cache_job.uris.extend(self.uris_to_cache)
        rack_topologies = GetRackTopology()
        for topology, ips in rack_topologies.iteritems():
            new_rack = cache_job.racks.add()
            new_rack.topology = topology
            new_rack.member_ips.extend(ips)
        self.parameters['mapr_distributed_cache_job'] = Base64EncodeProto(cache_job)
    libjars_uris = []
    for local_libjar_path in self.libjars:
        remote_jar = '/data/deluge/jar/%s' % (local_libjar_path)
        remote_jar_uri = 'maprfs://' + remote_jar
        print 'uploading jar: %s' % remote_jar_uri
        CopyUri('local://' + local_libjar_path, remote_jar_uri)
        libjars_uris.append(remote_jar_uri)
    cmd = 'hadoop pipes '
    if libjars_uris:
        libjars_string = self.__CreateValidUriCacheString(libjars_uris)
        cmd += '-libjars %s ' % (libjars_string)
    cmd += '-D mapred.job.name=%s ' % (job_name)
    cmd += '-D mapred.job.reuse.jvm.num.tasks=10 '  # reuse a JVM at most N times
    cmd += '-D mapred.map.tasks=%d ' % (self.num_map_jobs)
    for k, v in self.parameters.iteritems():
        if not isinstance(k, basestring):
            LOG(FATAL, 'expected a string but got: %s' % k)
        cmd += '-D %s=%s ' % (k, str(v))
    cmd += '-program %s ' % (remote_binary_uri)
    cmd += '-input %s ' % (self.input_path)
    cmd += '-output %s ' % (self.output_path)
    cmd += '-reduces %d ' % (self.num_reduce_jobs)
    return cmd
def MakeRunCommand(self):
    # Check preconditions.
    assert self.input_path
    assert self.output_path
    assert self.pipes_binary
    CHECK_NE(self.num_map_jobs, None)
    CHECK_NE(self.num_reduce_jobs, None)
    # TODO(kheath): pipes fails with no helpful error message when the scheme
    # prefix is used. Is this expected? Workaround for now is to strip off the
    # scheme prefix.
    scheme_free_input_paths = []
    for orig_input_path in self.input_path.split(','):
        ok, scheme, path, error = py_pert.ParseUri(orig_input_path)
        CHECK(ok, error)
        CHECK_EQ(scheme, "maprfs")
        scheme_free_input_paths.append(path)
    self.input_path = ','.join(scheme_free_input_paths)
    ok, scheme, self.output_path, error = py_pert.ParseUri(self.output_path)
    CHECK(ok, error)
    CHECK_EQ(scheme, "maprfs")
    remote_binary = '/data/deluge/bin/%s' % (self.pipes_binary)
    remote_binary_uri = 'maprfs://' + remote_binary
    CopyUri('local://' + self.pipes_binary, remote_binary_uri)
    # If the profiler is turned on and a profiler timeout is set, disable the
    # failure redundancy so we get profile results faster.
    if ('profiler' in self.parameters and self.parameters['profiler'] == 'on'
            and 'profiler_timeout_sec' in self.parameters):
        self.parameters['mapred.map.max.attempts'] = '1'
        self.parameters['mapred.reduce.max.attempts'] = '1'
    if 'mapred.task.timeout' not in self.parameters:
        self.parameters['mapred.task.timeout'] = '1200000'  # = 20 min
    self.parameters['mapred.map.tasks.speculative.execution'] = 'true'
    self.parameters['mapred.reduce.tasks.speculative.execution'] = 'true'
    self.parameters['mapred.compress.map.output'] = 'true'
    self.parameters['mapred.map.output.compression.codec'] = 'org.apache.hadoop.io.compress.SnappyCodec'
    # These are required for c++ code that uses maprfs or hdfs.
    self.parameters['mapred.map.child.env'] = 'CLASSPATH=$CLASSPATH:$(hadoop classpath)'
    self.parameters['mapred.reduce.child.env'] = 'CLASSPATH=$CLASSPATH:$(hadoop classpath)'
    binary_name = os.path.basename(self.pipes_binary)
    job_name = '%s_%d' % (binary_name, time.time())
    libjars_uris = []
    for local_libjar_path in self.libjars:
        remote_jar = '/data/deluge/jar/%s' % (local_libjar_path)
        remote_jar_uri = 'maprfs://' + remote_jar
        print 'uploading jar: %s' % remote_jar_uri
        CopyUri('local://' + local_libjar_path, remote_jar_uri)
        libjars_uris.append(remote_jar_uri)
    cmd = 'hadoop pipes '
    if self.uris_to_cache:
        uri_cache_string = self.__CreateValidUriCacheString(self.uris_to_cache)
        # These get added to the hadoop distributed cache.
        cmd += '-files %s ' % (uri_cache_string)
    if libjars_uris:
        libjars_string = self.__CreateValidUriCacheString(libjars_uris)
        cmd += '-libjars %s ' % (libjars_string)
    cmd += '-D mapred.job.name=%s ' % (job_name)
    cmd += '-D mapred.job.reuse.jvm.num.tasks=10 '  # reuse a JVM at most N times
    cmd += '-D mapred.map.tasks=%d ' % (self.num_map_jobs)
    for k, v in self.parameters.iteritems():
        if not isinstance(k, basestring):
            LOG(FATAL, 'expected a string but got: %s' % k)
        cmd += '-D %s=%s ' % (k, str(v))
    cmd += '-program %s ' % (remote_binary_uri)
    cmd += '-input %s ' % (self.input_path)
    cmd += '-output %s ' % (self.output_path)
    cmd += '-reduces %d ' % (self.num_reduce_jobs)
    return cmd
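# Illustrative usage sketch only. `PipesJob` is a hypothetical name standing in
# for the class that defines the constructor and MakeRunCommand above, and the
# binary name and uris below are placeholders, not values from this module.
def _ExampleRunPipesJob():
    import subprocess
    job = PipesJob(pipes_binary='example_mapred_binary',  # hypothetical binary
                   input_path='maprfs:///data/example/input.pert',
                   output_path='maprfs:///data/example/output.pert',
                   num_map_jobs=10,
                   num_reduce_jobs=4,
                   parameters={'profiler': 'off'})
    # MakeRunCommand uploads the binary and jars, then returns the full
    # 'hadoop pipes ...' command line, which can be run with a shell.
    cmd = job.MakeRunCommand()
    print 'running: %s' % cmd
    subprocess.check_call(cmd, shell=True)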