Example #1
File: util.py Project: heathkh/iwct
  def __init__(self, input_uri, dst_uri):
    super(CopyPertFlow, self).__init__()
    #self.input = core.PertResource(self, input_uri, is_generated=False, check_exists=True)
    #self.AddInput('input', input)  # every flow must have an input... this is a dummy input that will trigger the generation of the proto record
    self.input_uri = input_uri
    ok, scheme, path, error = py_pert.ParseUri(self.input_uri)
    input_basename = os.path.basename(path)
    self.AddOutput('output', core.PertResource(self, '%s/%s' % (dst_uri, input_basename)))
    return
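Every example below unpacks py_pert.ParseUri into (ok, scheme, path, error). The real function comes from the py_pert extension module; the following stand-in is only a sketch of that return convention to make the examples easier to follow, not the project's implementation.

def ParseUriSketch(uri):
    # Illustrative stand-in for py_pert.ParseUri: split 'scheme://path' into its
    # parts and report failure through the ok/error slots instead of raising.
    if '://' not in uri:
        return False, '', '', 'expected scheme://path but got: %s' % uri
    scheme, _, path = uri.partition('://')
    return True, scheme, path, ''

ok, scheme, path, error = ParseUriSketch('maprfs:///data/deluge/bin/my_binary')
assert ok and scheme == 'maprfs' and path == '/data/deluge/bin/my_binary'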
Example #2
    def Run(self):

        print 'pid: %s' % os.getpid()
        print 'id(py_pert): %s' % id(py_pert)
        ok, scheme, path, error = py_pert.ParseUri(self.uri)
        print 'path: %s' % path
        print 'exists: %s' % py_pert.Exists(self.uri)
        if py_pert.Exists(self.uri):
            print 'num shards: %s' % py_pert.GetNumShards(self.uri)

        super(MyOperation, self).Run()

        return True
Example #3
    def __CreateValidUriCacheString(self, uris_to_cache):
        #TODO(kheath): if the hadoop pipes -files flag is passed a uri with fewer than three slashes after the scheme, it fails. This rewrites uris to work around that bug.
        # Remove this hack once the bug is fixed in Hadoop.
        clean_cache_uris = []
        for uri in uris_to_cache:
            ok, scheme, path, error = py_pert.ParseUri(uri)
            CHECK(ok)
            CHECK_EQ(scheme, 'maprfs')
            CHECK(py_pert.Exists(uri), 'uri not there: %s' % uri)
            clean_cache_uris.append(py_pert.CanonicalizeUri(uri))

        uri_cache_string = ','.join(clean_cache_uris)
        return uri_cache_string
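The TODO above works around hadoop pipes rejecting maprfs uris that have only two slashes after the scheme. A minimal sketch of that rewrite, assuming py_pert.CanonicalizeUri simply restores the leading slash on the path (the real canonicalization may do more):

def CanonicalizeMaprfsUriSketch(uri):
    # Hypothetical illustration of the rewrite described above: make sure the uri
    # reads maprfs:///path (three slashes) so 'hadoop pipes -files' accepts it.
    scheme, _, path = uri.partition('://')
    assert scheme == 'maprfs'
    return 'maprfs:///' + path.lstrip('/')

assert CanonicalizeMaprfsUriSketch('maprfs://data/foo.jar') == 'maprfs:///data/foo.jar'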
Example #4
def UriToNfsPath(uri):
    """
  Converts a uri to the equivalent local path using the NFS local mount point. 
  """
    ok, scheme, path, error = py_pert.ParseUri(uri)
    assert (ok)
    nfs_path = None
    if scheme == 'local':
        nfs_path = path
    elif scheme == 'maprfs':
        nfs_path = '%s/%s' % (nfs_base, path)
    else:
        LOG(FATAL, 'unexpected scheme: %s' % scheme)

    return nfs_path
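A hedged usage sketch: nfs_base is defined elsewhere in the module, so the value below is a placeholder (MapR clusters are commonly NFS-mounted under /mapr/<cluster-name>), and the exact output depends on how py_pert.ParseUri normalizes the path component.

nfs_base = '/mapr/my.cluster.com'  # hypothetical mount point, for illustration only

print UriToNfsPath('local:///tmp/scratch.pert')   # the local path itself
print UriToNfsPath('maprfs:///data/photos.pert')  # nfs_base joined with the maprfs path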
Example #5
    def __init__(self,
                 pipes_binary,
                 input_path,
                 output_path,
                 num_map_jobs,
                 num_reduce_jobs,
                 output_sorted=True,
                 parameters=None,
                 uris_to_cache=None,
                 libjars=None):
        # Avoid mutable default arguments: self.parameters is mutated below, so a
        # shared default dict would leak settings across instances.
        if parameters is None:
            parameters = {}
        if uris_to_cache is None:
            uris_to_cache = []
        if libjars is None:
            libjars = []
        self.pipes_binary = pipes_binary
        self.input_path = input_path
        self.output_path = output_path
        self.num_map_jobs = int(num_map_jobs)
        assert (self.num_map_jobs)
        self.num_reduce_jobs = num_reduce_jobs
        self.parameters = parameters
        self.libjars = EnsureList(libjars)

        # uris_to_cache can be either a single string or a list of strings
        # convert to list format here
        self.uris_to_cache = EnsureList(uris_to_cache)

        for uri in self.uris_to_cache:
            ok, scheme, path, error = py_pert.ParseUri(uri)
            CHECK(ok)
            CHECK_EQ(scheme, 'maprfs')

        # set snappy compression for output if none selected
        if 'pert.recordwriter.compression_codec_name' not in self.parameters:
            self.parameters[
                'pert.recordwriter.compression_codec_name'] = 'snappy'

        # set memcmp comparator if output should be sorted
        if output_sorted:
            if 'pert.recordwriter.comparator_name' in self.parameters:
                logging.fatal(
                    'you requested sorted output but already set a comparator name: %s',
                    self.parameters['pert.recordwriter.comparator_name'])
            self.parameters['pert.recordwriter.comparator_name'] = 'memcmp'
        else:
            self.parameters['pert.recordwriter.comparator_name'] = 'none'

        return
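EnsureList is referenced above but not shown in these examples; a minimal sketch consistent with the comment about accepting either a single string or a list might look like this (the project's actual helper may differ):

def EnsureList(value):
    # Sketch only: wrap a lone string in a list, copy other iterables, and treat
    # None as an empty list so callers can omit the argument entirely.
    if value is None:
        return []
    if isinstance(value, basestring):
        return [value]
    return list(value)

assert EnsureList('maprfs:///data/a.pert') == ['maprfs:///data/a.pert']
assert EnsureList(['a.jar', 'b.jar']) == ['a.jar', 'b.jar']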
Example #6
def GetCachedTideImageGraph(image_graph_uri, tide_uri):
    hash_str = ''
    for uri in [image_graph_uri, tide_uri]:
        ok, scheme, path, error = py_pert.ParseUri(uri)
        hash_str += (uri + str(os.path.getmtime(path)) +
                     str(os.path.getsize(path)))
    hash_val = hash(hash_str)
    cache_path = './tide_image_graph.%s.pickle' % (hash_val)

    tide_image_graph = None
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            tide_image_graph = pickle.load(f)
    else:
        tide_image_graph = TideImageGraph(image_graph_uri, tide_uri)
        with open(cache_path, 'wb') as f:
            pickle.dump(tide_image_graph, f, protocol=2)
    return tide_image_graph
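The cache key hashes each uri together with the backing file's mtime and size, so regenerating an input invalidates the pickle automatically. A usage sketch with hypothetical local uris:

# Hypothetical uris, for illustration only.
graph = GetCachedTideImageGraph('local:///data/image_graph.pert',
                                'local:///data/tide.pert')
# The first call builds a TideImageGraph and writes ./tide_image_graph.<hash>.pickle;
# later calls with unchanged inputs load the pickle instead of rebuilding.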
Example #7
    def Run(self):
        print 'pid: %s' % os.getpid()
        print 'id(py_pert): %s' % id(py_pert)
        ok, scheme, path, error = py_pert.ParseUri(self.uri)
        print 'path: %s' % path
        print 'exists: %s' % py_pert.Exists(self.uri)
        if py_pert.Exists(self.uri):
            print 'num shards: %s' % py_pert.GetNumShards(self.uri)
            reader = py_pert.StringTableReader()
            print 'about to open reader'
            reader.Open(self.uri)
            print 'about to use reader'
            count = 0
            for k, v in reader:
                print k
                count += 1
                if count > 5:
                    break

        return True
Example #8
    def MakeRunCommand(self):
        # check preconditions
        assert (self.input_path)
        assert (self.output_path)
        assert (self.pipes_binary)
        CHECK_NE(self.num_map_jobs, None)
        CHECK_NE(self.num_reduce_jobs, None)

        # TODO(kheath): pipes fails with no helpful error message when scheme prefix
        # is used.  Is this expected?
        # The workaround for now is to strip off the scheme prefix.
        scheme_free_input_paths = []
        for orig_input_path in self.input_path.split(','):
            ok, scheme, path, error = py_pert.ParseUri(orig_input_path)
            CHECK(ok, error)
            CHECK_EQ(scheme, "maprfs")
            scheme_free_input_paths.append(path)
        self.input_path = ','.join(scheme_free_input_paths)

        ok, scheme, self.output_path, error = py_pert.ParseUri(
            self.output_path)
        CHECK(ok, error)
        CHECK_EQ(scheme, "maprfs")

        remote_binary = '/data/deluge/bin/%s' % (self.pipes_binary)
        remote_binary_uri = 'maprfs://' + remote_binary
        CopyUri('local://' + self.pipes_binary, remote_binary_uri)
        # If the profiler is on and a profiler timeout is set, disable retry
        # redundancy so we get profiling results sooner.
        if 'profiler' in self.parameters and self.parameters[
                'profiler'] == 'on' and 'profiler_timeout_sec' in self.parameters:
            self.parameters['mapred.map.max.attempts'] = '1'
            self.parameters['mapred.reduce.max.attempts'] = '1'

        if 'mapred.task.timeout' not in self.parameters:
            self.parameters['mapred.task.timeout'] = '1200000'  #  = 20 min

        self.parameters['mapred.map.tasks.speculative.execution'] = 'true'
        self.parameters['mapred.reduce.tasks.speculative.execution'] = 'true'
        self.parameters['mapred.compress.map.output'] = 'true'
        self.parameters[
            'mapred.map.output.compression.codec'] = 'org.apache.hadoop.io.compress.SnappyCodec'

        # These are required for c++ code that uses maprfs or hdfs

        # TODO(kheath): Setting CLASSPATH via $(hadoop classpath) used to work but is
        # broken now; it is unclear whether the 'hadoop classpath' call runs remotely
        # or locally. As a workaround, it was run manually on the cluster and the
        # result hard-coded below for now.
        #self.parameters['mapred.map.child.env'] = 'CLASSPATH=$CLASSPATH:$(hadoop classpath)'
        #self.parameters['mapred.reduce.child.env'] = 'CLASSPATH=$CLASSPATH:$(hadoop classpath)'

        classpath_stuff = (
            '/opt/mapr/hadoop/hadoop-0.20.2/bin/../conf:/usr/lib/jvm/java-6-sun/lib/tools.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/..:/opt/mapr/hadoop/hadoop-0.20.2/bin/../hadoop*core*.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/amazon-s3.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/asm-3.2.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/aspectjrt-1.6.5.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/aspectjtools-1.6.5.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/aws-java-sdk-1.3.26.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-cli-1.2.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-codec-1.5.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-configuration-1.8.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-daemon-1.0.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-el-1.0.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-httpclient-3.0.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-httpclient-3.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-lang-2.6.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-logging-1.0.4.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-logging-1.1.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-logging-api-1.0.4.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-math-2.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-net-1.4.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/commons-net-3.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/core-3.1.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/emr-metrics-1.0.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/eval-0.5.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/gson-1.4.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/guava-13.0.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/hadoop-0.20.2-dev-capacity-scheduler.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/hadoop-0.20.2-dev-core.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/hadoop-0.20.2-dev-fairscheduler.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/hsqldb-1.8.0.10.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/httpclient-4.1.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/httpcore-4.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jackson-core-asl-1.5.2.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jackson-mapper-asl-1.5.2.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jasper-compiler-5.5.12.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jasper-runtime-5.5.12.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jersey-core-1.8.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jersey-json-1.8.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jersey-server-1.8.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jets3t-0.6.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jetty-6.1.14.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jetty-servlet-tester-6.1.14.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jetty-util-6.1.14.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/junit-4.5.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/kfs-0.2.2.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/log4j-1.2.15.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/logging-0.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/maprfs-0.20.2-2.1.2.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/maprfs-jni-0.20.2-2.1.2.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/mockito-all-1.8.2.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/mockito-all-1.8.5.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/mysql-connector-java-5.0.8-bin.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/oro-2.0.8.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/protobuf-java-2.4.1.jar:/opt/mapr/'
            'hadoop/hadoop-0.20.2/bin/../lib/servlet-api-2.5-6.1.14.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/slf4j-api-1.4.3.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/slf4j-log4j12-1.4.3.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/xmlenc-0.52.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/zookeeper-3.3.6.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jsp-2.1/jsp-2.1.jar:/opt/mapr/hadoop/hadoop-0.20.2/bin/../lib/jsp-2.1/jsp-api-2.1.jar')
        self.parameters['mapred.map.child.env'] = 'CLASSPATH=$CLASSPATH:%s' % (
            classpath_stuff)
        self.parameters[
            'mapred.reduce.child.env'] = 'CLASSPATH=$CLASSPATH:%s' % (
                classpath_stuff)

        binary_name = os.path.basename(self.pipes_binary)
        job_name = '%s_%d' % (binary_name, time.time())

        if self.uris_to_cache:
            cache_job = deluge_pb2.MaprDistributedCacheJob()
            cache_job.name = job_name
            cache_job.uris.extend(self.uris_to_cache)

            rack_topology = GetRackTopology()
            for topology, ips in rack_topology.iteritems():
                new_rack = cache_job.racks.add()
                new_rack.topology = topology
                new_rack.member_ips.extend(ips)

            self.parameters['mapr_distributed_cache_job'] = Base64EncodeProto(
                cache_job)

        libjars_uris = []

        for local_libjar_path in self.libjars:
            remote_jar = '/data/deluge/jar/%s' % (local_libjar_path)
            remote_jar_uri = 'maprfs://' + remote_jar
            print 'uploading jar: %s' % remote_jar_uri
            CopyUri('local://' + local_libjar_path, remote_jar_uri)
            libjars_uris.append(remote_jar_uri)

        cmd = 'hadoop pipes '

        if libjars_uris:
            libjars_string = self.__CreateValidUriCacheString(libjars_uris)
            cmd += '-libjars %s ' % (libjars_string)

        cmd += '-D mapred.job.name=%s ' % (job_name)
        cmd += '-D mapred.job.reuse.jvm.num.tasks=10 '  # reuse each JVM for up to 10 tasks
        cmd += '-D mapred.map.tasks=%d ' % (self.num_map_jobs)
        for k, v in self.parameters.iteritems():
            if not isinstance(k, basestring):
                LOG(FATAL, 'expected a string but got: %s' % k)
            cmd += '-D %s=%s ' % (k, str(v))
        cmd += '-program %s ' % (remote_binary_uri)
        cmd += '-input %s ' % (self.input_path)
        cmd += '-output %s ' % (self.output_path)
        cmd += '-reduces %d ' % (self.num_reduce_jobs)

        return cmd
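Base64EncodeProto is used above to smuggle the MaprDistributedCacheJob proto through a job parameter; a plausible sketch, assuming it is a thin wrapper over base64 and protobuf serialization (the project's actual helper may differ):

import base64

def Base64EncodeProtoSketch(message):
    # Sketch of the assumed helper: serialize the protobuf message and base64-encode
    # it so it survives being passed as a -D key=value job parameter.
    return base64.b64encode(message.SerializeToString())

def Base64DecodeProtoSketch(encoded, message):
    # Counterpart used on the worker side to recover the proto from the parameter.
    message.ParseFromString(base64.b64decode(encoded))
    return message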
Example #9
    def MakeRunCommand(self):
        # check preconditions
        assert (self.input_path)
        assert (self.output_path)
        assert (self.pipes_binary)
        CHECK_NE(self.num_map_jobs, None)
        CHECK_NE(self.num_reduce_jobs, None)

        # TODO(kheath): pipes fails with no helpful error message when scheme prefix
        # is used.  Is this expected?
        # The workaround for now is to strip off the scheme prefix.
        scheme_free_input_paths = []
        for orig_input_path in self.input_path.split(','):
            ok, scheme, path, error = py_pert.ParseUri(orig_input_path)
            CHECK(ok, error)
            CHECK_EQ(scheme, "maprfs")
            scheme_free_input_paths.append(path)
        self.input_path = ','.join(scheme_free_input_paths)

        ok, scheme, self.output_path, error = py_pert.ParseUri(
            self.output_path)
        CHECK(ok, error)
        CHECK_EQ(scheme, "maprfs")

        remote_binary = '/data/deluge/bin/%s' % (self.pipes_binary)
        remote_binary_uri = 'maprfs://' + remote_binary
        CopyUri('local://' + self.pipes_binary, remote_binary_uri)
        # If the profiler is on and a profiler timeout is set, disable retry
        # redundancy so we get profiling results sooner.
        if 'profiler' in self.parameters and self.parameters[
                'profiler'] == 'on' and 'profiler_timeout_sec' in self.parameters:
            self.parameters['mapred.map.max.attempts'] = '1'
            self.parameters['mapred.reduce.max.attempts'] = '1'

        if 'mapred.task.timeout' not in self.parameters:
            self.parameters['mapred.task.timeout'] = '1200000'  #  = 20 min

        self.parameters['mapred.map.tasks.speculative.execution'] = 'true'
        self.parameters['mapred.reduce.tasks.speculative.execution'] = 'true'
        self.parameters['mapred.compress.map.output'] = 'true'
        self.parameters[
            'mapred.map.output.compression.codec'] = 'org.apache.hadoop.io.compress.SnappyCodec'

        # These are required for c++ code that uses maprfs or hdfs
        self.parameters[
            'mapred.map.child.env'] = 'CLASSPATH=$CLASSPATH:$(hadoop classpath)'
        self.parameters[
            'mapred.reduce.child.env'] = 'CLASSPATH=$CLASSPATH:$(hadoop classpath)'

        binary_name = os.path.basename(self.pipes_binary)
        job_name = '%s_%d' % (binary_name, time.time())

        libjars_uris = []

        for local_libjar_path in self.libjars:
            remote_jar = '/data/deluge/jar/%s' % (local_libjar_path)
            remote_jar_uri = 'maprfs://' + remote_jar
            print 'uploading jar: %s' % remote_jar_uri
            CopyUri('local://' + local_libjar_path, remote_jar_uri)
            libjars_uris.append(remote_jar_uri)

        cmd = 'hadoop pipes '

        if self.uris_to_cache:
            uri_cache_string = self.__CreateValidUriCacheString(
                self.uris_to_cache)
            cmd += '-files %s ' % (
                uri_cache_string
            )  # these get added to the hadoop distributed cache

        if libjars_uris:
            libjars_string = self.__CreateValidUriCacheString(libjars_uris)
            cmd += '-libjars %s ' % (libjars_string)

        cmd += '-D mapred.job.name=%s ' % (job_name)
        cmd += '-D mapred.job.reuse.jvm.num.tasks=10 '  # reuse each JVM for up to 10 tasks
        cmd += '-D mapred.map.tasks=%d ' % (self.num_map_jobs)
        for k, v in self.parameters.iteritems():
            if not isinstance(k, basestring):
                LOG(FATAL, 'expected a string but got: %s' % k)
            cmd += '-D %s=%s ' % (k, str(v))
        cmd += '-program %s ' % (remote_binary_uri)
        cmd += '-input %s ' % (self.input_path)
        cmd += '-output %s ' % (self.output_path)
        cmd += '-reduces %d ' % (self.num_reduce_jobs)

        return cmd
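A hypothetical end-to-end usage of MakeRunCommand; the class name PipesJob and the argument values are placeholders, since the examples above only show methods of the (unnamed) job class:

# Sketch only: constructor arguments are illustrative, not from the project.
job = PipesJob(pipes_binary='mapred_compute_features',
               input_path='maprfs:///data/input.pert',
               output_path='maprfs:///data/output.pert',
               num_map_jobs=100,
               num_reduce_jobs=10,
               uris_to_cache='maprfs:///data/model.pert')
cmd = job.MakeRunCommand()
# cmd is a single 'hadoop pipes ...' command line carrying -files/-libjars,
# the -D parameters assembled above, and -program/-input/-output/-reduces.
print cmd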