Ejemplo n.º 1
0
def _work(job_queue, result_queue, remote):
    """Worker loop: pull serialized jobs, run them and report the outcome.

    Runs forever. Each iteration takes one item from ``job_queue``,
    base64-decodes and unpickles it into a job, calls ``job.run()`` and puts
    ``(job.id, pickled_result)`` on ``result_queue``. Any ``Exception`` is
    printed, given a ``traceback`` attribute holding the formatted traceback,
    pickled and put on ``result_queue`` instead of a result. The id is
    ``None`` when the job could not even be deserialized.

    Parameters:
    job_queue: object whose ``get()`` returns a base64-encoded pickled job
    result_queue: object whose ``put()`` receives ``(job_id, pickled_payload)``
    remote: when true, ``fetch`` is called for the job's package path and for
        every input file that does not exist locally before the job runs

    NOTE: jobs are unpickled as received; only feed this loop from a trusted
    queue.
    """
    while True:
        job = None
        try:
            job = _pickle.loads(base64.b64decode(job_queue.get()))
            if remote:
                # The attribute name has to be passed as a string; the bare
                # name used previously raised NameError for every remote job.
                if hasattr(type(job), "package_path"):
                    if not os.path.exists(job.package_path):
                        fetch(job.package_path)
                for task_input in job.task.input.values():
                    # An input is either a single path or a list of paths.
                    if isinstance(task_input, list):
                        paths = task_input
                    else:
                        paths = [task_input]
                    for path in paths:
                        if not os.path.exists(path):
                            fetch(path)

            result = job.run()
            result_queue.put((job.id, _pickle.dumps(result)))
        except KeyboardInterrupt:
            # Interrupts are ignored here; the loop simply continues.
            pass
        except Exception as e:
            result_id = job.id if job is not None else None
            traceback.print_exc()
            # Ship the formatted traceback along with the pickled exception.
            e.traceback = traceback.format_exc()
            result_queue.put((result_id, _pickle.dumps(e)))
Ejemplo n.º 2
0
def _work(job_queue, result_queue):
    """Endless worker loop: unpickle a job, run it, publish what happened.

    Every item read from ``job_queue`` is unpickled and its ``run()`` method
    is called. On success ``(job.id, pickled_result)`` is put on
    ``result_queue``. When an ``Exception`` is raised it is printed, tagged
    with a ``traceback`` attribute containing the formatted traceback, and
    put on ``result_queue`` in pickled form. The accompanying id is ``None``
    if the failure happened before the job was deserialized.
    ``KeyboardInterrupt`` is ignored.
    """
    while True:
        current = None
        try:
            raw = job_queue.get()
            current = _pickle.loads(raw)
            # need something to create this task and map arguments properly
            outcome = current.run()
            finished_id = current.id
            result_queue.put((finished_id, _pickle.dumps(outcome)))
        except KeyboardInterrupt:
            continue
        except Exception as err:
            failed_id = None if current is None else current.id
            traceback.print_exc()
            err.traceback = traceback.format_exc()
            result_queue.put((failed_id, _pickle.dumps(err)))
Ejemplo n.º 3
0
    def to_s3(self, obj, bucket_id, key, protocol=0, **kwargs):
        """
        Save an object to Amazon S3 as a zlib-compressed string.

        A plain ``str`` is compressed as it is; any other object is first
        serialized with ``dumps(obj, protocol)``. The bucket is looked up on
        ``self.conn_s3`` and created when the lookup fails.

        Parameters:
        obj: the object to be saved
        bucket_id: the name of the bucket in which the object is stored
        key: the key to assign to the saved object
        protocol: passed to ``dumps``
        **kwargs: passed to the key's `set_contents_from_string`

        Returns:
        the key that was passed in

        Raises:
        Exception: if ``obj`` could not be serialized and compressed
        """

        # Exact type match kept from the original: a str subclass goes
        # through the serialization path below.
        if type(obj) is str:
            obj_string = zlib.compress(obj)
        else:
            try:
                obj_string = zlib.compress(dumps(obj, protocol))
            except Exception:
                # Same exception type and message as before, but
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                raise Exception("obj could not be pickled")

        try:
            bucket = self.conn_s3.get_bucket(bucket_id)
        except Exception:
            # Any lookup failure is treated as "bucket does not exist yet".
            bucket = self.conn_s3.create_bucket(bucket_id)

        k = Key(bucket)
        k.key = key
        k.set_contents_from_string(obj_string, **kwargs)

        return key
Ejemplo n.º 4
0
    def register(self, function, server):
        """
        Register ``function`` with the registrar reachable on ``server``.

        The function is serialized with cloudpickle and the hash of that
        payload is looked up in the metadata fetched from the registrar
        (cached on ``self._metadata_manager`` after the first fetch). A known
        hash returns the stored value; otherwise the payload is sent to the
        registrar for registration.

        Returns a function id used by the local invoker to proxy the call.
        On a communication or registration error a message is printed and the
        process exits with status 1.
        """
        # NOTE(review): the "[email protected]" fragment looks like an e-mail
        # obfuscation artifact rather than a real Pyro object name -- confirm
        # the intended URI before relying on it.
        uri = "PYRO:[email protected]{0}:8007".format(server)
        registrar = Pyro4.Proxy(uri)
        payload = cloudpickle.dumps(function)

        try:
            if self._metadata_manager is None:
                self._metadata_manager = registrar.get_metadata()
            known = self._metadata_manager
            # A function that was registered before is served from metadata.
            fingerprint = hash(payload)
            if fingerprint in known:
                return known[fingerprint]
            # Not seen yet: hand the pickled function to the registrar.
            return registrar.deserialize_and_register(function.func_name, payload)
        except Pyro4.errors.CommunicationError as comm_error:
            print "Communication error:", comm_error
            print "Did you run the easyrpc start script?"
            sys.exit(1)
        except Exception as reg_error:
            print "Remote registration error:", reg_error
            sys.exit(1)
Ejemplo n.º 5
0
 def pickle(self, job):
     """Serialize ``job``; on failure record the job as failed.

     Returns the pickled job. If serialization raises, the exception is
     stored in ``self.results`` under the job id, the job is flagged as
     failed and moved to ``self.completed_jobs``, ``self.fail_downstream``
     is called for it, and ``None`` is returned.
     """
     try:
         return _pickle.dumps(job)
     except Exception as error:
         self.results[job.id] = error
         job.failed = True
         self.completed_jobs[job.id] = job
         self.fail_downstream(job)
     return None
Ejemplo n.º 6
0
    def dispatch(self, job):
        """Queue ``job`` for execution and update the scheduler bookkeeping.

        The pickled job is put on ``self.job_queue`` and the job is recorded
        in ``self.pending_jobs``. If that raises, the exception is stored in
        ``self.results``, the job is flagged as failed and moved to
        ``self.completed_jobs``, and ``self.fail_downstream`` is called.
        In both cases the job is removed from ``self.candidate_jobs`` and its
        cores are added to ``self.num_cores_in_use``.
        """
        try:
            serialized = _pickle.dumps(job)
            self.job_queue.put(serialized)
            self.pending_jobs[job.id] = job
        except Exception as error:
            self.results[job.id] = error
            job.failed = True
            self.completed_jobs[job.id] = job
            self.fail_downstream(job)

        # NOTE(review): a job that failed above still has its cores counted
        # as in use -- confirm that they are released elsewhere.
        self.candidate_jobs.pop(job.id)
        self.num_cores_in_use = self.num_cores_in_use + job.num_cores
0
 def map_sync(self, f, *args):
     """Apply ``f`` to each zipped group of arguments, retrying failures.

     The argument iterables are zipped; each group is round-tripped through
     ``dumps``/``loads`` and passed to ``f``. A call that raises is retried
     until ``self.tries`` attempts have been made. A successful call
     contributes exactly one result.

     Returns:
     the list of results, in input order

     Raises:
     CompositeError: if at least one group failed on all of its attempts;
         it carries one exception per failed group
     """
     res = []
     exceptions = []
     for group in zip(*args):
         # simulate network roundtrip
         group = loads(dumps(group))
         for attempt in range(1, self.tries + 1):
             try:
                 value = f(*group)
             except Exception:
                 if attempt == self.tries:
                     # Out of attempts: keep the error, round-tripped through
                     # wrap/unwrap (presumably to mimic an error coming back
                     # from a remote engine -- verify).
                     exceptions.append(unwrap_exception(wrap_exception()))
             else:
                 res.append(value)
                 # Stop after the first success; without this the call was
                 # repeated and its result appended once per attempt.
                 break
     if exceptions:
         raise CompositeError("Mock Composite error", exceptions)
     return res
Ejemplo n.º 8
0
def main(load, train):
    """
    Train (or reload) an IBM Model 2 word aligner and report its alignments.

    When ``train`` is true, the "seg" elements of the TMX file are paired up,
    tokenized, lowercased and stripped of stopwords. Every usable pair is
    added to the corpus twice: first as the plain token lists, then as the
    output of ``pos_realign``. The trained model and the corpus are
    serialized to ``models/ibm2.p`` and ``models/corpus``. When ``train`` is
    false both are loaded back from those files.

    Output is controlled by the names ``csv`` and ``debug``, which are not
    defined in this function (presumably module-level flags -- verify):
    ``csv`` prints "source,target,precision" rows for the best translation of
    each term, ``debug`` prints a JSON document with per-word alignment
    details.

    Parameters:
    load: not used by this function
    train: whether to train from the TMX file or load the saved model
    """
    if train:
        # Parse the TMX file into python objects; the tree is fully read by
        # the parser, so the file can be closed right away.
        with open('../memoire_en-US_es-ES.tmx', 'r') as tmx_file:
            tmx_tree = etree.parse(tmx_file)
        # create a list of the "seg" elements, where our segments are contained
        tree = list(tmx_tree.iter("seg"))
        # Pair these segments up: (0, 1), (2, 3), ... A trailing unpaired
        # segment is dropped.
        pairs = list(zip(tree[::2], tree[1::2]))
        corpus = []
        # import the source and target language stopwords (sets make the
        # per-token membership test cheap)
        stopwords_en = set(nltk.corpus.stopwords.words('english'))
        stopwords_es = set(nltk.corpus.stopwords.words('spanish'))

        # Tokenize and filter once; both corpus passes below use the result.
        usable = []
        for e, k in pairs:
            # skip segments that are empty or only 1 char long
            if (e.text and len(e.text) > 1) and (k.text and len(k.text) > 1):
                # split the words of the segment into a list. Lowercase all
                # the tokens and eliminate any stopwords
                e_token = [w.lower() for w in nltk.word_tokenize(unicode(e.text)) if w.lower() not in stopwords_en]
                k_token = [x.lower() for x in nltk.word_tokenize(unicode(k.text)) if x.lower() not in stopwords_es]
                # keep purely alphabetic segments only; isalpha() is False for
                # an empty string, so this also rejects empty token lists
                if ''.join(e_token).isalpha() and ''.join(k_token).isalpha():
                    tuid = e.getparent().getparent().attrib['tuid']
                    usable.append((e_token, k_token, tuid))

        # first pass: the plain token lists
        for e_token, k_token, tuid in usable:
            corpus.append(AlignedSent(e_token, k_token, tuid=tuid))

        # second pass: the same segments after pos_realign
        for e_token, k_token, tuid in usable:
            source_out, target_out = pos_realign(" ".join(e_token), " ".join(k_token))
            corpus.append(AlignedSent(source_out, target_out, tuid=tuid))

        # train the aligned corpus to figure out which pairs of words match
        # NOTE(review): the files are opened in text mode; if `dumps` emits a
        # binary format this needs "wb"/"rb" -- confirm.
        model = dumps(IBMModel2(corpus, 1))
        with open("models/ibm2.p", "w") as dest:
            dest.write(model)
        result = dumps(corpus)
        with open("models/corpus", "w") as dest:
            dest.write(result)
    else:
        with open("models/ibm2.p") as source:
            result = source.read()
        model = loads(result)
        with open("models/corpus") as source:
            result = source.read()
        corpus = loads(result)
    if csv:
        # iterate through the model
        print("%s,%s,%s" % ("source", "target", "precision"))
        for k, v in model.probabilities.items():
            # print the term pair if the precision is >= 0.5 and not the same string
            best_score = max(v.values())
            if best_score >= 0.5:
                source_term = k.encode('utf-8')
                best_target = max(v, key=v.get).encode('utf-8')
                if source_term != best_target:
                    print("%s,%s,%s" % (source_term, best_target, best_score))
    # get sent alignments with TUID property
    if debug:
        aligned_corpus = {}
        for sent in corpus:
            aligned = model.align(sent)
            for word in aligned.words:
                v = model.probabilities[word]
                if word in aligned_corpus:
                    # only the first occurrence of a word is reported
                    continue
                best_score = max(v.values())
                if best_score < 0.4:
                    continue
                best_target = max(v, key=v.get)
                # a falsy best target is skipped before it is encoded
                if not best_target:
                    continue
                if word.encode('utf-8') == best_target.encode('utf-8'):
                    continue
                entry = {}
                aligned_corpus[word] = entry
                entry['target'] = best_target.encode('utf-8')
                entry['tuid'] = aligned.tuid
                entry['precision'] = best_score
                entry['source_sent'] = " ".join(aligned.words).encode('utf-8')
                entry['target_sent'] = " ".join(aligned.mots).encode('utf-8')
                entry['alignment'] = str(aligned.alignment)
        print(json.dumps(aligned_corpus, indent=4, sort_keys=False))
Ejemplo n.º 9
0
    def create_script(self, func, func_kwargs=None, mask="anaconda", aptget=None, custom=None):
        """
        Create a custom python script to run a function on EC2.

        The generated script defines a few helpers, opens an S3 connection,
        installs and imports the dependencies reported by
        ``self.get_objects``, rebuilds ``func`` from its pickled form, calls
        it and uploads the result to ``self.collection_bucket`` under a fresh
        uuid key.

        Parameters
        __________

        * func: an arbitrary function
        * func_kwargs: a dictionary of keyword arguments to feed to func
        * mask: a key in lib_dicts indicating which Python modules should be
        assumed loaded on EC2
        * aptget: a list of package names to install on EC2 via apt-get (not
        tested)
        * custom: an iterable of commands; each one is run on its own with
        `subprocess.call` before the function's modules are imported (not
        tested)

        Returns the script source as a string.

        NOTE: the AWS access key id and secret access key are written into
        the script in plain text; treat the result as a secret.
        """

        def call_line(*argv):
            # One `subprocess.call([...])` statement for the generated script.
            return "subprocess.call(%r)\n" % (list(argv),)

        # get function dependencies
        setup_specs = self.get_objects(func, mask=mask)

        # static preamble: imports and helper definitions, always emitted
        preamble = (
            "#!/usr/bin/env python\n"
            "try:\n    import cPickle as pickle\n"
            "except:\n    import pickle\n\n"
            "try:\n    from cStringIO import StringIO\n"
            "except:\n    from StringIO import StringIO\n\n"
            "import sys, os, time, uuid, inspect, imp, dis, subprocess, zlib\n\n"
            "from cloud.serialization.cloudpickle import dumps\n"
            "import boto.s3.connection as s3\n"
            "from boto.s3.key import Key\n"
            "def quotes(s, remove=True):\n"
            "    empty = chr(32)[:0]\n"
            "    double = [100, 111, 117, 98, 108, 101]\n"
            "    middle = [95, 95, 95, 113, 95, 95, 95]\n"
            "    single = [115, 105, 110, 103, 108, 101]\n"
            "    grave = [103, 114, 97, 118, 101]\n"
            "    double_r = empty.join([chr(x) for x in double+middle+double])\n"
            "    single_r = empty.join([chr(x) for x in single+middle+single])\n"
            "    grave_r = empty.join([chr(x) for x in grave+middle+grave])\n"
            "    if remove:\n"
            "        return s.replace(chr(34), double_r).replace(chr(39), single_r\n"
            "            ).replace(chr(96), grave_r)\n"
            "    else:\n"
            "        return s.replace(double_r, chr(34)).replace(single_r, chr(39)\n"
            "            ).replace(grave_r, chr(96))\n\n"
            "def manual_module(path, name, from_string=True):\n"
            "    new_module = imp.new_module(name)\n"
            "    if not from_string:\n"
            "        with open(path, 'r') as f:\n"
            "            new_module_code = f.read()\n"
            "    else:\n"
            "        new_module_code = path\n"
            "    new_module_code = new_module_code\n"
            "    exec new_module_code in new_module.__dict__\n"
            "    sys.modules[name] = new_module\n\n"
            "def to_s3(obj, conn, bucket_id, key, **kwargs):\n"
            "    if type(obj)==str:\n"
            "        obj_string = zlib.compress(obj)\n"
            "    else:\n"
            "        try:\n"
            "            obj_string = zlib.compress(dumps(obj))\n"
            "        except:\n"
            "            raise Exception('obj could not be pickled')\n"
            "    if type(conn) is s3.S3Connection:\n"
            "        try:\n"
            "            bucket = conn.get_bucket(bucket_id)\n"
            "        except:\n"
            "            bucket = conn.create_bucket(bucket_id)\n"
            "    elif type(conn) is s3.bucket.Bucket:\n"
            "        bucket = conn\n"
            "    k = Key(bucket)\n"
            "    k.key = key\n"
            "    _ = k.set_contents_from_string(obj_string, **kwargs)\n"
            "    return key\n\n"
        )
        parts = [preamble]

        # establish an s3 connection
        parts.append(
            "conn_s3 = s3.S3Connection('%(aki)s', '%(sak)s')\n"
            % {"aki": self.access_key_id, "sak": self.secret_access_key}
        )

        # install any non-python programs necessary
        if aptget is not None:
            parts.append(call_line("apt-get", "upgrade"))
            for a in aptget:
                parts.append(call_line("apt-get", "install", "-y", a))

        # install pip then install modules
        if len(setup_specs.installs) > 0:
            if mask is not None:
                if "python-pip" not in mask:
                    # "-y" keeps apt-get from waiting on a confirmation
                    # prompt, as for the other apt-get installs above.
                    parts.append(call_line("apt-get", "install", "-y", "python-pip"))
            for p in setup_specs.installs:
                # `pip install` has no "-y" option (it belongs to
                # `pip uninstall`); passing it made every install fail.
                parts.append(call_line("pip", "install", p))

        # do anything else before importing the function's modules
        if custom is not None:
            for c in custom:
                parts.append(call_line(c))

        # import custom modules (not on path): their source is embedded in
        # the script and turned into a module at run time
        for item in setup_specs.full_files:
            module_name = item["name"]
            with open(item["path"], "r") as f:
                module_code = self.quotes(f.read())
            parts.append(
                "manual_module(quotes(%s, remove=False), %s, from_string=True)\n"
                % (repr(module_code), repr(module_name))
            )
            parts.append("import %s\n" % module_name)
            parts.append("from %s import *\n" % module_name)

        # import any other modules necessary for the function to run
        for i in setup_specs.imports:
            parts.append("%s\n" % i)

        # recreate and run the function
        dumped_code = self.quotes(dumps(func))
        parts.append("do_func_code = quotes(%s, remove=False)\n" % repr(dumped_code))
        parts.append("do_func = pickle.loads(do_func_code)\n")
        # copy the function's non-dunder globals into the script's namespace
        parts.append("f_glob = do_func.func_globals\n")
        parts.append("exports = [x for x in f_glob if not x.startswith(chr(95)*2)]\n")
        parts.append("for name in exports:\n")
        parts.append("    globals()[name]  = f_glob[name]\n")

        if func_kwargs is None:
            parts.append("output = do_func()\n")
        else:
            parts.append("kwargs = quotes(%s, remove=False)\n" % (repr(self.quotes(dumps(func_kwargs)))))
            parts.append("kwargs = pickle.loads(kwargs)\n")
            parts.append("output = do_func(**kwargs)\n")

        # transfer results to s3
        parts.append("key_id = str(uuid.uuid1())\n")
        parts.append(
            "to_s3(output, conn=conn_s3, bucket_id=%s, key=key_id)\n" % (repr(self.collection_bucket.name))
        )
        return "".join(parts).strip("\n")
Ejemplo n.º 10
0
    def get_objects(self, func, mask="anaconda"):
        """
        Return a class with three attributes:

        * imports: statements (import lines, or assignments that unpickle an
        object defined in ``__main__``) needed for the function to run on EC2
        * installs: names of packages found under ``site-packages`` that will
        first need to be installed in order for the function to be run on
        EC2; packages covered by ``self.lib_dict[mask]`` are left out
        * full_files: dicts with ``name`` and ``path`` of local source files
        that will need to be imported as custom modules on EC2

        Parameters:
        func: the function whose dependencies are collected
        mask: a key of ``self.lib_dict``, or None to keep every install
        """
        imports_list = self.extract_code_dependencies(func)

        # Keep expanding objects flagged by flag_interactive_objects until no
        # unexpanded ones are left; each alias is expanded once.
        expanded = set()
        while True:
            pending = [
                alias
                for alias, obj in imports_list.items()
                if alias not in expanded and self.flag_interactive_objects(obj)
            ]
            if not pending:
                break
            for alias in pending:
                imports_list.update(self.extract_code_dependencies(imports_list[alias]))
                expanded.add(alias)

        # The module that defines func is a dependency too, unless func
        # lives in __main__.
        parent_module = getattr(func, "__module__", None) or "__main__"
        if parent_module != "__main__" and parent_module not in imports_list:
            imports_list[parent_module] = sys.modules[parent_module]

        stdlib_dir = "python%d.%d" % sys.version_info[0:2]

        line_items = []
        apt_get = []
        load_files = []
        for alias, obj in imports_list.items():
            if inspect.ismodule(obj):
                line_item = "import %s" % obj.__name__
                # built-in modules have no __file__
                source_path = getattr(obj, "__file__", None)
                is_import = True
            elif obj.__module__ != "__main__":
                line_item = "from %s import %s" % (obj.__module__, obj.__name__)
                source_path = inspect.getsourcefile(obj)
                is_import = True
            else:
                # Objects defined in __main__ cannot be imported remotely;
                # ship them pickled instead.
                pickled_obj = dumps(obj)
                line_item = "%s = pickle.loads(quotes(%s, remove=False))\n" % (alias, repr(self.quotes(pickled_obj)))
                source_path = None
                is_import = False

            # Only import statements can take an "as" clause; appending it
            # to the pickled assignment produced invalid code.
            if is_import and obj.__name__ != alias:
                line_item += " as %s" % alias

            if source_path is not None:
                path_parts = source_path.split("/")
                if "site-packages" in path_parts:
                    # the component right after site-packages is the package
                    apt_get.append(path_parts[path_parts.index("site-packages") + 1])
                elif stdlib_dir not in path_parts:
                    # neither installed nor standard library: a local file
                    load_files.append({"name": alias, "path": inspect.getsourcefile(obj)})

            line_items.append(line_item)

        if mask is not None:
            provided = self.lib_dict[mask]
            apt_get = [item for item in apt_get if not any(item.startswith(x) for x in provided)]

        class script_setup(object):
            imports = list(set(line_items))
            installs = list(set(apt_get))
            full_files = load_files

        return script_setup
Ejemplo n.º 11
0
 def test_lambda_cp(self):
     """Round-trip a file-opening lambda through cloudpickle and call the copy.

     The restored lambda opens and closes "test.txt", so that file has to
     exist in the working directory.
     """
     from cloud.serialization import cloudpickle as cp
     opener = lambda filename: open(filename, 'r').close()
     serialized = cp.dumps(opener)
     restored = cp.loads(serialized)
     restored("test.txt")