def extract_dialogues(filename, pkl_filename, restaurant_db):
    """
    Extract dialogues from given filename as list of lists
    :param filename:
    :return:
    """
    dialogues = []

    # Create DB
    if not os.path.exists(restaurant_db):
        conn = sqlite3.connect(restaurant_db)
        c = conn.cursor()
        print "Creating DB"
        c.execute("""CREATE TABLE Restaurants (name text unique, post_code text, cuisine text, location text,
              phone text, address text, price text, rating text)""")
        conn.commit()
    else:
        conn = sqlite3.connect(restaurant_db)
        c = conn.cursor()


    with open(filename, "r") as f:
        exchanges = []
        # (Post_code, cuisine, location, phone, address, price, rating)
        api_results = []

        for line in f:
            # Signifies that end of dialogue has been reached so
            # output utterances
            if line == "\n":
                dialogues.append(exchanges)
                restaurants = process_api_results(api_results)

                # Update restaurants in DB
                if len(restaurants) != 0:
                    for r in restaurants:
                        c.execute("INSERT OR IGNORE INTO Restaurants VALUES "
                                  "(?,?,?,?,?,?,?,?)", r)
                        conn.commit()

                exchanges = []
                api_results = []
                continue

            contents = line.strip().split("\t")
            if len(contents) == 1:
                clean_contents = " ".join(contents[0].strip().split(" ")[1:])
                if clean_contents != "" and clean_contents != "api_call no result":
                    api_results.append(clean_contents)

            else:
                user, system = contents[0], contents[1]
                user = "******".join(user.split(" ")[1:])

                exchanges.append((user, system))


    print "Dialogues: ", len(dialogues)
    with open(pkl_filename, "wb") as f:
        pickle.dump(dialogues, f)
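# Usage sketch (not part of the original script): the file names below are
# illustrative, and the input is assumed to be a tab-separated dialogue file
# in the "<turn-id> user utterance\tsystem utterance" format parsed above.
extract_dialogues("dialog-task1-train.txt", "dialogues.pkl", "restaurants.db")

with open("dialogues.pkl", "rb") as f:
    dialogues = pickle.load(f)
print("Loaded", len(dialogues), "dialogues")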
def encrypt(key, path="message.txt", saveCT="ciphertext.enc"):
    b = random.randrange(2, (key.p)-1)
    u = modexp(key.g, b, key.p)
    v = modexp(key.h, b, key.p)

    uv = str(u)+str(v)
    k = SHA224.new(uv.encode('utf-8')).hexdigest().encode('utf-8')  # symmetric key used to encrypt the plaintext with Blowfish
    print("K: "+str(k))

    # Open file plaintext to cipher
    plaintext = open(path,"rb").read()
    #plaintext = encode(plaintext, key.iNumBits)

    bs = Blowfish.block_size
    iv = Random.new().read(bs)
    cipher = Blowfish.new(k, Blowfish.MODE_CBC, iv)
    plen = bs - divmod(len(plaintext),bs)[1]
    padding = [plen]*plen
    padding = struct.pack('b'*plen,*padding)
    ciphertext = iv + cipher.encrypt(plaintext+padding)

    # Save ciphertext to file:
    print("CT-LEN:"+str(len(ciphertext)))
    with open(saveCT, 'wb') as output:
        dill.dump(u, output)
        dill.dump(ciphertext, output)

    return plaintext, ciphertext
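# A decryption sketch for the hybrid scheme above (not from the original module).
# Assumption: the private exponent is exposed as `key.a` (hypothetical name),
# with key.h == pow(key.g, key.a, key.p); pow() stands in for modexp.
from Crypto.Cipher import Blowfish
from Crypto.Hash import SHA224
import dill

def decrypt_sketch(key, loadCT="ciphertext.enc"):
    with open(loadCT, 'rb') as ctfile:
        u = dill.load(ctfile)             # same order as the dumps in encrypt()
        ciphertext = dill.load(ctfile)

    v = pow(u, key.a, key.p)              # shared secret (g^b)^a mod p
    uv = str(u) + str(v)
    k = SHA224.new(uv.encode('utf-8')).hexdigest().encode('utf-8')

    bs = Blowfish.block_size
    iv, body = ciphertext[:bs], ciphertext[bs:]
    cipher = Blowfish.new(k, Blowfish.MODE_CBC, iv)
    padded = cipher.decrypt(body)
    plen = padded[-1]                     # padding bytes each hold their own count
    return padded[:-plen]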
Example #3
def grid_search(X, y):
    '''
    cross validated grid search using Ridge Regressor and Random
    Forest Regressor
    '''

    nids = df_subset.index
    titles = df_subset['title']

    pars = {'alpha': [0.8, 0.6, 0.5, 0.45, 0.4, 0.2, 0.1,
                      0.08, 0.07, 0.06, 0.05, 0.04, 0.03, 0.02]}

    gs = GridSearchCV(Ridge(), pars, cv=5)
    gs.fit(X, y)

    ridge = gs.best_estimator_
    dill.dump(ridge, open('ridge.pkl', 'wb'))

    pars = {'max_depth': [5, 8, 10, 20, 50, 100],
            'min_samples_split': [2, 3, 5, 10, 20]}

    gs = GridSearchCV(RFR(n_estimators=100, random_state=42, n_jobs=2),
                      pars, cv=5)
    gs.fit(X, y)
    rfr = gs.best_estimator_
    dill.dump(rfr, open('rfr.pkl', 'wb'))
    return ridge, rfr
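# Usage sketch (not from the original script): X and y are assumed to be the
# feature matrix and target vector prepared elsewhere; the pickled estimator
# can be reloaded later with dill.
ridge, rfr = grid_search(X, y)

with open('ridge.pkl', 'rb') as f:
    ridge_reloaded = dill.load(f)
print(ridge_reloaded.get_params())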
    def save(self, experiment_dir):
        """
        Saves the current model and related training parameters into a subdirectory of the checkpoint directory.
        The name of the subdirectory is the current local time in Y_M_D_H_M_S format.
        Args:
            experiment_dir (str): path to the experiment root directory
        Returns:
             str: path to the saved checkpoint subdirectory
        """
        date_time = time.strftime('%Y_%m_%d_%H_%M_%S', time.localtime())

        self._path = os.path.join(experiment_dir, self.CHECKPOINT_DIR_NAME, date_time)
        path = self._path

        if os.path.exists(path):
            shutil.rmtree(path)
        os.makedirs(path)
        torch.save({'epoch': self.epoch,
                    'step': self.step,
                    'optimizer': self.optimizer
                   },
                   os.path.join(path, self.TRAINER_STATE_NAME))
        torch.save(self.model, os.path.join(path, self.MODEL_NAME))

        with open(os.path.join(path, self.INPUT_VOCAB_FILE), 'wb') as fout:
            dill.dump(self.input_vocab, fout)
        with open(os.path.join(path, self.OUTPUT_VOCAB_FILE), 'wb') as fout:
            dill.dump(self.output_vocab, fout)

        return path
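    # A hedged sketch of the matching load step (not part of the original class):
    # it reverses what save() writes, using the same file-name constants, and
    # returns a plain dict to avoid assuming the constructor signature.
    @classmethod
    def load_sketch(cls, checkpoint_path):
        trainer_state = torch.load(os.path.join(checkpoint_path, cls.TRAINER_STATE_NAME))
        model = torch.load(os.path.join(checkpoint_path, cls.MODEL_NAME))
        with open(os.path.join(checkpoint_path, cls.INPUT_VOCAB_FILE), 'rb') as fin:
            input_vocab = dill.load(fin)
        with open(os.path.join(checkpoint_path, cls.OUTPUT_VOCAB_FILE), 'rb') as fin:
            output_vocab = dill.load(fin)
        return {'model': model, 'input_vocab': input_vocab,
                'output_vocab': output_vocab, **trainer_state}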
Example #5
def dill_save(obj, name, folder='pc'):
    """This scripts saves any kind of object as a dill file in a folder.

    Args:
        obj:		object you want to save in an pkl file
        name:		name of pkl file, '.pkl' will be added automatically if missing
    """
    from pencilnew.io.mkdir import mkdir
    from os import remove
    from os.path import join, exists
    import dill

    mkdir(folder)        ## prepare folder

    if (not name.endswith('.dill')): name = name+'.dill'
    if folder=='pc' and name.startswith('pc/'): name=name[3:]

    full_path = join(folder, name)

    if exists(full_path): remove(full_path)

    with open(join(folder, name), 'wb') as f:
        dill.dump(obj, f)

    return True
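# Usage sketch (not from the original module): save a throwaway object and
# read it back with dill; the folder and file name are illustrative only.
import dill

dill_save({'a': 1, 'b': [1, 2, 3]}, 'example_obj', folder='pc')
with open('pc/example_obj.dill', 'rb') as f:
    restored = dill.load(f)
print(restored['b'])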
Example #6
def save_dump(filename, tb=None):
    """
    Saves a Python traceback in a pickled file. This function will usually be called from
    an except block to allow post-mortem debugging of a failed process.

    The saved file can be loaded with load_dump which creates a fake traceback
    object that can be passed to any reasonable Python debugger.

    The simplest way to do that is to run:

       $ pydump.py my_dump_file.dump
    """
    if not tb:
        tb = sys.exc_info()[2]
    fake_tb = FakeTraceback(tb)
    _remove_builtins(fake_tb)
    dump = {
        "traceback": fake_tb,
        "files": _get_traceback_files(fake_tb),
        "dump_version": DUMP_VERSION,
    }
    with gzip.open(filename, "wb") as f:
        if dill is not None:
            dill.dump(dump, f)
        else:
            pickle.dump(dump, f, protocol=pickle.HIGHEST_PROTOCOL)
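# Usage sketch (not from the original module): call save_dump() from an except
# block so the failing traceback can be inspected post-mortem later.
def risky():
    return 1 / 0

try:
    risky()
except ZeroDivisionError:
    save_dump("crash.dump")   # tb defaults to sys.exc_info()[2]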
Example #7
def save_session(fname=None, session=None, pickleProto=4):
    import dill as pickle

    if fname is None:
        fname = conf.session
        if not fname:
            conf.session = fname = utils.get_temp_file(keep=True)
            log_interactive.info("Use [%s] as session file" % fname)
    if session is None:
        session = builtins.__dict__["kamene_session"]

    to_be_saved = session.copy()
        
    for k in list(to_be_saved.keys()):
        if k in ["__builtins__", "In", "Out", "conf"] or k.startswith("_") or \
                (hasattr(to_be_saved[k], "__module__") and str(to_be_saved[k].__module__).startswith('IPython')):
            del(to_be_saved[k])
            continue
        if type(to_be_saved[k]) in [type, types.ModuleType, types.MethodType]:
             log_interactive.info("[%s] (%s) can't be saved." % (k, type(to_be_saved[k])))
             del(to_be_saved[k])

    try:
        os.rename(fname, fname+".bak")
    except OSError:
        pass
    f=gzip.open(fname,"wb")
    for i in to_be_saved.keys():
        #d = {i: to_be_saved[i]}
        #pickle.dump(d, f, pickleProto)
        pickle.dump(to_be_saved, f, pickleProto)
    f.close()
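# A hedged sketch of the reverse operation (not taken from the original source):
# read the gzipped pickle written above and merge it back into the session.
def load_session_sketch(fname):
    import dill as pickle
    with gzip.open(fname, "rb") as f:
        restored = pickle.load(f)
    builtins.__dict__["kamene_session"].update(restored)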
def qsubwrap(fun, *args,
             pickle_dir=None, messages_dir=None,
             queue=default_queue, extra_options='',
             verbose=False,
             **kwargs):
    """Submit a qsub job to call fun(*args, **kwargs)
        pickledir will be used to exchange input/output via pickles. Defaults to os.cwd() / qsub_pickles
        messages_dir will contain the files with stdout and stderr from the jobs. Defaults to os.cwd() / qsub_messages
        queue: name of the queue qsub will submit the job to
        extra_options will be passes directly to the qsub command.
    returns: job_name, filename of pickle which will contain result of function when the job is done
    """
    job_name = fun.__name__ + '%09d' % randint(0, 1e9)
    if verbose:
        print("Starting submission of job %s" % job_name)

    if pickle_dir is None:
        pickle_dir = os.path.join(os.getcwd(), 'qsub_pickles')
    if not os.path.exists(pickle_dir):
        os.makedirs(pickle_dir)
    if messages_dir is None:
        messages_dir = os.path.join(os.getcwd(), 'qsub_messages')
    if not os.path.exists(messages_dir):
        os.makedirs(messages_dir)
    extra_options += ' -e localhost:{messages_dir} -o localhost:{messages_dir}'.format(messages_dir=messages_dir)

    input_pickle_name = os.path.join(pickle_dir,
                                     strftime('input_%Y%m%d_%H%M%S_')
                                     + job_name + '.pickle')
    output_pickle_name = os.path.join(pickle_dir,
                                      strftime('output_%Y%m%d_%H%M%S_')
                                      + job_name + '.pickle')

    if verbose:
        print("Writing input pickle for job %s" % job_name)
    with open(input_pickle_name, 'wb') as input_pickle:
        pickle.dump(dict(fun=fun, fun_args=args, fun_kwargs=kwargs), input_pickle)

    # Make the python script for this job
    py_script = tempfile.NamedTemporaryFile(suffix=".py", delete=False, mode='w')
    if verbose:
        print("Making py script pickle for job %s: %s" % (job_name, py_script.name))
    py_script.write(py_script_template.format(input_pickle_name=input_pickle_name,
                                              output_pickle_name=output_pickle_name,
                                              python_path=sys.executable))
    py_script.close()
    make_executable(py_script.name)

    # Submit the script to qsub
    # Now we'll learn our actual jobname from qsub, which we return
    cmd = submission_command_template.format(queue=queue,
                                             script_name=py_script.name,
                                             messages_dir=messages_dir,
                                             extra_options=extra_options,
                                             job_name=job_name)
    if verbose:
        print("qsub command for job %s: %s" % (job_name, cmd))
    job_name = subprocess.check_output(cmd, shell=True).decode('utf-8').rstrip()

    return job_name, output_pickle_name
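# Usage sketch (not from the original module): submit a function call and, once
# the queued job has finished, read the result pickle back. The exact layout of
# that pickle depends on py_script_template, which is not shown here.
def add(a, b):
    return a + b

job, result_pickle = qsubwrap(add, 2, 3, verbose=True)

# ... after the qsub job has completed ...
with open(result_pickle, 'rb') as f:
    result = pickle.load(f)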
Example #9
def dump(object, **kwds):
    """dill.dump of object to a NamedTemporaryFile.
Loads with "dill.temp.load".  Returns the filehandle.

    >>> dumpfile = dill.temp.dump([1, 2, 3, 4, 5])
    >>> dill.temp.load(dumpfile)
    [1, 2, 3, 4, 5]

Optional kwds:
    If 'suffix' is specified, the file name will end with that suffix,
    otherwise there will be no suffix.
    
    If 'prefix' is specified, the file name will begin with that prefix,
    otherwise a default prefix is used.
    
    If 'dir' is specified, the file will be created in that directory,
    otherwise a default directory is used.
    
    If 'text' is specified and true, the file is opened in text
    mode.  Else (the default) the file is opened in binary mode.  On
    some operating systems, this makes no difference.

NOTE: Keep the return value for as long as you want your file to exist !
    """
    import dill as pickle
    import tempfile
    file = tempfile.NamedTemporaryFile(**kwds)
    pickle.dump(object, file)
    file.flush()
    return file
Example #10
def build_regression(transformer, limit=1000000, db_name=db.DB_NAME):
    conn=sqlite3.connect(db_name)
    c=conn.cursor()
    c.execute("select * from match_data limit "+str(limit))
    alldata=c.fetchall()
    train_data1=list(map(itemgetter(slice(1,6)),alldata))
    train_data2=list(map(itemgetter(slice(6,11)),alldata))
    train_wins=list(map(itemgetter(11), alldata))
    del alldata
    params=[(i,j) for i in range(0,6) for j in range(0,6) if i+j>=4 and i<=j]
    clf={k:LogisticRegression(C=20./(k[0]+k[1])**4) for k in params}
    for (i,j) in params:
        train1=[]
        train2=[]
        for row_num in range(len(train_data1)):
            train1.append(sorted(random.sample(train_data1[row_num],i)))
            train2.append(sorted(random.sample(train_data2[row_num],j)))
        clf[(i,j)].fit(list(map(matrixitemgetter(0),transformer.transform(train1,train2))),train_wins)
        print(str(i)+","+str(j))
    with open("clf_all-"+str(int(time())) +".dill", "w+") as f:
        dill.dump(clf, f)
    with open("transf-"+str(int(time())) +".dill", "w+") as f:
        dill.dump(transformer, f)
    print 'transformer and classifier saved'
        
 def save(self, filename):
   # save tf graph
   save_path = self.saver.save(self.session, filename)
   # save class parameters
   filename_class = filename + '.pickle'
   pickle.dump([self.model, self.images], open(filename_class, 'wb'))
   print("WormVision saved to files: %s, %s" % (save_path, filename_class))
Example #12
    def save(self, filename):
        """
        Save model to pickle file. External feature function is not stored
        """
        import dill

        tmpmodelparams = self.modelparams.copy()
        # fv_extern_src = None
        fv_extern_name = None
        # try:
        #     fv_extern_src = dill.source.getsource(tmpmodelparams['fv_extern'])
        #     tmpmodelparams.pop('fv_extern')
        # except:
        #     pass

        # fv_extern_name = dill.source.getname(tmpmodelparams['fv_extern'])
        if "fv_extern" in tmpmodelparams:
            tmpmodelparams.pop("fv_extern")

        sv = {
            "modelparams": tmpmodelparams,
            "mdl": self.mdl,
            # 'fv_extern_src': fv_extern_src,
            # 'fv_extern_src_name': fv_extern_src_name,
            # 'fv_extern_name': fv_extern_src_name,
            #
        }
        sss = dill.dumps(self.modelparams)
        logger.debug("pickled " + str(sss))

        dill.dump(sv, open(filename, "wb"))
def main():
    data = pickle.load(open("reuters_raw.pickle", "rb"))
    corpus = list()
    titles = list()
    topics = list()
    days = list()
    doc_grp_id = list()
    start_date = data[0][0]["date"][0].split("-")[:-1]
    cumulative_months, Months_lists = get_months()
    j = 1
    for article in data:
        if "text" in article[0] and "topics" in article[0]:
            topic = article[0]["topics"]
            if len(topic) != 1:
                continue
            for piece in article[0]["text"]:
                if isinstance(piece, dict) and "body" in piece:
                    # print "hell"
                    titles.append(piece["title"][0])
                    corpus.append("".join(piece["body"]))
                    topics.append(topic[0]["d"][0])
                    date = article[0]["date"][0].split("-")[:-1]
                    days.append(date_convert(date, start_date, cumulative_months, Months_lists))
                    doc_grp_id.append(j)
                    j += 1
    print(len(corpus))
    pickle.dump((corpus, topics, titles, days, doc_grp_id), open("reuters.pickle", "wb"))
Example #14
 def createFileList(self):
     """SRTM data is split into different directories, get a list of all of
         them and create a dictionary for easy lookup."""
     if self.protocol == "ftp":
         ftp = ftplib.FTP(self.server)
         try:
             ftp.login()
             ftp.cwd(self.directory)
             continents = ftp.nlst()
             for continent in continents:
                 print "Downloading file list for", continent
                 ftp.cwd(self.directory+"/"+continent)
                 files = ftp.nlst()
                 for filename in files:
                     self.filelist[self.parseFilename(filename)] = (
                             continent, filename)
         finally:
             ftp.close()
         # Add meta info
         self.filelist["server"] = self.server
         self.filelist["directory"] = self.directory
         with open(self.filelist_file , 'wb') as output:
             pickle.dump(self.filelist, output)
     else:
         self.createFileListHTTP()
def do_evaluation(LAss, opt, X, Z, TX, TZ, test_labels, train_labels, counter):
    """
    Evaluates opt on a certain parameter set.
    Parameters
    ----------
    LAss : ValidationLabAssistant
        The LAss to use for updates.
    opt : string
        The name of the experiment to use here.
    X, Z : matrix
        Feature and Target matrices of the training set, one-hot encoded.
    TX, TZ : matrix
        Feature and Target matrices of the validation set, one-hot encoded.
    """
    to_eval = LAss.get_next_candidate(opt)
    step_rate = to_eval.params["step_rate"]
    momentum = to_eval.params["momentum"]
    decay = to_eval.params["decay"]
    c_wd = to_eval.params["c_wd"]
    print(opt, step_rate, momentum, decay, c_wd)
    with open("apsis_pars_"+opt+str(counter)+".pkl", 'wb') as fp:
        dill.dump((LAss, opt, step_rate, momentum, decay, c_wd, counter, 0, 0), fp)
    result, n_iter = do_one_eval(X, Z, TX, TZ,test_labels, train_labels, step_rate, momentum, decay, c_wd, counter, opt)
    to_eval.result = result
    LAss.update(opt, to_eval)
    with open("apsis_pars_"+opt+str(counter)+".pkl", 'wb') as fp:
        dill.dump((LAss, opt, step_rate, momentum, decay, c_wd, counter, n_iter, result), fp)
    def create_reverse_index(self, documents_filename, common_words_filename):

        # Load reverse index if already exists, create it (and save it) otherwise.
        reverse_index_file = self.save_folder_path + self.ponderation_name + '.rev'
        if os.path.isfile(reverse_index_file):
            print('Loading reverse index...', end=' ')
            with open(reverse_index_file, 'rb') as in_strm:
                reverse_index = dill.load(in_strm)
            print('done')
        else:
            print('Loading raw documents...', end=' ')
            # Parse the documents
            Parser = Parse_cacm('sources/cacm.all', 'sources/common_words')
            index = Parser.parse_file()
            print('done')

            print('Creating reverse index...', end=' ')
            reverse_index = self.ponderation_method(index)
            reverse_index.other_infos['ponderation_method'] = self.ponderation_name
            reverse_index.other_infos['number_of_documents'] = len(index)

            with open(reverse_index_file, 'wb') as output:
                dill.dump(reverse_index, output, dill.HIGHEST_PROTOCOL)
            print('done')

        return reverse_index
Example #17
        def __init__(self, process_obj):
            # create pipe for communication with child
            r, w = os.pipe()

            # get handle for read end of the pipe and make it inheritable
            rhandle = msvcrt.get_osfhandle(r)
            win32.SetHandleInformation(rhandle, win32.HANDLE_FLAG_INHERIT, win32.HANDLE_FLAG_INHERIT)

            # start process
            cmd = getCommandLine() + [rhandle]
            cmd = " ".join('"%s"' % x for x in cmd)
            hp, ht, pid, tid = _subprocess.CreateProcess(sys.executable, cmd, None, None, 1, 0, None, None, None)
            os.close(r)
            ht.Close()

            # set attributes of self
            self.pid = pid
            self.returncode = None
            self._handle = hp

            # send information to child
            prep_data = getPreparationData(process_obj._name)
            to_child = os.fdopen(w, "wb")
            tls.is_spawning = True
            try:
                dump(prep_data, to_child, HIGHEST_PROTOCOL)
                dump(process_obj, to_child, HIGHEST_PROTOCOL)
            finally:
                tls.is_spawning = False
                to_child.close()
Example #18
def write(obj, filename):
    os.chdir(os.path.abspath(os.path.join(os.path.dirname(__file__))))
    filename = os.path.join(os.getcwd(), filename)
    
    with open(filename, 'wb') as output_:
        dill.dump(obj, output_)
        print "Object saved to file: ", filename
Example #19
def dump_trials(trials, path):

    print('Size of object: ' + str(len(trials)))
    print('Dumping')
    file = open(path, 'wb')
    cPickle.dump(trials, file)
    file.close()
 def hotstart(self, FU, model, tTransformer1, tTransformer2=None, m_root='../models'):
     """Aggregate hotstart chunk & train FeatureUnion and Model.  
     Requires pre-fit target transformer"""
     self.m_root = m_root
     print('Extracting...')
     train = []
     for i in self.hotstart_idx:
         filename = self.chunklist[i]
         zf = zipfile.ZipFile('{}/{}'.format(self.d_root,filename))
         train += json.loads(zf.read(zf.namelist()[0]))['results']
     self.total_records += len(train)
     print('Transforming Target...')
     # grab hotstart target
     self.tTransformer1 = tTransformer1
     self.tTransformer2 = tTransformer2
     if tTransformer2 is not None:
         target = self.tTransformer1.transform(train)
         target = self.tTransformer2.transform(target)
     else:
         target = self.tTransformer1.transform(train)
     print('Extracting Features...')
     # fit, then transform features
     self.FU_fitted = FU.fit(train,0)
     features = self.FU_fitted.transform(train)
     # initialize and fit the model
     print "Modeling..."
     self.model = model
     self.model.fit(features, target)
     # pickle-it
     with open('{}/{}_hotstart.pkl'.format(m_root,self.modelname), 'wb') as output:
         pickle.dump(self.model, output)
     print('Done!  {} records processed'.format(self.total_records))
     print('Pickled as {}_hotstart'.format(self.modelname))
 def persist(self):
     pkl_file_path = os.path.join(self.store_dir, "test_stored_env.pkl")
     dill.settings['byref'] = True
     
     with open(pkl_file_path, 'wb') as f:
         dill.detect.trace(True)
         dill.dump(self, f)
Example #22
def save_default_values(dsp, path):
    """
    Write Dispatcher default values in Python pickle format.

    Pickles are a serialized byte stream of a Python object.
    This format will preserve Python objects used as nodes or edges.

    :param dsp:
        A dispatcher that identifies the model adopted.
    :type dsp: schedula.Dispatcher

    :param path:
        File or filename to write.
        File names ending in .gz or .bz2 will be compressed.
    :type path: str, file

    .. testsetup::
        >>> from tempfile import mkstemp
        >>> file_name = mkstemp()[1]

    Example::

        >>> from schedula import Dispatcher
        >>> dsp = Dispatcher()
        >>> dsp.add_data('a', default_value=1)
        'a'
        >>> dsp.add_function(function=max, inputs=['a', 'b'], outputs=['c'])
        'max'
        >>> save_default_values(dsp, file_name)
    """
    import dill
    with open(path, 'wb') as f:
        dill.dump(dsp.default_values, f)
Example #23
def send_tasks(server, futures_chunk): 
    print('Sending tasks to %s' % server)
    with open('%s_tasks.pkl' % server, 'wb') as taskfile:
        pickle.dump(futures_chunk, taskfile)
    # Copy tasks file to server
    subprocess.call(['scp', '%s_tasks.pkl' % server,  '%s:' % server])
    os.remove('%s_tasks.pkl' % server)
    def __init__(self, args, siteInfo):
        self.args = args
        self.ontology = Ontology()
        self.histMark = siteInfo.histMark
        self.assayType = siteInfo.assayType

        def dd():
            return defaultdict(None)
        self.byAssemblyAssays = defaultdict(dd)

        allAssays = ["BothDNaseAnd" + self.histMark, self.histMark, "DNase"]

        print("WebEpigenomesLoader:", self.histMark, self.assayType)
        if 0:
            self._generate(allAssays)
            fn = "webEpigenomesLoader.byAssemblyAssays.{histMark}.dill".format(histMark=self.histMark)
            # outFnp = "/data/projects/encode/encyclopedia_v3/
            outFnp = os.path.join(os.path.dirname(__file__), "../../../", fn)
            with open(outFnp, 'wb') as f:
                dill.dump(self.byAssemblyAssays, f)
            print("wrote", outFnp)
        else:
            fn = "webEpigenomesLoader.byAssemblyAssays.{histMark}.dill".format(histMark=self.histMark)
            # outFnp = "/data/projects/encode/encyclopedia_v3/
            outFnp = os.path.join(os.path.dirname(__file__), "../../../", fn)
            with open(outFnp, 'rb') as f:
                self.byAssemblyAssays = dill.load(f)
            print("read", outFnp)
def sub():
	while(1):
		X, data_cor_cid = main()
		if( data_cor_cid > 50):
			pickle.dump(X, open('Y.pickle', 'wb'))
			print(X, data_cor_cid)
			break
Example #26
def mh_wrapper(pc, expt, data_params, inference_params, return_stats=False, 
               results_folder='./', log_folder='./', specified_start=None):
    print(datetime.datetime.now())
    print_save_params(data_params, inference_params, results_folder)
    data = wrapper.get_data(pc, expt, data_params)
    data = wrapper.process_data(data, inference_params)
    # 1/0
    if inference_params.meta_noise_type == "beta":
        data['metas'] = [np.clip(m, .01, .99) for m in data['metas']]
    if expt == "generate":
        data.save_all(results_folder)
        dill.dump(data, open(results_folder + 'data.pkl', 'wb'))
    if pc == "mac":
        parallel = False
    else:
        parallel = True
    lview = setup_parallel(parallel)
    if inference_params.separate:
        init_state = wrapper.make_init_for_mh(inference_params.vars_to_init,
                                              data, specified_start)
        q_results = wrapper.mh_separate(data['beliefs'], data['metas'],
                                        inference_params, lview, init_state, 
                                        results_folder, log_folder, return_stats)
        indiv_results = None
    else:
        if inference_params.num_chains > 1:
            sys.exit("Multiple chains not implemented for non-separate qs")
        results = wrapper.mh_across(data['beliefs'], data['metas'], 
                                    inference_params, lview, results_folder,
                                    log_folder, return_stats)
        q_results, indiv_results = results
    print(datetime.datetime.now())
    return q_results, indiv_results, data
Example #27
	def add_tournament_replay(self, replay_file_path):
		# Ignore duplicate replays
		replay_id = hashlib.sha1(
			open(replay_file_path, mode = 'rb').read()
			).hexdigest()

		if replay_id in self.replay_ids:
			self.logger.warning('Replay already exists in database - {}'.format(replay_file_path))
			return

		replay = self.parser.load_replay(replay_file_path)
		if not replay:
			return

		self.logger.warning('Adding - {}'.format(replay_file_path))
		self.replay_ids.add(replay_id)

		# Add new information to the tree
		for player in replay.players:
			node_kwargs = {
			'player_name' : player.name,
			'player_race' : player.play_race,
			'player_url' : player.url,
			'hotkey_info' : self.parser.extract_hotkey_info(replay, player),
			}
			node = Node(**node_kwargs)
			self.tree_nodes.append(node)
			self.tree.add(node)

		# Serialize
		with open(constants.PATH_REPLAY_IDS, mode = 'wb') as f:
			pickle.dump(self.replay_ids, f)

		with open(constants.PATH_TREE_NODES, mode = 'wb') as f:
			pickle.dump(self.tree_nodes, f)
def main():
    if len(sys.argv) != 3:
        usage()

    gtf, fa = sys.argv[1:3]

    try:
        with open('gencode_transcript_ids.pkl', 'rb') as pklf:
            transcript_ids = dill.load(pklf)
    except IOError:
        transcript_ids = set()
        gtf_file = HTSeq.GFF_Reader(gtf)
        for feature in gtf_file:
            if feature.type == 'transcript':
                transcript_ids.add(feature.attr['transcript_id'])
        with open('gencode_transcript_ids.pkl', 'wb') as pklf:
            dill.dump(transcript_ids, pklf)

    print('\n\n### Finished reading in GTF ###\n\n')

    fa_file = HTSeq.FastaReader(fa)
    fa_fout = open('output.fa', 'w')
    for fa in fa_file:
        ids = fa.name.split('|')
        if ids[0] in transcript_ids:
            fa.write_to_fasta_file(fa_fout)
    fa_fout.close()
def get_price_history(pris_id, produkt_id):
	fname = price_history_path + 'price_history_{0}_{1}.dill'.format(str(pris_id), str(produkt_id))
	if not os.path.exists(fname):
		delayer.delay()
		t = str(int(float(time.time())*1000)) 			# time stamp
		ID = str(int(int(produkt_id)/79))[:3]				# bullshit ID
		url = 'https://www.prisjakt.nu/ajax/jsonajaxserver.php?m=get_prod_prishist&p={"pris_id":' + str(pris_id) + ',"produkt_id":' + str(produkt_id) + '}&t=' + t + '&id=' + ID

		goon = True
		n_tries = 0
		while goon and n_tries < 10:
			try:
				r = requests.get(url)
				goon = False
			except:
				print('Error number {0} in price history page'.format(str(n_tries + 1)))
				sleep(30)
				n_tries = n_tries + 1

		with open(fname, 'wb') as out_file:
			dill.dump(r, out_file)
	else:
		with open(fname, 'rb') as in_file:
			r = dill.load(in_file)
	return r
Example #30
def commit():
    """
    Writes the modified state to the currently being modified save file.
    """
    print('committing changes')
    with open(os.path.join('save', save + '.sav'), 'wb') as saveFile:
        dill.dump(state, saveFile) 
def get_summaries(title_url_map,
                  out_name,
                  use_pickled=False,
                  archived=False,
                  update_old=False,
                  save_every=5,
                  sleep=0):
    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(
            len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()

    for title, url in title_url_map.items():
        title = title.replace("DeerSlayer", 'Deerslayer', 1)
        if title in done:
            continue
        if sleep:
            time.sleep(sleep)
        author = ''  # TODO: figure this out
        archived_local = archived
        if archived:
            orig_url = url
            url = get_archived(url, update_old)
        print('processing', title, url)
        soup = get_soup(url, sleep=SLEEP)
        table = soup.find('div', id='block-booknavigation-3') or soup.find(
            'div', id='block-block-4')

        # process plot summary
        plot_summ = None
        plot_cell = table.find('a', href=RE_PLOT_LINK)
        if plot_cell:
            plot_title = plot_cell.get_text()
            href = plot_cell['href']
            if archived:
                plot_link = get_orig_url(href)
                plot_link = get_archived(plot_link, update_old)
                if 'archive.org' not in plot_link:  # failed to retrieve archived version
                    # archived versions of 'the-mayor-of-casterbridge' seem to be corrupted
                    time.sleep(5.0)
                    archived_local = False
            else:
                plot_link = urllib.parse.urljoin(url, href)
            if 'Chapter' not in plot_title:
                plot_summ = process_plot(plot_link)
            if not plot_summ:
                print('  no plot summary found', plot_link)

        # process section summaries
        cells = table.find_all('a', href=RE_SUMM_LINK)
        if title == "The Brothers Karamazov":
            cells = sort_cells(cells)
        section_summs = []

        if not cells:
            print('  no section links found for', url)
            continue

        seen_sects = set()
        for c in cells:
            section_title = get_clean_text(c)
            section_title_chap = section_title.rsplit(':', 1)[-1]
            if section_title_chap in seen_sects:
                print('  seen {} already, skipped'.format(section_title_chap))
                continue
            if re.match(RE_PLOT, section_title):
                continue

            if archived and archived_local:
                link_summ = get_orig_url(c['href'])
                link_summ = get_archived(link_summ, update_old)
            else:
                link_summ = urllib.parse.urljoin(url, c['href'])

            try:
                page_summs = process_story(link_summ)
            except AttributeError:  # page failed to load, try again
                print('  retrying after 5 seconds...')
                time.sleep(5.0)
                page_summs = process_story(link_summ)

            if page_summs:
                section_summs.extend(page_summs)
                seen_sects.add(section_title_chap)
        if not section_summs:
            print('  could not find summaries for {}'.format(title))
            continue
        book_summ = BookSummary(title=title,
                                author=author,
                                genre=None,
                                plot_overview=plot_summ,
                                source='novelguide',
                                section_summaries=section_summs)
        book_summaries.append(book_summ)
        num_books = len(book_summaries)
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))

    print('Scraped {} books from novelguide'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
Example #32
other_arguments = other_default_arguments()
#tune_settings_dict = tuning_settings([],[],[],[])
tune_settings_dict = tuning_settings(dual_args_list,[],adapt_cov_arguments,other_arguments)
tune_dict  = tuneinput_class(input_dict).singleton_tune_dict()

sampler1 = mcmc_sampler(tune_dict=tune_dict,mcmc_settings_dict=mcmc_meta,tune_settings_dict=tune_settings_dict)


store_name = 'normal_fc1_sampler.pkl'
sampled = False
if sampled:
    sampler1 = pickle.load(open(store_name, 'rb'))
else:
    sampler1.start_sampling()
    with open(store_name, 'wb') as f:
        pickle.dump(sampler1, f)
#out = sampler1.start_sampling()


mcmc_samples_hidden_in = sampler1.get_samples_alt(prior_obj_name="hidden_in",permuted=False)
print(mcmc_samples_hidden_in["samples"].shape)

print(mcmc_samples_hidden_in["samples"][0,10,5])
print(mcmc_samples_hidden_in["samples"][1,10,5])
#exit()
mcmc_samples_hidden_out = sampler1.get_samples_alt(prior_obj_name="hidden_out",permuted=False)

#print(mcmc_samples_beta["indices_dict"])
#exit()

samples = mcmc_samples_hidden_in["samples"]
def matching(dillpath, n_iter):
    # Load the data
    with open("tmp/dills/" + dillpath + "parsed_test.dill", "rb") as f:
        prunned = dill.load(f)
    keys = prunned.keys()
    size = 0  # data dimension: 2 * number of objects
    goal = 0  # step index of the terminal state
    datas = {}
    for filename in keys:
        datas[filename] = []
        goal = prunned[filename][-1]
        logTestName = dillpath[11]
        filepath = "tmp/log_test_" + logTestName + "/" + filename + ".csv"
        with open(filepath, "r", encoding="utf-8") as f:
            while True:
                line = f.readline().split(",")
                if len(line) < 2:
                    break
                if size == 0:
                    size = len(line[5:-1])
                datas[filename].append([float(l) for l in line[5:-1]])

    # Estimate the common boundaries
    output = {}
    before = {}
    for filename in keys:
        output[filename] = []
        before[filename] = prunned[filename][0]
    while True:
        # Record the current "before" step of each file
        for filename in keys:
            output[filename].append(before[filename])
        after = {}
        for fn in keys:
            # smallest step greater than before+e;
            # if no boundary exists after before+e, use the terminal state
            later = [s for s in prunned[fn] if s > before[fn]+e \
                and isDefferent(datas[fn][before[fn]], datas[fn][s])]
            if len(later) > 0:
                after[fn] = later[0]
            else:
                after[fn] = prunned[fn][-1]

        for filename in before.keys():
            print(str(before[filename]) + "\t--> " + str(after[filename]))
        # sleep(10)

        # Termination condition
        flagList = [a == goal for a in after.values()]
        flag = reduce(lambda x, y: x and y, flagList)
        if flag == True:
            # Emit the final result
            for filename in keys:
                output[filename].append(after[filename])
            break

        # ---------------------------------------------------
        # In short, the mistake was doing this part only once:
        # - obtain predict by sampling
        # - set after = predict and sample again
        # - repeat until predict == after
        # - append predict to output
        # - before <- predict

        while True:
            # run the sampling-based fit n_iter times
            modelList = []
            for _ in range(n_iter):

                # sample size files and solve the linear system
                keyList = list(keys)
                np.random.shuffle(keyList)
                X = []
                y = []
                for k in keyList[:size]:
                    X.append(datas[k][before[k]])
                    y.append(datas[k][after[k]])

                # note: the transpose here had been forgotten earlier

                X = np.array(X).T
                y = np.array(y).T

                Xinv = np.linalg.inv(X)
                A = y.dot(Xinv)

                # measure how well the solved A reproduces all the data
                res = 0
                for k in keys:
                    b = np.array(datas[k][before[k]])
                    a = np.array(datas[k][after[k]])
                    r = A.dot(b)
                    res += np.linalg.norm(r - a)
                modelList.append((A, 1.0 / res))

            sumexp = sum([m[1] for m in modelList])

            # predict the next state
            predict = {}
            for filename in keys:
                predict[filename] = []
                for m in modelList:
                    beforeData = datas[filename][before[filename]]
                    beforeData = np.array(beforeData)
                    predict[filename].append(m[0].dot(beforeData))
                    predict[filename][-1] *= m[1] / sumexp
                predict[filename] = sum(predict[filename])

            # pick the state in datas closest to the prediction
            selected = {}
            for filename in keys:
                # compute the distance to the state at each step
                p = np.array(predict[filename])
                d = datas[filename]
                distList = [np.linalg.norm(np.array(l) - p) for l in d]
                # only consider steps after the "before" step
                distList = distList[before[filename] + e:]
                # if after was set to 499 by the else branch when it was chosen,
                # [before+e:] may have no elements; in that case
                # just keep before as-is
                if len(distList) == 0:
                    selected[filename] = before[filename]
                    continue
                selected[filename] = distList.index(min(distList))
                # add back the before+e steps that were skipped
                selected[filename] += before[filename] + e

            # break if after == selected
            if after == selected:
                break
            # set after = predict and go back to the top
            after = selected

        # ---------------------------------------------------

        # before <- selected
        before = selected

    with open("tmp/dills/" + dillpath + "matching.dill", "wb") as f:
        dill.dump(output, f)

    return output
    f.addSentence(["今日", "も", "また", "人", "が", "死んだよ"])
    f.addSentence(["今日", "も", "また", "雨", "が", "降ったよ"])
    f.toPrint()
    f.eliminateSentence(["今日", "も", "また", "雨", "が", "降ったよ"])
    f.toPrint()

    test = ["今日", "も", "また", "雨", "が", "降ったよ"]
    for i in range(1, 11):
        print(f.changeBoundary(test, i))
    """

    data = {}
    data["1"] = ["りんごぶどうみかんばななもも"]
    data["2"] = ["ももみかんばななりんごぶどう"]
    data["3"] = ["ぶどうばななみかんりんごもも"]
    data["4"] = ["ばななりんごみかんももぶどう"]
    data["5"] = ["みかんももばななりんごぶどう"]

    # print(f.reverseSentences(data))

    for i in range(1):
        with open("tmp/RefactedRest_result.dill", "wb") as g:
            ptime = datetime.now().timestamp()
            dill.dump(f.executeParsing(data, 300), g)
            ptime = datetime.now().timestamp() - ptime

    f.toPrint()
    res = f.debug_result()
    for r in res:
        print(r + "\t\t: " + str(res[r]))
Example #35
def main(**kwargs):
    logger.info("Your params:")
    logger.info(kwargs)

    # check compatibility if training is continued from previously saved model
    if kwargs['init_from'] is not None:
        logger.info("Check if I can restore model from {0}".format(kwargs['init_from']))
        # check if all necessary files exist
        assert os.path.isdir(kwargs['init_from']), "%s must be a path" % kwargs['init_from']
        assert os.path.isfile(os.path.join(kwargs['init_from'], "config.pkl")), "config.pkl file does not exist in path %s" % kwargs['init_from']
        assert os.path.isfile(os.path.join(kwargs['init_from'], "textdata.pkl")), "textdata.pkl file does not exist in path %s" % kwargs['init_from']
        ckpt = tf.train.get_checkpoint_state(kwargs['init_from'])
        assert ckpt, "No checkpoint found"
        assert ckpt.model_checkpoint_path, "No model path found in checkpoint"

        # open old config and check if models are compatible
        with open(os.path.join(kwargs['init_from'], 'config.pkl'), 'rb') as f:
            saved_model_args = dill.load(f)
            need_be_same = ["cell_type", "num_hidden", "num_layers", "num_samples", "max_vocab_size"]
            for checkme in need_be_same:
                assert saved_model_args[checkme] == kwargs[checkme], "Command line argument and saved model disagree on '%s' " % checkme
        logger.info("Args checker. Load TextData")
        # open saved TextData
        textdata = TextData.load(os.path.join(kwargs['init_from'], 'textdata.pkl'))
    else:
        textdata = TextData(kwargs['data_path'], max_len=kwargs['max_len'], max_vocab_size=kwargs['max_vocab_size'])

    logger.info("Save config and textdata.")
    with open(os.path.join(kwargs['save_dir'], 'config.pkl'), 'wb') as f:
        dill.dump(kwargs, f)
    TextData.save(textdata, os.path.join(kwargs['save_dir'], 'textdata.pkl'))

    # Make triples.
    logger.info("Making triples")
    triples = textdata.make_triples(textdata.dataset)
    logger.info("Number of triples: {0}".format(len(triples[0])))
    decay_steps = len(triples[0])
    vocab_size = len(textdata.vocab)
    logger.info("actual vocab_size={0}".format(vocab_size))

    model = SkipthoughtModel(kwargs['cell_type'], kwargs['num_hidden'], kwargs['num_layers'],
                             kwargs['embedding_size'], vocab_size, kwargs['learning_rate'],
                             kwargs['decay_rate'], decay_steps, kwargs['grad_clip'],
                             kwargs['num_samples'], kwargs['max_len'])

    with tf.Session() as sess:
        init = tf.initialize_all_variables()
        sess.run(init)
        saver = tf.train.Saver(tf.all_variables(), max_to_keep=20)

        if kwargs['init_from'] is not None:
            saver.restore(sess, ckpt.model_checkpoint_path)
            print("Restored from {0}".format(ckpt.model_checkpoint_path))

        num_batches = len(triples[0])//kwargs['batch_size']
        loss_history = []
        for e in range(kwargs['num_epochs']):
            it = textdata.triples_data_iterator(triples[0], triples[1], triples[2],
                                                textdata.max_len, kwargs['batch_size'], shuffle=True)
            for b, batch in enumerate(it):
                train_op, loss, feed_dict = model.train_step(*batch)

                start_time = time.time()
                batch_loss, _ = sess.run([loss, train_op], feed_dict=feed_dict)
                batch_perplexity = math.exp(float(batch_loss)) if batch_loss < 300 else float("inf")
                end_time = time.time()

                loss_history.append(batch_loss)
                if b % kwargs['verbose'] == 0:
                    print(
                        "{}/{} (epoch {}), train_loss = {:.3f}, perplexity = {:.3f}, time/batch = {:.3f}"
                        .format(e * num_batches + b,
                                kwargs['num_epochs'] * num_batches,
                                e, batch_loss, batch_perplexity, end_time - start_time))
                if (e * num_batches + b) % kwargs['save_every'] == 0 \
                        or (e == kwargs['num_epochs']-1 and b == num_batches-1): # save for the last result
                    checkpoint_path = os.path.join(kwargs['save_dir'], 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=e * num_batches + b)
                    with open(os.path.join(kwargs['save_dir'], 'loss_history.pkl'), 'wb') as f:
                        dill.dump(loss_history, f)
                    print("model & loss_history saved to {}".format(checkpoint_path))
Example #36
                                                         train=False,
                                                         transform=transform),
                                          batch_size=args.test_batch_size,
                                          shuffle=True,
                                          **kwargs)

# Construct fingerprint patterns

# Choose xs
fp_dx = [np.random.rand(1, 1, 28, 28) * args.eps for i in range(args.num_dx)]
# fp_dx = [np.zeros(1,1,28,28)*args.eps for i in range(args.num_dx)]

# for i in range(args.num_dx):
#     k,l = random.randint(0,27), random.randint(0,27)

pickle.dump(fp_dx, open(os.path.join(args.log_dir, "fp_inputs_dx.pkl"), "wb"))

# Target ys
# num_target_classes x num_perturb x num_class
fp_target = -0.2357 * np.ones((args.num_class, args.num_dx, args.num_class))

for j in range(args.num_dx):
    for i in range(args.num_class):
        fp_target[i, j, i] = 0.7

pickle.dump(fp_target, open(os.path.join(args.log_dir, "fp_outputs.pkl"),
                            "wb"))

fp_target = util.np2var(fp_target, args.cuda)

fp = Fingerprints()
Example #37
def save(file, data, format=None, overwrite=False):
    """Save `data`.
    First the function checks if :param:data defines a `save()` method; if so,
    the method is called as `save(output_path)`. If this is successful, the
    function terminates.
    If the call is not successful, or :param:data does not define a `save()`
    method, then the function attempts to save to the formats defined by
    `format`. By default, only the 'numpy_repr' representation is saved,
    if `data` defines a numpy representation.
    Not only is the numpy representation format more future-proof, it can be an
    order of magnitude more compact.
    If the numpy_repr save is unsuccessful (possibly because `data` does not provide a
    `numpy_repr` method), then `save()` falls back to saving a plain (dill) pickle of 'data'.

    Parameters
    ----------
    file: str
        Path name or file object. Note that the file extension is mostly
        ignored and will be replaced by the one associated with the format.
        This is to allow saving to multiple formats.
    data: Python object
        Data to save
    format: str
        The format in which to save the data. Possible values are:
          - 'npr' (default) Save with the numpy_repr format. This is obtained by calling the
            method 'nprepr' on the `data`. If this call fails, a warning is issued
            and the 'dill' format is used.
            Output file have the extension 'npr'.
            Objects using this format should implement the `from_nprepr` method.
          - 'repr' Call `repr` on the data and save the resulting string to file. The save will
            fail (and fall back to 'dill' format) if the `repr` is simply inherited from object,
            as simply saving the object address is not useful for reconstructing it. Still, there
            is no way of ensuring that the `repr` is sufficiently informative to reconstruct the
            object, so make sure it is before using this format.
            Output file have the extension 'repr'.
            Objects using this format should implement the `from_repr` method.
          - 'dill' A dill pickle.
            Output file has the extension 'dill'
        Formats can also be combined as e.g. 'npr+dill'.
    overwrite: bool
        If True, allow overwriting previously saved files. Default is false, in
        which case a number is appended to the filename to make it unique.

    Returns
    -------
    List of output paths.
        List because many formats may be specified, leading to multiple outputs.
    """
    if isinstance(format, str):
        selected_formats = format
    else:
        if format is None:
            typename = find_registered_typename(type(data))
        else:
            if not isinstance(format, type):
                logger.error("The `format` argument should be either a string "
                             "or type. Provided value: {}"
                             "Attempting to infer type from data".format(format))
                typename = find_registered_typename(type(data))
            typename = find_registered_typename(format)
        if typename in _format_types:
            format = _format_types[typename]
        else:
            logger.error("Type '{}' has no associated format".format(typename))
            format = 'npr'

    selected_formats = set(format.split('+'))

    # Check argument - format
    bad_formats = [f for f in selected_formats if f not in defined_formats]
    selected_formats = selected_formats.difference(bad_formats)
    if len(bad_formats) > 0:
        format_names = ["'" + f + "'" for f in defined_formats]
        bad_format_names = ["'" + f + "'" for f in bad_formats]
        formatstr = "format"
        if len(format_names) > 1:
            format_names = ", ".join(format_names[:-1]) + " and " + format_names[-1]
        if len(bad_format_names) > 1:
            formatstr = "formats"
            bad_format_names = ", ".join(bad_format_names[:-1]) + " and " + bad_format_names[-1]
        logger.warning("Unrecognized save {} {}.".format(formatstr, bad_format_names)
                       + "Recognized formats are " + format_names)
        if len(selected_formats) == 0:
            logger.warning("Setting the format to {}.".format_names)
            # We don't want to throw away the result of a long calculation because of a
            # flag error, so instead we will try to save into every format and let the user
            # sort out the files later.
            format = '+'.join(format_names)

    get_output = None
    def set_str_file(filename):
        nonlocal get_output
        def _get_output(filename, ext, bytes, overwrite):
            return output(filename, ext, bytes, overwrite)
        get_output = _get_output

    # Check argument - file
    if isinstance(file, io.IOBase):
        thisfilename = os.path.realpath(file.name)
        if 'luigi' in os.path.basename(thisfilename):
            # 'file' is actually a Luigi temporary file
            luigi = True
        else:
            luigi = False
        filename = thisfilename  # thisfilename used to avoid name clashes
        if not any(c in file.mode for c in ['w', 'x', 'a', '+']):
            logger.warning("File {} not open for writing; closing and reopening.")
            file.close()
            set_str_file(thisfilename)
        else:
            def _get_output(filename, ext, bytes, overwrite):
                # Check that the file object is compatible with the arguments,
                # and if succesful, just return the file object unmodified.
                # If it is not successful, revert to opening a file as though
                # a filename was passed to `save`.
                # TODO: Put checks in `dummy_file_context`
                fail = False
                if (os.path.splitext(os.path.realpath(filename))[0]
                    != os.path.splitext(os.path.realpath(thisfilename))[0]):
                    logger.warning("[iotools.save] Given filename and file object differ.")
                    fail = True
                thisext = os.path.splitext(thisfilename)[1].strip('.')
                if not luigi and thisext != ext.strip('.'):
                    # Luigi adds 'luigi' to extensions of temporary files; we
                    # don't want that to trigger closing the file
                    logger.warning("[iotools.save] File object has wrong extension.")
                    fail = True
                if (bytes and 'b' not in file.mode
                    or not bytes and 'b' in file.mode):
                    if luigi:
                        # Luigi's LocalTarget always saves to bytes, and it's
                        # the Format class that takes care of converting data
                        # (possibly text) to and back from bytes.
                        logger.warning("\n"
                            "WARNING [iotools]: Attempted to save a 'luigi' target with the wrong "
                            "mode (binary or text). Note that Luigi targets "
                            "always use the same mode internally; use the "
                            "`format` argument to convert to/from in your code. "
                            "In particular, LocalTarget writes in binary. "
                            "Consequently, the file will not be saved as {}, "
                            "but as {}; specify the correct value to `bytes` "
                            "to avoid this message.\n"
                            .format("bytes" if bytes else "text",
                                    "text" if bytes else "bytes"))
                    else:
                        logger.warning("[iotools.save] File object has incorrect byte mode.")
                        fail = True
                if (overwrite and 'a' in file.mode):
                    # Don't check for `not overwrite`: in that case the damage is already done
                    logger.warning("[iotools.save] File object unable to overwrite.")
                    fail = True
                if fail:
                    logger.warning("[iotools.save] Closing and reopening file object.")
                    file.close()
                    set_str_file(thisfilename)
                    return output(filename, ext, bytes, overwrite)
                else:
                    return dummy_file_context(file)
            get_output = _get_output
    else:
        assert isinstance(file, PathTypes)
        filename = file
        set_str_file(file)

    # Ensure target directory exists
    dirname = os.path.dirname(filename)
    if dirname != "":
        os.makedirs(dirname, exist_ok=True)

    output_paths = []

    # If data provides a "save" method, use that
    # This overrides the "format" argument – only exception is if save fails,
    # then we reset it to what it was and try the other formats
    if isinstance(data, ParameterSet):
        # Special case of data with `save` attribute
        _selected_formats_back = selected_formats
        selected_formats = []  # Don't save to another format if successful
        with get_output(filename, ext="", bytes=False, overwrite=overwrite) as (f, output_path):
            # Close the file since Parameters only accepts urls as filenames
            # FIXME: This introduces a race condition; should use `f` to save
            #        This would require fixing the parameters package to
            #        accept file objects in `save()`
            pass
        try:
            logger.info("Saving ParameterSet using its own `save` method...")
            data.save(output_path, expand_urls=True)
        except (AttributeError, PermissionError) as e:
            logger.warning("Calling the data's `save` method failed with '{}'."
                           .format(str(e)))
            selected_formats = _selected_formats_back
        else:
            output_paths.append(output_path)
    elif hasattr(data, 'save'):
        _selected_formats_back = selected_formats
        selected_formats = []  # Don't save to another format if successful
        # See if this type is in the registered formats, so we can get the
        # expected extension
        typename = find_registered_typename(data)
            # Always returns a type name: if none is found, returns that of data
        format = _format_types.get(typename, None)
        if format is None or format not in defined_formats:
            ext = ""
        else:
            ext = defined_formats[format].ext
        with get_output(filename, ext=ext, bytes=False, overwrite=overwrite) as (f, output_path):
            # TODO: Use `f` if possible, and only `output_path` if it fails.
            pass
        try:
            logger.info("Saving data using its own `save` method...")
            data.save(output_path)
        except (AttributeError, PermissionError) as e:
            logger.warning("Calling the data's `save` method failed with '{}'."
                           .format(str(e)))
            selected_formats = _selected_formats_back
        else:
            output_paths.append(output_path)

    # Save to all specified formats
    for name, formatinfo in defined_formats.items():
        if name in ('npr', 'repr', 'brepr', 'dill'):
            # TODO: Define the save functions below at top level of module
            # and treat these formats as any other
            #       Make sure 'dill' is still used as backup
            continue
        if name in selected_formats:
            if formatinfo.save is None:
                logger.error("Format '{}' does not define a save function"
                             .format(name))
                fail = True
            else:
                fail = False
                ext = formatinfo.ext
                try:
                    with get_output(filename, ext, formatinfo.bytes, overwrite) as (f, output_path):
                        formatinfo.save(f, data)
                except IOError:
                    fail = True
                except Exception as e:
                    logger.error("Silenced uncaught exception during saving process to attempt another format.")
                    logger.error("Silenced exception was: " + str(e))
                    fail = True
                else:
                    output_paths.append(output_path)
            if fail:
                try: os.remove(output_path)  # Ensure there are no leftover files
                except: pass
                logger.warning("Unable to save to {} format."
                               .format(name))
                if 'dill' not in selected_formats:
                    # Warn the user that we will use another format
                    logger.warning("Will try a plain (dill) pickle dump.")
                    selected_formats.add('dill')
    # Save data as numpy representation
    if 'npr' in selected_formats:
        fail = False
        ext = defined_formats['npr'].ext
        try:
            with get_output(filename, ext, True, overwrite) as (f, output_path):
                try:
                    logger.info("Saving data to 'npr' format...")
                    np.savez(f, **data.repr_np)
                except AttributeError:
                    fail = True
                else:
                    output_paths.append(output_path)
        except IOError:
            fail = True
        if fail:
            # TODO: Use custom error type
            try: os.remove(output_path)  # Ensure there are no leftover files
            except: pass
            logger.warning("Unable to save to numpy representation ('npr') format.")
            if 'dill' not in selected_formats:
                # Warn the user that we will use another format
                logger.warning("Will try a plain (dill) pickle dump.")
                selected_formats.add('dill')

    # Save data as representation string ('repr' or 'brepr')
    for format in [format
                   for format in selected_formats
                   if format in ('repr', 'brepr')]:
        bytes = (format == 'brepr')
        fail = False
        if type(data).__repr__ is object.__repr__:
            # Non-informative repr -- abort
            fail = True
        else:
            ext = defined_formats['repr'].ext
            try:
                with get_output(filename, ext=ext, bytes=bytes, overwrite=overwrite) as (f, output_path):
                    try:
                        logger.info("Saving data to plain-text 'repr' format'")
                        f.write(repr(data))
                    except:
                        fail = True
                    else:
                        output_paths.append(output_path)
            except IOError:
                fail = True
        if fail:
            try: os.remove(output_path)  # Ensure there are no leftover files
            except: pass
            logger.warning("Unable to save to numpy representation ('npr') format.")
            if 'dill' not in selected_formats:
                # Warn the user that we will use another format
                logger.warning("Will try a plain (dill) pickle dump.")
                selected_formats.add('dill')

    # Save data in dill format
    if 'dill' in selected_formats:
        ext = defined_formats['dill'].ext
        try:
            with get_output(filename, ext, True, overwrite) as (f, output_path):
                logger.info("Saving data as a dill pickle.")
                dill.dump(data, f)
                output_paths.append(output_path)
        except IOError:
            # There might be other things to save, so don't terminate
            # execution because this save failed
            try: os.remove(output_path)  # Ensure there are no leftover files
            except: pass
            logger.warning("Unable to save picke at location {}."
                           .format(output_path))

    # Return the list of output paths
    return [Path(path) for path in output_paths]
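# A condensed, self-contained sketch of the fallback pattern implemented
# above: try each registered format's save function and, if none succeeds,
# fall back to a plain dill pickle. The tiny `savers` registry is a
# hypothetical stand-in for the richer `defined_formats` entries used above.
import dill

def save_with_fallback(path_stem, data, savers):
    """`savers` maps a file extension to a callable(file_obj, data)."""
    written = []
    for ext, save_fn in savers.items():
        try:
            with open(path_stem + ext, 'wb') as f:
                save_fn(f, data)
            written.append(path_stem + ext)
        except Exception:
            continue  # this format failed; try the next one
    if not written:
        # nothing succeeded: plain dill dump as a last resort
        with open(path_stem + '.dill', 'wb') as f:
            dill.dump(data, f)
        written.append(path_stem + '.dill')
    return written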
Exemple #38
0
                    def handle(self):
                        "Called by TCPServer for each client connection request"
                        try:
                            while True:
                                msg = custompickle.load(self.rfile);
                                #logging.debug("ROProxy {}  {:0.20f}".format(msg, time.time()));

                                #First message from client stub, check if object exists or not.
                                if(msg == ROMessages._INIT_):
                                    robjName = custompickle.load(self.rfile);
                                    #logging.debug("_INIT_ message to look for object {}".format(robjName));
                                    if(ROMgrObj.has(robjName)):
                                        self.obj = ROMgrObj.get(robjName, self);
                                        #On success, send the id of the proxy.
                                        custompickle.dump(id(self), self.wfile); self.wfile.flush();
                                        self._robjName = robjName;
                                    else:
                                        logging.warning("_INIT_ message object {} not found".format(robjName));
                                        custompickle.dump(ROMessages._NOT_FOUND_, self.wfile); self.wfile.flush();
                                #Check if the return should be compressed or not.
                                elif(msg != ROMessages._COMPRESS_):
                                    #logging.debug("RemoteMethod: {} is not a compress directive.".format(msg));
                                    #Request for an attribute
                                    if(msg == ROMessages._GET_ATTRIBUTE_):
                                        item = custompickle.load(self.rfile);
                                        try:
                                            val = self.obj.__getattribute__(item);
                                            custompickle.dump(None,self.wfile); custompickle.dump(val, self.wfile); self.wfile.flush();
                                        except Exception as e:
                                            #An exception occurred; send traceback info to the client stub.
                                            custompickle.dump(sys.exc_info(), self.wfile);self.wfile.flush();
                                    #Regular client stub messages contain the name of the function to be invoked and any arguments.
                                    else:
                                        #logging.debug("ROProxy {} reading args time {:0.20f}".format(msg, time.time()));
                                        args   = custompickle.load(self.rfile); kwargs = custompickle.load(self.rfile);
                                        #logging.debug("ROProxy {} read args time {:0.20f}".format(msg, time.time()));

                                        #Execute the function locally and send back any results/exceptions.
                                        try:
                                            #Execute the local function, store the results.
                                            func = self.obj.__getattribute__(msg);
                                            if(inspect.ismethod(func)):
                                                result = func(*args, **kwargs);
                                                args = kwargs = None;
                                            else: #This is probably a property, in which case we already have the value, return it.
                                                result = func;
                                            #logging.debug("ROProxy {} local result time {:0.20f}".format(msg, time.time()));

                                            #No exception to report.
                                            custompickle.dump(None,self.wfile);#self.wfile.flush();
                                            #logging.debug("ROProxy {} exception send time {:0.20f}".format(msg, time.time()));
                                            #Return the results.
                                            custompickle.dump(result, self.wfile); self.wfile.flush();
                                            #logging.debug("ROProxy {} result send time {:0.20f}".format(msg, time.time()));
                                            #Handshake to make sure this function's scope stays active until the other side has set up any remote object stubs;
                                            #the contents of this message are irrelevant to us.
                                            #NOT REQUIRED: this object reference (result) is alive in this space till next remote function call reaches it.
                                            #custompickle.load(self.rfile);
                                        except Exception as e:
                                            #An exception occurred; send traceback info to the client stub.
                                            custompickle.dump(sys.exc_info(), self.wfile);self.wfile.flush();
                                else:
                                    msg = custompickle.load(self.rfile);
                                    #logging.debug("RemoteMethod : request for compressing {}".format(msg));
                                    #Request for an attribute
                                    if(msg == ROMessages._GET_ATTRIBUTE_):
                                        item = custompickle.load(self.rfile);
                                        try:
                                            val = self.obj.__getattribute__(item);
                                            custompickle.dump(None, self.wfile); self.wfile.flush();
                                            AConfig.NTWKCHANNEL.transmit(val, self.wfile);
                                        except Exception as e:
                                            #An exception occurred; send traceback info to the client stub.
                                            custompickle.dump(sys.exc_info(), self.wfile);self.wfile.flush();
                                    #Regular client stub messages contain the name of the function to be invoked and any arguments.
                                    else:
                                        #logging.debug("ROProxy {} reading args time {:0.20f}".format(msg, time.time()));
                                        args   = custompickle.load(self.rfile); kwargs = custompickle.load(self.rfile);
                                        #logging.debug("ROProxy {} read args time {:0.20f}".format(msg, time.time()));

                                        #Execute the function locally and send back any results/exceptions.
                                        try:
                                            #Execute the local function, store the results.
                                            func = self.obj.__getattribute__(msg);
                                            if(inspect.ismethod(func)):
                                                result = func(*args, **kwargs);
                                                args = kwargs = None;
                                            else: #This is probably a property, in which case we already have the value, return it.
                                                result = func;
                                            #logging.debug("ROProxy {} local result time {:0.20f}".format(msg, time.time()));

                                            #No exception to report.
                                            custompickle.dump(None,self.wfile);self.wfile.flush();
                                            #logging.debug("ROProxy {} exception send time {:0.20f}".format(msg, time.time()));
                                            #Return the results.
                                            AConfig.NTWKCHANNEL.transmit(result, self.wfile);
                                            #logging.debug("ROProxy {} result send time {:0.20f}".format(msg, time.time()));
                                            #Handshake to make sure this function's scope stays active until the other side has set up any remote object stubs;
                                            #the contents of this message are irrelevant to us.
                                            #NOT REQUIRED: this object reference (result) is alive in this space till next remote function call reaches it.
                                            #custompickle.load(self.rfile);
                                        except Exception as e:
                                            #An exception occurred; send traceback info to the client stub.
                                            custompickle.dump(sys.exc_info(), self.wfile);self.wfile.flush();
                                #logging.debug("ROProxy {} exit time {:0.20f}".format(msg, time.time()));

                        except EOFError:
                            pass;
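# Hypothetical client-stub counterpart to the handle() loop above, included
# only to illustrate the wire protocol it implements: the stub writes the
# method name, then args, then kwargs, and reads back an exception marker
# (None on success) followed by the return value. `custompickle` is assumed
# to expose the same dump/load interface used above; `rfile`/`wfile` are the
# client side's socket file objects.
def call_remote(rfile, wfile, method_name, *args, **kwargs):
    custompickle.dump(method_name, wfile)
    custompickle.dump(args, wfile)
    custompickle.dump(kwargs, wfile)
    wfile.flush()

    exc_info = custompickle.load(rfile)  # None means the remote call succeeded
    if exc_info is not None:
        raise exc_info[1]                # re-raise the remote exception locally
    return custompickle.load(rfile)      # the pickled return value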
Exemple #39
0
    def save(self, fnm):
        with open(fnm, "wb") as f:
            pickle.dump((self.serialize(), self.stats), f)
    args = parser.parse_args()
    catalog = load_catalog(CATALOG_NAME)
    if args.full:
        title_set = None
    else:
        print('limiting to books from', CATALOG_NAME)
        title_set = set(catalog.keys())

    if args.archived_list:
        books_list = get_archived(BOOKS_LIST)
    else:
        books_list = BOOKS_LIST
    title_url_map = get_title_url_map(books_list, title_set=title_set)
    print('{} book pages total'.format(len(title_url_map)))
    book_summaries = get_summaries(title_url_map, args.out_name,
                                   args.use_pickled, args.archived,
                                   args.update_old, args.save_every,
                                   args.sleep)
    # with open(args.out_name, 'rb') as f:
    #     book_summaries = pickle.load(f)

    book_summaries_overlap = gen_gutenberg_overlap(book_summaries,
                                                   catalog,
                                                   filter_plays=True)
    book_summaries_overlap = manual_fix(book_summaries_overlap)
    book_summaries_overlap = manual_fix_individual(book_summaries_overlap)

    with open(args.out_name_overlap, 'wb') as f:
        pickle.dump(book_summaries_overlap, f)
    print('wrote to {}'.format(args.out_name_overlap))
Exemple #41
0
def main():
    """Runs the experiment."""

    parser = argparse.ArgumentParser(
        description='Set up searching for sub-types to detect.')

    # positional command line arguments
    parser.add_argument('cohort', type=str, help='a TCGA cohort')
    parser.add_argument('classif',
                        type=str,
                        help='a classifier in HetMan.predict.classifiers')
    parser.add_argument('base_gene',
                        type=str,
                        help='a gene to cross with respect to')

    # optional command line arguments controlling the thresholds for which
    # individual mutations and how many genes' mutations are considered
    parser.add_argument('--freq_cutoff',
                        type=int,
                        default=10,
                        help='sub-type sample frequency threshold')
    parser.add_argument('--max_genes',
                        type=int,
                        default=20,
                        help='maximum number of mutated genes to consider')

    # optional command line arguments for what kinds of mutation sub-types to
    # look for in terms of properties and number of mutations to combine
    parser.add_argument(
        '--mut_levels',
        type=str,
        nargs='+',
        default=['Form_base', 'Exon', 'Protein'],
        help='the mutation property levels to consider in addition to `Genes`')
    parser.add_argument(
        '--comb_size',
        type=int,
        default=3,
        help='maximum number of individual mutations to combine '
        'when searching for mutation sub-types')

    # optional command line argument controlling verbosity
    parser.add_argument('--verbose',
                        '-v',
                        action='store_true',
                        help='turns on diagnostic messages')

    # parse the command line arguments, get the directory where found sub-types
    # will be saved for future use
    args = parser.parse_args()
    out_path = os.path.join(base_dir, 'output', args.cohort, args.classif,
                            'add', args.base_gene)

    if args.verbose:
        print("Looking for mutation sub-types in cohort {} composed of at "
              "most {} individual mutations with at least {} "
              "samples in total.\n".format(args.cohort, args.comb_size,
                                           args.freq_cutoff))

    # log into Synapse using locally-stored credentials
    syn = synapseclient.Synapse()
    syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/"
                                "mgrzad/input-data/synapse")
    syn.login()

    # load the expression matrix for the given cohort from Broad Firehose,
    # load the MC3 variant call set from Synapse, find the mutations for the
    # samples that are in both datasets
    expr_data = get_expr_firehose(args.cohort, firehose_dir)
    mc3_data = get_variants_mc3(syn)

    expr_mc3 = mc3_data.loc[mc3_data['Sample'].isin(expr_data.index), :]
    gene_mc3 = expr_mc3.loc[expr_mc3['Gene'] == args.base_gene, :]
    expr_mc3 = expr_mc3.loc[~expr_mc3['Sample'].isin(gene_mc3['Sample']), :]

    # get the genes whose mutations appear in enough samples to pass the
    # frequency threshold
    gene_counts = expr_mc3.groupby(by='Gene').Sample.nunique()
    count_cutoff = int(args.freq_cutoff / args.comb_size)
    common_genes = set(gene_counts.index[gene_counts >= count_cutoff])

    if args.verbose:
        print("Found {} candidate genes with at least {} potential "
              "mutated samples.".format(len(common_genes), count_cutoff))

    if len(common_genes) >= args.max_genes:
        gene_counts = gene_counts[common_genes].sort_values(ascending=False)
        common_genes = set(gene_counts[:args.max_genes].index)

        if args.verbose:
            print("Too many genes found, culling list to {} genes which each "
                  "have at least {} mutated samples.".format(
                      args.max_genes, min(gene_counts[common_genes])))

    cdata = VariantCohort(cohort=args.cohort,
                          mut_genes=common_genes,
                          mut_levels=['Gene'] + args.mut_levels,
                          expr_source='Firehose',
                          data_dir=firehose_dir,
                          cv_prop=1.0,
                          syn=syn)

    # initializes the list of found sub-types and the list of samples each
    # sub-type appears in
    use_mtypes = set()
    use_sampsets = set()

    search_level = 1
    break_status = False

    # until we have not reached the limit of sub-type enumeration or run out
    # property level combinations to test...
    while (len(use_mtypes) < 6000 and not break_status
           and search_level <= 2**len(args.mut_levels)):

        # try a list of property level combinations and number of individual
        # variants to combine, where the complexity of the level combination
        # plus the variant count is held constant
        for lvl_combn, comb_size in zip(
                rev_powerset_slice(args.mut_levels, search_level),
                range(1, min(search_level + 1, args.comb_size + 1))):
            use_lvls = ['Gene'] + list(lvl_combn)

            if args.verbose:
                print("\nLooking for sub-types that are combinations "
                      "of {} mutation(s) at levels {}...\n".format(
                          comb_size, use_lvls))

            # enumerates the sub-types consisting of a combination of the given
            # number of individual mutations at the given property levels
            sub_mtypes = cdata.train_mut.combtypes(
                comb_sizes=(comb_size, ),
                sub_levels=use_lvls,
                min_type_size=args.freq_cutoff)

            # finds the samples belonging to each enumerated sub-type that
            # hasn't already been found
            mtype_sampsets = {
                mtype: frozenset(mtype.get_samples(cdata.train_mut))
                for mtype in sub_mtypes - use_mtypes
            }

            # removes the sub-types with so many mutated samples that there
            # are not enough negatively-labelled samples for classification
            mtype_sampsets = {
                mtype: sampset
                for mtype, sampset in mtype_sampsets.items()
                if len(sampset) <= (len(cdata.samples) - args.freq_cutoff)
            }

            sub_mtypes = sorted(list(mtype_sampsets))
            if args.verbose:
                print("Found {} new sub-types!\n".format(len(sub_mtypes)))

            # if the list of remaining sub-types isn't too long...
            if len(sub_mtypes) < 5000:
                add_mtypes = set()

                for i, mtype in enumerate(sub_mtypes):
                    if args.verbose and (i % 200) == 100:
                        print("\nchecked {} sub-types\n".format(i))

                    # ...we remove each one whose set of mutated samples is
                    # identical to that of a sub-type that was already found
                    if mtype_sampsets[mtype] in use_sampsets:
                        if args.verbose:
                            print("Removing functionally duplicate MuType {}"\
                                    .format(mtype))

                    else:
                        add_mtypes.update({mtype})
                        use_sampsets.update({mtype_sampsets[mtype]})

                use_mtypes |= add_mtypes

            elif len(sub_mtypes) > 60000:
                break_status = True

        search_level += 1

    if args.verbose:
        print("\nFound {} total sub-types!".format(len(use_mtypes)))

    # save the list of found non-duplicate sub-types to file
    pickle.dump(sorted(list(use_mtypes)),
                open(os.path.join(out_path, 'tmp/mtype_list.p'), 'wb'))
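# Stripped-down illustration of the de-duplication step above: a candidate
# sub-type is kept only if its frozenset of mutated samples has not already
# been claimed by a previously accepted sub-type. The toy input is purely
# illustrative and unrelated to the HetMan/TCGA objects used above.
def dedup_by_sample_set(candidates):
    """candidates: dict mapping sub-type label -> iterable of sample IDs."""
    kept, seen_sampsets = [], set()
    for subtype, samples in candidates.items():
        sampset = frozenset(samples)
        if sampset in seen_sampsets:
            continue  # functionally duplicate sub-type
        kept.append(subtype)
        seen_sampsets.add(sampset)
    return kept

# dedup_by_sample_set({'A': ['s1', 's2'], 'B': ['s2', 's1'], 'C': ['s3']})
# returns ['A', 'C']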
Exemple #42
0
    return train_count, test_count, correct_count, output


if __name__ == '__main__':
    opts = utils.getopt_for_naocanzhujiao(sys.argv[1:])
    if '-g' in opts:
        print('Group L01: Azeri')
        print('Guo Yanzhe, 2571732')
        print('Zhai Fangzhou, 2566641')
        print('Zhu Dawei, 2549931')
        exit(0)
    if '-tr' in opts:
        ''' update train file path '''
        train_file, test_file = dill.load(open(config.config_file, 'rb'))
        dill.dump((opts['-tr'], test_file), open(config.config_file, 'wb'))
    if '-te' in opts:
        ''' update test file path '''
        train_file, test_file = dill.load(open(config.config_file, 'rb'))
        dill.dump((train_file, opts['-te']), open(config.config_file, 'wb'))
    if '-a' in opts:
        ''' perform task 1 and evaluate accuracy '''
        # load data
        train_file, test_file = dill.load(open(config.config_file, 'rb'))
        train_data = utils.load_data(train_file)
        test_data = utils.load_data(test_file)
        ''' perform inflection '''
        tr_c, te_c, co_c, _ = batch_inflect(train_data, test_data)
        ''' output accuracy '''
        print('trained on: ' + train_file)
        print('- training instances : ' + str(tr_c))
Exemple #43
0
def main():
    parser = argparse.ArgumentParser(
        'merge_test',
        description="Concatenates all of the output of an experiment.")

    # collect command line arguments
    parser.add_argument('use_dir', type=str)
    args = parser.parse_args()

    # load list of subgrouping tasks for this experiment
    with open(os.path.join(args.use_dir, 'setup', "muts-list.p"), 'rb') as f:
        muts_list = pickle.load(f)

    # concatenate cohort mutated statuses for each subgrouping
    pheno_dict = dict()
    for pheno_file in Path(args.use_dir, 'merge').glob("out-pheno_*.p.gz"):
        with bz2.BZ2File(pheno_file, 'r') as fl:
            pheno_dict.update(pickle.load(fl))

    assert sorted(muts_list) == sorted(pheno_dict.keys()), (
        "Tested mutations missing from list of mutations' sample statuses!")
    assert len({
        len(phns)
        for phns in pheno_dict.values()
    }) == 1, ("Inconsistent number of samples across mutation phenotype data!")

    with bz2.BZ2File(os.path.join(args.use_dir, "out-pheno.p.gz"), 'w') as fl:
        pickle.dump(pheno_dict, fl, protocol=-1)

    # concatenate coefficient values for each subgrouping classification model
    coef_df = pd.DataFrame()
    for coef_file in Path(args.use_dir, 'merge').glob("out-coef_*.p.gz"):
        with bz2.BZ2File(coef_file, 'r') as fl:
            coef_data = pickle.load(fl)
        coef_df = pd.concat([coef_df, coef_data.sort_index(axis=1)])

    assert sorted(muts_list) == sorted(coef_df.index), (
        "Tested mutations missing from merged classifier coefficients!")
    with bz2.BZ2File(os.path.join(args.use_dir, "out-coef.p.gz"), 'w') as fl:
        pickle.dump(coef_df, fl, protocol=-1)

    # concatenate predicted labels made by each subgrouping model
    pred_df = pd.DataFrame()
    for pred_file in Path(args.use_dir, 'merge').glob("out-pred_*.p.gz"):
        with bz2.BZ2File(pred_file, 'r') as fl:
            pred_data = pickle.load(fl)
        pred_df = pd.concat([pred_df, pred_data])

    assert sorted(muts_list) == sorted(pred_df.index), (
        "Tested mutations missing from merged classifier predictions!")
    with bz2.BZ2File(os.path.join(args.use_dir, "out-pred.p.gz"), 'w') as fl:
        pickle.dump(pred_df, fl, protocol=-1)

    # concatenate subgrouping model tuning performances
    tune_dfs = [pd.DataFrame() for _ in range(3)] + [None]
    for tune_file in Path(args.use_dir, 'merge').glob("out-tune_*.p.gz"):
        with bz2.BZ2File(tune_file, 'r') as fl:
            tune_data = pickle.load(fl)

        if tune_dfs[3] is None:
            tune_dfs[3] = tune_data[3]
        else:
            assert tune_dfs[3] == tune_data[3], (
                "Inconsistent mutation classifiers between gather tasks!")

        for i in range(3):
            tune_dfs[i] = pd.concat([tune_dfs[i], tune_data[i]])

    for i in range(3):
        assert sorted(muts_list) == sorted(tune_dfs[i].index), (
            "Tested mutations missing from merged tuning statistics!")
    with bz2.BZ2File(os.path.join(args.use_dir, "out-tune.p.gz"), 'w') as fl:
        pickle.dump(tune_dfs, fl, protocol=-1)

    # concatenate subgrouping model testing performances
    auc_df = pd.DataFrame()
    for auc_file in Path(args.use_dir, 'merge').glob("out-aucs_*.p.gz"):
        with bz2.BZ2File(auc_file, 'r') as fl:
            auc_data = pickle.load(fl)
        auc_df = pd.concat([auc_df, pd.DataFrame(auc_data)])

    assert sorted(muts_list) == sorted(auc_df.index), (
        "Tested mutations missing from merged classifier accuracies!")
    with bz2.BZ2File(os.path.join(args.use_dir, "out-aucs.p.gz"), 'w') as fl:
        pickle.dump(auc_df, fl, protocol=-1)

    # concatenate subgrouping model sub-sampled testing performances
    conf_list = pd.Series(dtype='object')
    for conf_file in Path(args.use_dir, 'merge').glob("out-conf_*.p.gz"):
        with bz2.BZ2File(conf_file, 'r') as fl:
            conf_data = pickle.load(fl)
        conf_list = conf_list.append(conf_data)

    assert sorted(muts_list) == sorted(conf_list.index), (
        "Tested mutations missing from merged subsampled accuracies!")
    with bz2.BZ2File(os.path.join(args.use_dir, "out-conf.p.gz"), 'w') as fl:
        pickle.dump(conf_list, fl, protocol=-1)

    # concatenate model performances when transferred to other cohorts
    trnsf_preds = pd.DataFrame()
    for trnsf_file in Path(args.use_dir, 'merge').glob("trnsf-vals_*.p.gz"):
        with bz2.BZ2File(trnsf_file, 'r') as fl:
            trnsf_vals = pickle.load(fl)
        trnsf_preds = pd.concat([trnsf_preds, trnsf_vals])

    assert sorted(muts_list) == sorted(trnsf_preds.index), (
        "Tested mutations missing from merged transfer predictions!")
    with bz2.BZ2File(os.path.join(args.use_dir, "trnsf-preds.p.gz"),
                     'w') as fl:
        pickle.dump(trnsf_preds, fl, protocol=-1)

    trnsf_dict = dict()
    for trnsf_file in Path(args.use_dir, 'merge').glob("out-trnsf_*.p.gz"):
        with bz2.BZ2File(trnsf_file, 'r') as fl:
            trnsf_data = pickle.load(fl)

        for coh, trnsf_out in trnsf_data.items():
            if coh not in trnsf_dict:
                trnsf_dict[coh] = {
                    'Samps': None,
                    'Pheno': dict(),
                    'AUC': pd.DataFrame()
                }

            if trnsf_dict[coh]['Samps'] is None:
                trnsf_dict[coh]['Samps'] = trnsf_out['Samps']
            else:
                assert trnsf_dict[coh]['Samps'] == trnsf_out['Samps'], (
                    "Mismatching sample sets in tranfer cohort `{}`!".format(
                        coh))

            if coh != 'CCLE':
                trnsf_dict[coh]['Pheno'].update(trnsf_out['Pheno'])
                trnsf_dict[coh]['AUC'] = pd.concat(
                    [trnsf_dict[coh]['AUC'],
                     pd.DataFrame(trnsf_out['AUC'])])

    with bz2.BZ2File(os.path.join(args.use_dir, "out-trnsf.p.gz"), 'w') as fl:
        pickle.dump(trnsf_dict, fl, protocol=-1)
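# The merge step above writes each consolidated object as a bz2-compressed
# pickle; reading one back follows the same pattern in reverse. The file
# name reuses the one written above, but the shape and index of the loaded
# data depend entirely on the upstream experiment.
import os
import bz2
import pickle

def load_merged_aucs(use_dir):
    with bz2.BZ2File(os.path.join(use_dir, "out-aucs.p.gz"), 'r') as fl:
        return pickle.load(fl)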
    def _hyperparameter_optimization(self,
                                     num_iterations=30,
                                     save_results=True,
                                     display_plot=False,
                                     batch_size=20,
                                     n_random_starts=10,
                                     use_TPU=False,
                                     transfer_model='Inception',
                                     cutoff_regularization=False,
                                     min_accuracy=None):
        """
        min_accuracy: minimum value of categorical accuracy we want after 1 iteration
        num_iterations: number of hyperparameter combinations we try
        n_random_starts: number of random combinations of hyperparameters first tried
        """
        self.min_accuracy = min_accuracy
        self.batch_size = batch_size
        self.use_TPU = use_TPU
        self.transfer_model = transfer_model
        self.cutoff_regularization = cutoff_regularization

        #import scikit-optimize libraries
        from skopt import gp_minimize
        from skopt.space import Real, Categorical, Integer
        from skopt.plots import plot_convergence
        from skopt.utils import use_named_args

        #declare the hyperparameters search space
        dim_epochs = Integer(low=1, high=10, name='epochs')
        dim_hidden_size = Integer(low=6, high=2048, name='hidden_size')
        dim_learning_rate = Real(low=1e-6,
                                 high=1e-2,
                                 prior='log-uniform',
                                 name='learning_rate')
        dim_dropout = Real(low=0, high=0.9, name='dropout')
        dim_fine_tuning = Categorical(categories=[True, False],
                                      name='fine_tuning')
        dim_nb_layers = Integer(low=1, high=3, name='nb_layers')
        dim_activation = Categorical(categories=['relu', 'tanh'],
                                     name='activation')
        dim_include_class_weight = Categorical(categories=[True, False],
                                               name='include_class_weight')

        dimensions = [
            dim_epochs, dim_hidden_size, dim_learning_rate, dim_dropout,
            dim_fine_tuning, dim_nb_layers, dim_activation,
            dim_include_class_weight
        ]

        #read default parameters from last optimization
        try:
            with open(
                    parentdir +
                    '/data/trained_models/hyperparameters_search.pickle',
                    'rb') as f:
                # the previous optimization saved `self.search_result.x`
                # (a plain list), so it is loaded back directly here
                default_parameters = dill.load(f)
            print('parameters of previous optimization loaded!')

        except:
            #fall back default values
            default_parameters = [5, 1024, 1e-4, 0, True, 1, 'relu', True]

        self.number_iterations = 0

        #declare the fitness function
        @use_named_args(dimensions=dimensions)
        def fitness(epochs, hidden_size, learning_rate, dropout, fine_tuning,
                    nb_layers, activation, include_class_weight):

            self.number_iterations += 1

            #print the hyper-parameters
            print('epochs:', epochs)
            print('hidden_size:', hidden_size)
            print('learning rate:', learning_rate)
            print('dropout:', dropout)
            print('fine_tuning:', fine_tuning)
            print('nb_layers:', nb_layers)
            print('activation:', activation)
            print('include_class_weight', include_class_weight)
            print()

            #fit the model
            self.fit(epochs=epochs,
                     hidden_size=hidden_size,
                     learning_rate=learning_rate,
                     dropout=dropout,
                     fine_tuning=fine_tuning,
                     nb_layers=nb_layers,
                     activation=activation,
                     include_class_weight=include_class_weight,
                     batch_size=self.batch_size,
                     use_TPU=self.use_TPU,
                     transfer_model=self.transfer_model,
                     min_accuracy=self.min_accuracy,
                     cutoff_regularization=self.cutoff_regularization)

            #extract fitness
            fitness = self.fitness

            print('CALCULATED FITNESS AT ITERATION', self.number_iterations,
                  'OF:', fitness)
            print()

            del self.model
            K.clear_session()

            return -1 * fitness

        # optimization
        self.search_result = gp_minimize(
            func=fitness,
            dimensions=dimensions,
            acq_func='EI',  # Expected Improvement.
            n_calls=num_iterations,
            n_random_starts=n_random_starts,
            x0=default_parameters)

        if save_results:
            if not os.path.exists(parentdir + '/data/trained_models'):
                os.makedirs(parentdir + '/data/trained_models')

            with open(
                    parentdir +
                    '/data/trained_models/hyperparameters_dimensions.pickle',
                    'wb') as f:
                dill.dump(dimensions, f)

            with open(
                    parentdir +
                    '/data/trained_models/hyperparameters_search.pickle',
                    'wb') as f:
                dill.dump(self.search_result.x, f)

            print("Hyperparameter search saved!")

        if display_plot:
            plot_convergence(self.search_result)

        #build results dictionary
        results_dict = {
            dimensions[i].name: self.search_result.x[i]
            for i in range(len(dimensions))
        }
        print('Optimal hyperparameters found:')
        print(results_dict)
        print()
        print('Optimal fitness value:', -float(self.search_result.fun))
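# A small sketch of inspecting a finished search offline by reloading the two
# pickles written above (the dimensions list and the best parameter vector).
# `parentdir` here is whatever directory the class used when saving; the file
# names match the ones used in the method above.
import dill

def load_best_hyperparameters(parentdir):
    base = parentdir + '/data/trained_models/'
    with open(base + 'hyperparameters_dimensions.pickle', 'rb') as f:
        dimensions = dill.load(f)
    with open(base + 'hyperparameters_search.pickle', 'rb') as f:
        best_x = dill.load(f)  # the best `x` vector saved by the method above
    return {dim.name: value for dim, value in zip(dimensions, best_x)}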
Exemple #45
0
    def saveProgress(self):
        saveFile = open("./" + self.player.name.lower() + ".txt", 'wb')
        print "Saving . . . "
        world = {
            "name": self.name,
            "description": self.description,
            "areas": [],
            "player": {
                "name": self.player.name,
                "description": self.player.description,
                "currentArea": self.player.currentArea.name,
                "health": self.player.health,
                "score": self.player.score,
                "inventory": []
            }
        }

        #save items in the player's inventory
        for item in self.player.inventory:
            currItem = {
                "name": item.name,
                "description": item.description,
                "area": None,
                "moveable": item.moveable,
                "onSuccess": item.onSuccess,
                "onFailure": item.onFailure,
                "detailedDescription": item.detailedDescription,
                "onSuccessScripts": [],
                "onFailureScripts": [],
                "onUse": item.onUse,
                "onUseScripts": []
            }
            #saving item scripts for items in the player's inventory
            for script in item.onSuccessScripts:
                serScript = pickle.dumps(script)
                currScript = {"script": serScript}
                currItem["onSuccessScripts"].append(currScript)
            for script in item.onFailureScripts:
                serScript = pickle.dumps(script)
                currScript = {"script": serScript}
                currItem["onFailureScripts"].append(currScript)
            for script in item.onUseScripts:
                serScript = pickle.dumps(script)
                currScript = {"script": serScript}
                currItem["onUseScripts"].append(currScript)

            world["player"]["inventory"].append(currItem)

        #save all the areas, their transitions, and their items.
        for area in self.areas:
            currArea = {
                "name": area.name,
                "description": area.description,
                "transitions": [],
                "items": []
            }
            for transition in area.transitions:
                currTransition = {
                    "name": transition.name,
                    "direction": transition.direction,
                    "isPassable": transition.isPassable,
                    "onSuccess": transition.onSuccess,
                    "onFailure": transition.onFailure,
                    "destination": transition.destination.name,
                    "area": transition.area.name,
                    "description": transition.description,
                    "detailedDescription": transition.detailedDescription,
                    "openedDescription": transition.openedDescription,
                    "onSuccessScripts": [],
                    "onFailureScripts": [],
                    "onOpenScripts": [],
                    "requirements": []
                }

                #serializing scripts with pickle.
                for script in transition.onSuccessScripts:
                    serScript = pickle.dumps(script)
                    currScript = {"script": serScript}
                    currTransition["onSuccessScripts"].append(currScript)
                for script in transition.onFailureScripts:
                    serScript = pickle.dumps(script)
                    currScript = {"script": serScript}
                    currTransition["onFailureScripts"].append(currScript)
                for script in transition.onOpenScripts:
                    serScript = pickle.dumps(script)
                    currScript = {"script": serScript}
                    currTransition["onOpenScripts"].append(currScript)

                #saving transition requirements
                for requirement in transition.requirements:
                    currRequirement = {"requirement": requirement}
                    currTransition["requirements"].append(currRequirement)

                currArea["transitions"].append(currTransition)
            for item in area.items:
                currItem = {
                    "name": item.name,
                    "description": item.description,
                    "detailedDescription": item.detailedDescription,
                    "area": area.name,
                    "moveable": item.moveable,
                    "onSuccess": item.onSuccess,
                    "onFailure": item.onFailure,
                    "onSuccessScripts": [],
                    "onFailureScripts": [],
                    "onUse": item.onUse,
                    "onUseScripts": []
                }
                for script in item.onSuccessScripts:
                    serScript = pickle.dumps(script)
                    currScript = {"script": serScript}
                    currItem["onSuccessScripts"].append(currScript)
                for script in item.onFailureScripts:
                    serScript = pickle.dumps(script)
                    currScript = {"script": serScript}
                    currItem["onFailureScripts"].append(currScript)
                for script in item.onUseScripts:
                    serScript = pickle.dumps(script)
                    currScript = {"script": serScript}
                    currItem["onUseScripts"].append(currScript)

                currArea["items"].append(currItem)
            world["areas"].append(currArea)

        pickle.dump(world, saveFile)
        #world = json.dumps(world,indent=4, separators=(',', ': '))
        saveFile.close()
        print "Progress saved in " + self.player.name.lower() + ".txt"
Exemple #46
0
  def lowercase(text):
    return text.lower()

  def expand_contractions(text):
    text = text.split()
    return ' '.join(list(map(lambda word: contractions[word] if word in contractions_keys else word, text)))

  def remove_symbols_punctuation(text):
    text = re.sub(delete_re_symbols.pattern, '', text)
    text = re.sub(replace_re_by_space.pattern, ' ', text)
    return text

  def remove_stop_words(text):
    text = text.split()
    filtered_sentence = [w for w in text if not w in stop_words]
    return filtered_sentence

  def text_lemmatization(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    text = list(map(lambda word: wordnet_lemmatizer.lemmatize(word), text))
    return text

  text = expand_contractions(text)
  text = lowercase(text)
  text = remove_symbols_punctuation(text)
  text = remove_stop_words(text)
  text = text_lemmatization(text)
  return ' '.join(text)

dill.dump(process_text, open('serialized/process_text.sav', 'wb'))
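# Quick round-trip check for the pipeline serialized above. dill stores the
# nested helper functions inside process_text, but names it merely references
# (the contractions dict, stop_words, the compiled regexes, WordNetLemmatizer)
# must still be importable when loading, so this sketch assumes it runs in an
# environment equivalent to the one that created the pickle.
import dill

with open('serialized/process_text.sav', 'rb') as f:
    process_text_loaded = dill.load(f)

print(process_text_loaded("I can't stand these noisy, unprocessed reviews!"))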
Exemple #47
0
            gt_entry,
            pred_entry,
        )

        ########################################################
        fp = predict(gt_entry['gt_boxes'], gt_entry['gt_classes'])
        fp_pred = fp[all_rels[:, 0], all_rels[:, 1]]

        pred_cls_scores = fp_pred.max(1)
        pred_cls_inds = np.argsort(-pred_cls_scores)
        pred_cls_inds = pred_cls_inds[pred_cls_scores[pred_cls_inds] > 0][:100]

        pred_entry['pred_rel_inds'] = all_rels[pred_cls_inds]
        pred_entry['rel_scores'] = fp_pred[pred_cls_inds]
        pred_entry['pred_classes'] = gt_entry['gt_classes']
        pred_entry['obj_scores'] = np.ones(pred_entry['pred_classes'].shape[0])

        all_pred_entries['predcls'].append(pred_entry)

        evaluator['predcls'].evaluate_scene_graph_entry(
            gt_entry,
            pred_entry,
        )
    img_offset += img_ids.max() + 1
evaluator['predcls'].print_stats()
evaluator['sgcls'].print_stats()

for mode, entries in all_pred_entries.items():
    with open('caches/freqbaseline-{}-{}.pkl'.format('overlap' if MUST_OVERLAP else 'nonoverlap', mode), 'wb') as f:
        pkl.dump(entries, f)
Exemple #48
0
    # Load the diabetes dataset
    diabetes = datasets.load_diabetes()

    # ONLY USING 1 FEATURE FOR THIS EXAMPLE!
    # Use only one feature
    diabetes_X = diabetes.data[:, np.newaxis, 2]

    # Split the data into training/testing sets
    diabetes_X_train = diabetes_X[:-20]
    diabetes_X_test = diabetes_X[-20:]

    # Split the targets into training/testing sets
    diabetes_y_train = diabetes.target[:-20]
    diabetes_y_test = diabetes.target[-20:]

    # Create linear regression model
    model = linear_model.LinearRegression()

    # Train the model using the training sets
    model.fit(diabetes_X_train, diabetes_y_train)

    import dill as pickle

    pio_bundle = PioBundle(model)

    pio_bundle_pkl_path = 'pio_bundle.pkl'

    with open(pio_bundle_pkl_path, 'wb') as fh:
        pickle.dump(pio_bundle, fh)
    def save(self, file_name):
        # dill pickles are binary, so the file must be opened in 'wb' mode
        with open(file_name, 'wb') as f:
            dill.dump(self, f)
Exemple #50
0
    def write_state(self, file_no):
        self.file_no = file_no
        files = self.get_file_names(self.file_no)

        with open((files['PATH_TO_DOC_WRITTEN']), 'wb') as f:
            dill.dump(self.doc_written, f)

        # PATH_TO_DOC_COUNT = ROOT /'doc_count_{}'.format(file_no)
        dill.dump(self.doc_count, open(files['PATH_TO_DOC_COUNT'], 'wb'))

        # PATH_TO_INLINKS = ROOT /'INLINKS_{}'.format(file_no)
        dill.dump(self.inlinks, open(files['PATH_TO_INLINKS'], 'wb'))

        # PATH_TO_OUTLINKS = ROOT /'OUTLINKS_{}'.format(file_no)
        dill.dump(self.oulinks, open(files['PATH_TO_OUTLINKS'], 'wb'))

        # PATH_TO_TRAVERSED =  ROOT /'traversed_{}'.format(file_no)
        dill.dump(self.traversed, open(files['PATH_TO_TRAVERSED'], 'wb'))

        # PATH_TO_VISITED = ROOT / 'visited_{}'.format(file_no)
        dill.dump(self.visited, open(files['PATH_TO_VISITED'], 'wb'))

        # PATH_TO_ROBOT_DIC = ROOT / 'robot_dic_{}'.format(file_no)
        dill.dump(self.robot_dict, open(files['PATH_TO_ROBOT_DIC'], 'wb'))

        # PATH_TO_FRONTIER = ROOT / 'frontier_{}'.format(file_no)
        dill.dump(self.frontier, open(files['PATH_TO_FRONTIER'], 'wb'))

        # PATH_TO_AUX_FRONTIER = ROOT / 'aux_frontier_{}'.format(file_no)
        dill.dump(self.aux_ftier, open(files['PATH_TO_AUX_FRONTIER'], 'wb'))

        # PATH_TO_FILE_NUM = ROOT / 'file_num'
        dill.dump(file_no, open(PATH_TO_FILE_NUM, 'wb'))
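# Hypothetical read-side counterpart to write_state(): each piece of crawler
# state is loaded back from the per-file paths produced by get_file_names().
# The keys mirror the ones used above; reattaching the loaded values to the
# crawler object is left to the caller.
import dill

def read_state(files):
    state = {}
    for key in ('PATH_TO_DOC_WRITTEN', 'PATH_TO_DOC_COUNT', 'PATH_TO_INLINKS',
                'PATH_TO_OUTLINKS', 'PATH_TO_TRAVERSED', 'PATH_TO_VISITED',
                'PATH_TO_ROBOT_DIC', 'PATH_TO_FRONTIER',
                'PATH_TO_AUX_FRONTIER'):
        with open(files[key], 'rb') as f:
            state[key] = dill.load(f)
    return state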
Exemple #51
0
    pbar = tqdm(total=len(tok_sentence))
    for list_tok in tok_sentence:
        tmp = []
        list_tok = sub_space(list_tok)
        list_tok = sub_lol(list_tok)
        list_tok = pad_sentence(list_tok)
        list_tok = list(map(lambda word: clean.stripping(word), list_tok))
        for tok in list_tok:
            if tok in itos:
                tmp.append(stoi[tok])
            else:
                if tok not in unknown_words:
                    unknown_words[tok] = 1
                else:
                    unknown_words[tok] += 1
                tmp.append(stoi[unk_token])
        new_int_sentence.append(tmp)
        pbar.update(1)
    pbar.close()
    return new_int_sentence


pos_int = np.array(sen2int(pos_tok))
neg_int = np.array(sen2int(neg_tok))

# print(pos_int.shape, neg_int.shape)
np.save(f'../dataset/{pos_name}_int.npy', pos_int)
np.save(f'../dataset/{neg_name}_int.npy', neg_int)
# print(unknown_words, len(unknown_words))
pickle.dump(unknown_words, open(f'../dataset/unknown.pkl', 'wb'))
             if line[3] not in datas.keys():
                 datas[line[3]] = []
             datas[line[3]].append(line[0])
         line = f.readline().split(",")
 for d in datas:
     print("TYPE:" + str(d))
     temp = ""
     for i, n in enumerate(datas[d]):
         temp = temp + n + ","
         if i % 10 == 0 and i != 0:
             print(temp + "\n")
             temp = ""
     print(temp + "\n")
 # Names are now grouped by type, so save the dict for later use
 with open("type_names_dict.dill", "wb") as f:
     dill.dump(datas, f)
 with open("type_names_dict.dill", "rb") as f:
     datas = dill.load(f)
 # Build a model for each type
 phist = PHist()
 phist.fit(datas)
 # Check how accurate the predictions are
 with open("result/result" + str(int(datetime.now().timestamp())) + ".txt",
           "w",
           encoding="utf-8") as res:
     with open("pokemon.csv", "r", encoding="utf-8") as f:
         line = f.readline().split(",")
         count = 0
         success_fir = 0
         success_sec = 0
         success_rev = 0
Exemple #53
0
    for run in range(config.n_runs):
        ## use 33% for training and 67% for validation
        ## so we switch trainInd and validInd
        for fold, (validInd, trainInd) in enumerate(skf[run]):
            print("Run: %d, Fold: %d" % (run + 1, fold + 1))
            path = "%s/Run%d/Fold%d" % (config.feat_folder, run + 1, fold + 1)

            #########################
            ## get word count feat ##
            #########################
            for feat_name in feat_names:
                X_train = dfTrain[feat_name].values[trainInd]
                X_valid = dfTrain[feat_name].values[validInd]
                with open("%s/train.%s.feat.pkl" % (path, feat_name),
                          "wb") as f:
                    dill.dump(X_train, f, -1)
                with open("%s/valid.%s.feat.pkl" % (path, feat_name),
                          "wb") as f:
                    dill.dump(X_valid, f, -1)
    print("Done.")

    # print("For training and testing...")
    # path = "%s/All" % config.feat_folder
    # ## use full version for X_train
    # extract_feat(dfTest)
    # for feat_name in feat_names:
    #     X_train = dfTrain[feat_name].values
    #     X_test = dfTest[feat_name].values
    #     with open("%s/train.%s.feat.pkl" % (path, feat_name), "wb") as f:
    #         dill.dump(X_train, f, -1)
    #     with open("%s/test.%s.feat.pkl" % (path, feat_name), "wb") as f:
Exemple #54
0
def main():
    import sys
    import os
    from datetime import datetime
    from joblib import Parallel, delayed
    import tempfile
    import dill
    from pynets.stats.utils import make_subject_dict, cleanNullTerms, \
        get_ensembles_top, get_ensembles_embedding, \
        build_grid
    from colorama import Fore, Style
    try:
        import pynets
    except ImportError:
        print(
            "PyNets not installed! Ensure that you are referencing the correct"
            " site-packages and using Python3.6+")

    if len(sys.argv) < 1:
        print("\nMissing command-line inputs! See help options with the -h"
              " flag.\n")
        sys.exit(1)

    # Parse inputs
    base_dir = '/scratch/04171/dpisner/HNU/HNU_outs/triple'
    #base_dir = '/scratch/04171/dpisner/HNU/HNU_outs/outputs_language'
    thr_type = "MST"
    icc = True
    disc = False
    int_consist = False
    modality = 'dwi'

    embedding_types = ['OMNI']
    #rsns = ['language']
    rsns = ['kmeans', 'triple']
    template = 'CN200'
    # template = 'MNI152_T1'
    mets = [
        "global_efficiency", "average_shortest_path_length",
        "degree_assortativity_coefficient", "average_betweenness_centrality",
        "average_eigenvector_centrality", "smallworldness", "modularity"
    ]

    metaparams_func = ["rsn", "res", "model", 'hpass', 'extract', 'smooth']
    metaparams_dwi = ["rsn", "res", "model", 'directget', 'minlength', 'tol']

    sessions = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
    ####

    print(f"{Fore.LIGHTBLUE_EX}\nBenchmarking API\n")

    print(Style.RESET_ALL)

    print(f"{Fore.LIGHTGREEN_EX}Gathering sampled data...")

    print(Style.RESET_ALL)

    for embedding_type in embedding_types:
        subject_dict_file_path = (f"{base_dir}/pynets_subject_dict_{modality}_"
                                  f"{embedding_type}_{template}.pkl")
        subject_mod_grids_file_path = (
            f"{base_dir}/pynets_modality_grids_{modality}_"
            f"{embedding_type}_{template}.pkl")
        missingness_summary = (
            f"{base_dir}/pynets_missingness_summary_{modality}_"
            f"{embedding_type}_{template}.csv")
        icc_tmps_dir = f"{base_dir}/icc_tmps/{modality}_" \
                       f"{embedding_type}"
        os.makedirs(icc_tmps_dir, exist_ok=True)
        if not os.path.isfile(subject_dict_file_path):
            subject_dict, modality_grids, missingness_frames = \
                make_subject_dict(
                    [modality], base_dir, thr_type, mets, [embedding_type],
                    template, sessions, rsns
                )
            sub_dict_clean = cleanNullTerms(subject_dict)
            missingness_frames = [
                i for i in missingness_frames if isinstance(i, pd.DataFrame)
            ]
            if len(missingness_frames) != 0:
                if len(missingness_frames) > 0:
                    if len(missingness_frames) > 1:
                        final_missingness_summary = pd.concat(
                            missingness_frames)
                        final_missingness_summary.to_csv(missingness_summary,
                                                         index=False)
                        final_missingness_summary.id = \
                            final_missingness_summary.id.astype(
                                'str').str.split('_', expand=True)[0]
                    elif len(missingness_frames) == 1:
                        final_missingness_summary = missingness_frames[0]
                        final_missingness_summary.to_csv(missingness_summary,
                                                         index=False)
                        final_missingness_summary.id = \
                            final_missingness_summary.id.astype(
                                'str').str.split('_', expand=True)[0]
                    else:
                        final_missingness_summary = pd.Series()
                else:
                    final_missingness_summary = pd.Series()
            else:
                final_missingness_summary = pd.Series()
            with open(subject_dict_file_path, "wb") as f:
                dill.dump(sub_dict_clean, f)
            f.close()
            with open(subject_mod_grids_file_path, "wb") as f:
                dill.dump(modality_grids, f)
            f.close()
        else:
            with open(subject_dict_file_path, 'rb') as f:
                sub_dict_clean = dill.load(f)
            f.close()
            with open(subject_mod_grids_file_path, "rb") as f:
                modality_grids = dill.load(f)
            f.close()
            if os.path.isfile(missingness_summary):
                final_missingness_summary = pd.read_csv(missingness_summary)
                final_missingness_summary.id = \
                    final_missingness_summary.id.astype('str').str.split(
                        '_', expand=True)[0]
            else:
                final_missingness_summary = pd.Series()
        ids = sub_dict_clean.keys()

        # print(f"MODALITY: {modality}")
        metaparams = eval(f"metaparams_{modality}")
        metaparam_dict = {}

        # print(f"EMBEDDING TYPE: {embedding_type}")
        # if os.path.isfile(f"{base_dir}/grid_clean_{modality}_{alg}.csv"):
        #     continue

        if embedding_type == 'topology':
            ensembles, df_top = get_ensembles_top(modality, thr_type,
                                                  f"{base_dir}/pynets")
        else:
            ensembles = get_ensembles_embedding(modality, embedding_type,
                                                base_dir)
        grid = build_grid(modality, metaparam_dict,
                          sorted(list(set(metaparams))), ensembles)[1]

        grid = [
            i for i in grid if '200' not in i and '400' not in i
            and '600' not in i and '800' not in i and 'triple' not in i
        ]

        good_grids = []
        for grid_param in grid:
            grid_finds = []
            for ID in ids:
                if ID not in sub_dict_clean.keys():
                    print(f"ID: {ID} not found...")
                    continue

                if str(sessions[0]) not in sub_dict_clean[ID].keys():
                    print(f"Session: {sessions[0]} not found for ID {ID}...")
                    continue

                if modality not in sub_dict_clean[ID][str(sessions[0])].keys():
                    print(f"Modality: {modality} not found for ID {ID}, "
                          f"ses-{sessions[0]}...")
                    continue

                if embedding_type not in \
                    sub_dict_clean[ID][str(sessions[0])][modality].keys():
                    print(f"Modality: {modality} not found for ID {ID}, "
                          f"ses-{sessions[0]}, {embedding_type}...")
                    continue

                if grid_param in \
                    list(sub_dict_clean[ID][str(sessions[0])][modality][
                             embedding_type].keys()):
                    grid_finds.append(grid_param)
            if len(grid_finds) < 0.75 * len(ids):
                print(f"Less than 75% of {grid_param} found. Removing from "
                      f"grid...")
                continue
            else:
                good_grids.append(grid_param)

        modality_grids[modality] = good_grids

        cache_dir = tempfile.mkdtemp()

        with Parallel(n_jobs=-1,
                      require="sharedmem",
                      backend='threading',
                      verbose=10,
                      max_nbytes='200000M',
                      temp_folder=cache_dir) as parallel:
            outs = parallel(
                delayed(benchmark_reproducibility)
                (base_dir, comb, modality, embedding_type, sub_dict_clean,
                 disc, final_missingness_summary, icc_tmps_dir, icc, mets, ids,
                 template) for comb in grid)
        # outs = []
        # for comb in grid:
        #     outs.append(benchmark_reproducibility(base_dir, comb, modality,
        #     embedding_type, sub_dict_clean,
        #             disc, final_missingness_summary, icc_tmps_dir, icc,
        #             mets, ids))

        df_summary = pd.concat(
            [i for i in outs if i is not None and not i.empty], axis=0)
        df_summary = df_summary.dropna(axis=0, how='all')
        print(f"Saving to {base_dir}/grid_clean_{modality}_{embedding_type}_"
              f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}.csv...")
        df_summary.to_csv(
            f"{base_dir}"
            f"/grid_clean_{modality}_{embedding_type}_"
            f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}"
            f".csv",
            index=False)

        # int_consist
        if int_consist is True and embedding_type == 'topology':
            try:
                import pingouin as pg
            except ImportError:
                print("Cannot evaluate test-retest int_consist. pingouin"
                      " must be installed!")
                # without pingouin the rest of this block cannot run
                continue

            df_summary_cronbach = pd.DataFrame(
                columns=['modality', 'embedding', 'cronbach'])
            df_summary_cronbach.at[0, "modality"] = modality
            df_summary_cronbach.at[0, "embedding"] = embedding_type

            for met in mets:
                cronbach_ses_list = []
                for ses in range(1, 10):
                    id_dict = {}
                    for ID in ids:
                        id_dict[ID] = {}
                        for comb in grid:
                            if modality == 'func':
                                try:
                                    extract, hpass, model, res, atlas, \
                                        smooth = comb
                                except BaseException:
                                    print(f"Missing {comb}...")
                                    extract, hpass, model, res, atlas = comb
                                    smooth = '0'
                                comb_tuple = (atlas, extract, hpass, model,
                                              res, smooth)
                            else:
                                directget, minlength, model, res, atlas, \
                                    tol = comb
                                comb_tuple = (atlas, directget, minlength,
                                              model, res, tol)
                            ses_embeddings = sub_dict_clean[ID][
                                str(ses)][modality][embedding_type]
                            if comb_tuple in ses_embeddings.keys() and \
                                    isinstance(ses_embeddings[comb_tuple],
                                               np.ndarray):
                                id_dict[ID][comb] = ses_embeddings[
                                    comb_tuple][mets.index(met)][0]
                    df_wide = pd.DataFrame(id_dict)
                    if df_wide.empty:
                        continue
                    df_wide = df_wide.add_prefix(f"{met}_comb_")
                    df_wide.replace(0, np.nan, inplace=True)
                    print(df_wide)
                    try:
                        c_alpha = pg.cronbach_alpha(data=df_wide.dropna(
                            axis=1, how='all'),
                                                    nan_policy='listwise')
                        cronbach_ses_list.append(c_alpha[0])
                    except BaseException:
                        print('FAILED...')
                        print(df_wide)
                    del df_wide
                df_summary_cronbach.at[0, f"average_cronbach_{met}"] = \
                    np.nanmean(cronbach_ses_list)
            print(f"Saving to {base_dir}/grid_clean_{modality}_"
                  f"{embedding_type}_cronbach_"
                  f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}.csv...")
            df_summary_cronbach.to_csv(
                f"{base_dir}/grid_clean_{modality}_"
                f"{embedding_type}_cronbach"
                f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}"
                f".csv",
                index=False)

    return
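
# A minimal, self-contained sketch of the pingouin.cronbach_alpha() call used
# above. In the loop above, df_wide ends up with one column per subject ID
# (prefixed with the metric name) and one row per grid combination, and
# pingouin treats the columns as items; the toy column names below are
# hypothetical.
import numpy as np
import pandas as pd
import pingouin as pg

rng = np.random.default_rng(0)
toy_wide = pd.DataFrame(rng.normal(size=(20, 4)),
                        columns=[f"met_comb_sub{i}" for i in range(4)])
alpha, ci = pg.cronbach_alpha(data=toy_wide, nan_policy='listwise')
print(f"Cronbach's alpha: {alpha:.3f}, 95% CI: {ci}")
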
Exemple #55
0
                        help='for multiple scales, eg. [1.0, (1.1, 0.05)]')
    args = parser.parse_args()
    scales = ast.literal_eval(args.scales)

    w, h = model_wh(args.resolution)
    e = TfPoseEstimator(get_graph_path(args.model), target_size=(w, h))

    files_grabbed = glob.glob(os.path.join(args.folder, '*.jpg'))
    all_humans = dict()
    for i, file in enumerate(files_grabbed):
        # estimate human poses from a single image !
        image = common.read_imgfile(file, None, None)
        t = time.time()
        #humans = e.inference(image, upsample_size=scales)
        humans = e.inference(image)

        elapsed = time.time() - t

        logger.info('inference image #%d: %s in %.4f seconds.' %
                    (i, file, elapsed))

        image = TfPoseEstimator.draw_humans(image, humans, imgcopy=False)
        cv2.imshow('tf-pose-estimation result', image)
        cv2.waitKey(5)

        all_humans[file.replace(args.folder, '')] = humans

    with open(os.path.join(args.folder, 'pose.dil'), 'wb') as f:
        dill.dump(all_humans, f, protocol=dill.HIGHEST_PROTOCOL)
        print(all_humans)
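
# A short sketch of reading the poses back later, assuming the same
# args.folder used above; dill detects the pickle protocol automatically,
# so no protocol argument is needed when loading.
with open(os.path.join(args.folder, 'pose.dil'), 'rb') as f:
    saved_humans = dill.load(f)
for image_name, humans in saved_humans.items():
    print(image_name, len(humans), 'humans detected')
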
Exemple #56
0
df = pd.DataFrame(df)
y = df['stars'].tolist()

af = pipeline.FeatureUnion([
    ('rest', EstTransformer()),
    ('rshe', SheTransformer()),
    ('rcat', CatTransformer()),
    ('ratt', AttTransformer()),
])

all_pipe = pipeline.Pipeline([
    ('features', af),
    # NB: the step is labelled 'lasso' but actually fits an ordinary
    # least-squares LinearRegression, not a Lasso
    ('lasso', linear_model.LinearRegression(fit_intercept=True))
])

print('fitting')
all_pipe.fit(df, y)
print('fitting complete')

with open('../pickle/all.dill', "wb") as f:
    dill.dump(all_pipe, f)

test = df.sample()

test_dict = [test.attributes]

pdb.set_trace()

#print (att_pipe.predict(test_dict))
#print (test['stars'])
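
# A sketch of re-using the serialized pipeline. By default dill pickles
# classes defined in __main__ by value and importable classes by reference,
# so the custom transformers (EstTransformer, SheTransformer, etc.) may still
# need to be importable where this runs; treat that as an assumption.
with open('../pickle/all.dill', 'rb') as f:
    loaded_pipe = dill.load(f)

sample = df.sample(5)               # same df as above
print(loaded_pipe.predict(sample))  # predicted star ratings
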
Exemple #57
0
from app.preprocessing.preprocess import Preprocess
import dill

preprocessed = Preprocess()
with open('preprocessed.p', 'wb') as f:
    dill.dump(preprocessed, f)
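
# Why dill rather than pickle is used here is not stated; one common reason is
# that dill can serialize objects holding lambdas or other locals that the
# stdlib pickle rejects. A minimal, self-contained illustration (the Tiny
# class below is hypothetical, not part of the app):
import dill

class Tiny:
    def __init__(self):
        self.tokenize = lambda s: s.lower().split()  # stdlib pickle rejects this

restored = dill.loads(dill.dumps(Tiny()))
print(restored.tokenize("Hello World"))  # ['hello', 'world']
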
Exemple #58
0
    batch_end = (i + 2) * batch_size
    batch_end = np.min([batch_end, X.shape[0]])

    X_batch = X[batch_st:batch_end, :]
    y_batch = y[batch_st:batch_end, :]

    for model in range(len(models)):
        lst_actions[model] = simulate_rounds_stoch(models[model],
                                                   lst_rewards[model],
                                                   lst_actions[model],
                                                   X_batch,
                                                   y_batch,
                                                   rnd_seed=batch_st)

    for model in range(len(models)):
        dill.dump(models[model], open("model_%d_loc7.dill" % (model), "wb"))

#plotting

import matplotlib.pyplot as plt
from pylab import rcParams


def get_mean_reward(reward_lst, batch_size=batch_size):
    mean_rew = list()
    for r in range(len(reward_lst)):
        mean_rew.append(sum(reward_lst[:r + 1]) * 1.0 / ((r + 1) * batch_size))
    return mean_rew


import scipy.stats as st
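
# One way the helper above is typically used: plot the running mean reward of
# each policy. 'models' and 'lst_rewards' come from the elided setup earlier
# in the script, so treat those names as assumptions.
rcParams['figure.figsize'] = (10, 6)
for model in range(len(models)):
    plt.plot(get_mean_reward(lst_rewards[model]), label='policy %d' % model)
plt.xlabel('batch number')
plt.ylabel('cumulative mean reward')
plt.legend()
plt.savefig('mean_reward_loc7.png', dpi=150)
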
Exemple #59
0
    def _read(self, file_path: str):
        if not file_path.endswith('.json'):
            raise ConfigurationError(
                f"Don't know how to read filetype of {file_path}")

        cache_dir = os.path.join('cache', file_path.split("/")[-1])

        if self._load_cache:
            logger.info(f'Trying to load cache from {cache_dir}')
        if self._save_cache:
            os.makedirs(cache_dir, exist_ok=True)

        cnt = 0
        with open(file_path, "r") as data_file:
            json_obj = json.load(data_file)
            for total_cnt, ex in enumerate(json_obj):
                cache_filename = f'instance-{total_cnt}.pt'
                cache_filepath = os.path.join(cache_dir, cache_filename)
                if self._loading_limit == cnt:
                    break

                if self._load_cache:
                    try:
                        with open(cache_filepath, 'rb') as cache_f:
                            ins = dill.load(cache_f)
                        if ins is None and not self._keep_if_unparsable:
                            # skip unparsed examples
                            continue
                        yield ins
                        cnt += 1
                        continue
                    except Exception as e:
                        # could not load from cache - keep loading without cache
                        pass

                query_tokens = None
                if 'query_toks' in ex:
                    # we only have 'query_toks' in example for training/dev sets

                    # fix for examples: we want to use the 'query_toks_no_value' field of the example, which anonymizes
                    # values. However, it also anonymizes numbers (e.g. LIMIT 3 -> LIMIT 'value'), which is not good
                    # since the official evaluator expects a number rather than a value
                    ex = fix_number_value(ex)

                    # we want the query tokens to be non-ambiguous (i.e. know for each column the table it belongs to,
                    # and for each table alias its explicit name)
                    # we thus remove all aliases and make changes such as:
                    # 'name' -> 'singer@name',
                    # 'singer AS T1' -> 'singer',
                    # 'T1.name' -> 'singer@name'
                    try:
                        query_tokens = disambiguate_items(
                            ex['db_id'],
                            ex['query_toks_no_value'],
                            self._tables_file,
                            allow_aliases=False)
                    except Exception as e:
                        # there are two examples in the train set that are wrongly formatted, skip them
                        print(f"error with {ex['query']}")
                        print(e)

                ins = self.text_to_instance(utterance=ex['question'],
                                            db_id=ex['db_id'],
                                            sql=query_tokens)
                if ins is not None:
                    cnt += 1
                if self._save_cache:
                    with open(cache_filepath, 'wb') as cache_f:
                        dill.dump(ins, cache_f)

                if ins is not None:
                    yield ins
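
# The caching idiom used by _read(), shown in isolation as a sketch: each
# expensive result gets its own dill file and is re-loaded on the next run.
# 'expensive_parse' and the example paths below are hypothetical stand-ins.
import os
import dill

def dill_cached(cache_path, compute):
    # return the cached object if present, otherwise compute and cache it
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as cache_f:
            return dill.load(cache_f)
    result = compute()
    with open(cache_path, 'wb') as cache_f:
        dill.dump(result, cache_f)
    return result

# e.g. ins = dill_cached('cache/instance-0.pt', lambda: expensive_parse(ex))
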
Exemple #60
0
    return l


crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                           c1=0.1,
                           c2=0.1,
                           max_iterations=500,
                           all_possible_transitions=True,
                           model_filename=file_name + "-pos-new.model2")
crf.fit(X, y)

labels = list(crf.classes_)
labels.remove('O')
y_pred = crf.predict(X_test)
e = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)
print(e)
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
print(
    metrics.flat_classification_report(y_test,
                                       y_pred,
                                       labels=sorted_labels,
                                       digits=3))
# cross_validate
f1_scorer = make_scorer(metrics.flat_f1_score, average='macro')

scores = cross_validate(crf, X, y, scoring=f1_scorer, cv=5)
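
# A brief sketch of summarising the cross-validation output: cross_validate
# returns a dict whose 'test_score' entry holds the per-fold macro-F1 scores.
print("CV macro-F1: %.3f +/- %.3f"
      % (scores['test_score'].mean(), scores['test_score'].std()))
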
# save data
import dill
with open("datatrain.data", "wb") as dill_file:
    dill.dump(datatofile, dill_file)