def extract_dialogues(filename, pkl_filename, restaurant_db):
    """
    Extract dialogues from given filename as list of lists
    :param filename:
    :return:
    """
    dialogues = []

    # Create DB
    if not os.path.exists(restaurant_db):
        conn = sqlite3.connect(restaurant_db)
        c = conn.cursor()
        print "Creating DB"
        c.execute("""CREATE TABLE Restaurants
                     (name text unique, post_code text, cuisine text,
                      location text, phone text, address text,
                      price text, rating text)""")
        conn.commit()
    else:
        conn = sqlite3.connect(restaurant_db)
        c = conn.cursor()

    with open(filename, "r") as f:
        exchanges = []
        # (Post_code, cuisine, location, phone, address, price, rating)
        api_results = []
        for line in f:
            # Signifies that end of dialogue has been reached so
            # output utterances
            if line == "\n":
                dialogues.append(exchanges)
                restaurants = process_api_results(api_results)

                # Update restaurants in DB
                if len(restaurants) != 0:
                    for r in restaurants:
                        c.execute("INSERT OR IGNORE INTO Restaurants VALUES "
                                  "(?,?,?,?,?,?,?,?)", r)
                    conn.commit()

                exchanges = []
                api_results = []
                continue

            contents = line.strip().split("\t")
            if len(contents) == 1:
                clean_contents = " ".join(contents[0].strip().split(" ")[1:])
                if clean_contents != "" and clean_contents != "api_call no result":
                    api_results.append(clean_contents)
            else:
                user, system = contents[0], contents[1]
                user = "******".join(user.split(" ")[1:])
                exchanges.append((user, system))

    print "Dialogues: ", len(dialogues)

    with open(pkl_filename, "wb") as f:
        pickle.dump(dialogues, f)
def encrypt(key, path="message.txt", saveCT="ciphertext.enc"):
    b = random.randrange(2, (key.p) - 1)
    u = modexp(key.g, b, key.p)
    v = modexp(key.h, b, key.p)
    uv = str(u) + str(v)
    # symmetric key used to compute the ciphertext with Blowfish
    k = SHA224.new(uv.encode('utf-8')).hexdigest().encode('utf-8')
    print("K: " + str(k))

    # Open plaintext file to cipher
    plaintext = open(path, "rb").read()
    # plaintext = encode(plaintext, key.iNumBits)

    bs = Blowfish.block_size
    iv = Random.new().read(bs)
    cipher = Blowfish.new(k, Blowfish.MODE_CBC, iv)
    plen = bs - divmod(len(plaintext), bs)[1]
    padding = [plen] * plen
    padding = struct.pack('b' * plen, *padding)
    ciphertext = iv + cipher.encrypt(plaintext + padding)

    # Save ciphertext to file
    print("CT-LEN:" + str(len(ciphertext)))
    with open(saveCT, 'wb') as output:
        dill.dump(u, output)
        dill.dump(ciphertext, output)

    return plaintext, ciphertext
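# Hedged sketch of the matching decryption routine, assuming the same helpers as
# encrypt() above (modexp, SHA224, Blowfish, dill) and assuming the private
# exponent is stored as key.x with h = g^x mod p; the attribute name is a guess,
# not the original API.
def decrypt_sketch(key, ct_path="ciphertext.enc"):
    with open(ct_path, 'rb') as inp:
        u = dill.load(inp)            # ephemeral value written by encrypt()
        ciphertext = dill.load(inp)
    v = modexp(u, key.x, key.p)       # recover v = u^x = h^b (key.x is assumed)
    uv = str(u) + str(v)
    k = SHA224.new(uv.encode('utf-8')).hexdigest().encode('utf-8')
    bs = Blowfish.block_size
    iv, body = ciphertext[:bs], ciphertext[bs:]
    cipher = Blowfish.new(k, Blowfish.MODE_CBC, iv)
    padded = cipher.decrypt(body)
    plen = padded[-1]                 # encrypt() stores the pad length in every pad byte
    return padded[:-plen]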
def grid_search(X, y):
    '''
    cross validated grid search using Ridge Regressor and Random Forest Regressor
    '''
    # NOTE: these two values rely on a module-level df_subset and are not used below
    nids = df_subset.index
    titles = df_subset['title']

    pars = {'alpha': [0.8, 0.6, 0.5, 0.45, 0.4, 0.2, 0.1, 0.08, 0.07,
                      0.06, 0.05, 0.04, 0.03, 0.02]}
    gs = GridSearchCV(Ridge(), pars, cv=5)
    gs.fit(X, y)
    ridge = gs.best_estimator_
    dill.dump(ridge, open('ridge.pkl', 'wb'))

    pars = {'max_depth': [5, 8, 10, 20, 50, 100],
            'min_samples_split': [2, 3, 5, 10, 20]}
    gs = GridSearchCV(RFR(n_estimators=100, random_state=42, n_jobs=2), pars, cv=5)
    gs.fit(X, y)  # fit was missing; best_estimator_ is only defined after fitting
    rfr = gs.best_estimator_
    dill.dump(rfr, open('rfr.pkl', 'wb'))
    return ridge, rfr
def save(self, experiment_dir):
    """
    Saves the current model and related training parameters into a
    subdirectory of the checkpoint directory. The name of the subdirectory
    is the current local time in Y_M_D_H_M_S format.

    Args:
        experiment_dir (str): path to the experiment root directory
    Returns:
        str: path to the saved checkpoint subdirectory
    """
    date_time = time.strftime('%Y_%m_%d_%H_%M_%S', time.localtime())

    self._path = os.path.join(experiment_dir, self.CHECKPOINT_DIR_NAME, date_time)
    path = self._path

    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)
    torch.save({'epoch': self.epoch,
                'step': self.step,
                'optimizer': self.optimizer},
               os.path.join(path, self.TRAINER_STATE_NAME))
    torch.save(self.model, os.path.join(path, self.MODEL_NAME))

    with open(os.path.join(path, self.INPUT_VOCAB_FILE), 'wb') as fout:
        dill.dump(self.input_vocab, fout)
    with open(os.path.join(path, self.OUTPUT_VOCAB_FILE), 'wb') as fout:
        dill.dump(self.output_vocab, fout)

    return path
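# Hedged sketch of how a checkpoint written by the save() above might be restored.
# It assumes the same class constants (TRAINER_STATE_NAME, MODEL_NAME,
# INPUT_VOCAB_FILE, OUTPUT_VOCAB_FILE) and is not the library's own load method.
def load_checkpoint_sketch(cls, path):
    trainer_state = torch.load(os.path.join(path, cls.TRAINER_STATE_NAME))
    model = torch.load(os.path.join(path, cls.MODEL_NAME))
    with open(os.path.join(path, cls.INPUT_VOCAB_FILE), 'rb') as fin:
        input_vocab = dill.load(fin)
    with open(os.path.join(path, cls.OUTPUT_VOCAB_FILE), 'rb') as fin:
        output_vocab = dill.load(fin)
    return trainer_state, model, input_vocab, output_vocab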
def dill_save(obj, name, folder='pc'):
    """This script saves any kind of object as a dill file in a folder.

    Args:
        obj:    object you want to save in a dill file
        name:   name of the dill file; '.dill' will be added automatically if missing
    """
    from pencilnew.io.mkdir import mkdir
    from os import remove
    from os.path import join, exists
    import dill

    mkdir(folder)  # prepare folder

    if not name.endswith('.dill'):
        name = name + '.dill'
    if folder == 'pc' and name.startswith('pc/'):
        name = name[3:]

    full_path = join(folder, name)
    if exists(full_path):
        remove(full_path)

    with open(join(folder, name), 'wb') as f:
        dill.dump(obj, f)

    return True
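# Minimal loader counterpart for dill_save(), assuming the same folder/name
# conventions; this is a sketch, not part of the pencilnew API.
def dill_load_sketch(name, folder='pc'):
    from os.path import join
    import dill
    if not name.endswith('.dill'):
        name = name + '.dill'
    with open(join(folder, name), 'rb') as f:
        return dill.load(f)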
def save_dump(filename, tb=None):
    """
    Saves a Python traceback in a pickled file. This function will usually be
    called from an except block to allow post-mortem debugging of a failed
    process.

    The saved file can be loaded with load_dump, which creates a fake traceback
    object that can be passed to any reasonable Python debugger.

    The simplest way to do that is to run:

        $ pydump.py my_dump_file.dump
    """
    if not tb:
        tb = sys.exc_info()[2]
    fake_tb = FakeTraceback(tb)
    _remove_builtins(fake_tb)
    dump = {
        "traceback": fake_tb,
        "files": _get_traceback_files(fake_tb),
        "dump_version": DUMP_VERSION,
    }
    with gzip.open(filename, "wb") as f:
        if dill is not None:
            dill.dump(dump, f)
        else:
            pickle.dump(dump, f, protocol=pickle.HIGHEST_PROTOCOL)
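# Typical call site for save_dump() as described in its docstring: write the
# dump from an except block so the failure can be inspected post-mortem.
# run_failing_job and the file name are hypothetical placeholders.
try:
    run_failing_job()                  # hypothetical workload
except Exception:
    save_dump("my_dump_file.dump")     # tb defaults to sys.exc_info()[2]
    raise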
def save_session(fname=None, session=None, pickleProto=4):
    import dill as pickle
    if fname is None:
        fname = conf.session
        if not fname:
            conf.session = fname = utils.get_temp_file(keep=True)
            log_interactive.info("Use [%s] as session file" % fname)

    if session is None:
        session = builtins.__dict__["kamene_session"]

    to_be_saved = session.copy()

    for k in list(to_be_saved.keys()):
        if k in ["__builtins__", "In", "Out", "conf"] or k.startswith("_") or \
                (hasattr(to_be_saved[k], "__module__") and
                 str(to_be_saved[k].__module__).startswith('IPython')):
            del(to_be_saved[k])
            continue
        if type(to_be_saved[k]) in [type, types.ModuleType, types.MethodType]:
            log_interactive.info("[%s] (%s) can't be saved." % (k, type(to_be_saved[k])))
            del(to_be_saved[k])

    try:
        os.rename(fname, fname + ".bak")
    except OSError:
        pass

    f = gzip.open(fname, "wb")
    # Dump the filtered session dict once; the previous per-key loop re-dumped
    # the entire dict on every iteration.
    pickle.dump(to_be_saved, f, pickleProto)
    f.close()
def qsubwrap(fun, *args,
             pickle_dir=None, messages_dir=None,
             queue=default_queue, extra_options='', verbose=False,
             **kwargs):
    """Submit a qsub job to call fun(*args, **kwargs)

    pickle_dir will be used to exchange input/output via pickles.
    Defaults to os.getcwd() / qsub_pickles
    messages_dir will contain the files with stdout and stderr from the jobs.
    Defaults to os.getcwd() / qsub_messages
    queue: name of the queue qsub will submit the job to
    extra_options will be passed directly to the qsub command.

    returns: job_name, filename of pickle which will contain result of
    function when the job is done
    """
    job_name = fun.__name__ + '%09d' % randint(0, int(1e9))
    if verbose:
        print("Starting submission of job %s" % job_name)

    if pickle_dir is None:
        pickle_dir = os.path.join(os.getcwd(), 'qsub_pickles')
    if not os.path.exists(pickle_dir):
        os.makedirs(pickle_dir)

    if messages_dir is None:
        messages_dir = os.path.join(os.getcwd(), 'qsub_messages')
    if not os.path.exists(messages_dir):
        os.makedirs(messages_dir)
    extra_options += ' -e localhost:{messages_dir} -o localhost:{messages_dir}'.format(
        messages_dir=messages_dir)

    input_pickle_name = os.path.join(
        pickle_dir, strftime('input_%Y%m%d_%H%M%S_') + job_name + '.pickle')
    output_pickle_name = os.path.join(
        pickle_dir, strftime('output_%Y%m%d_%H%M%S_') + job_name + '.pickle')

    if verbose:
        print("Writing input pickle for job %s" % job_name)
    with open(input_pickle_name, 'wb') as input_pickle:
        pickle.dump(dict(fun=fun, fun_args=args, fun_kwargs=kwargs), input_pickle)

    # Make the python script for this job
    py_script = tempfile.NamedTemporaryFile(suffix=".py", delete=False, mode='w')
    if verbose:
        print("Making py script pickle for job %s: %s" % (job_name, py_script.name))
    py_script.write(py_script_template.format(input_pickle_name=input_pickle_name,
                                              output_pickle_name=output_pickle_name,
                                              python_path=sys.executable))
    py_script.close()
    make_executable(py_script.name)

    # Submit the script to qsub
    # Now we'll learn our actual jobname from qsub, which we return
    cmd = submission_command_template.format(queue=queue,
                                             script_name=py_script.name,
                                             messages_dir=messages_dir,
                                             extra_options=extra_options,
                                             job_name=job_name)
    if verbose:
        print("qsub command for job %s: %s" % (job_name, cmd))
    job_name = subprocess.check_output(cmd, shell=True).decode('utf-8').rstrip()

    return job_name, output_pickle_name
def dump(object, **kwds):
    """dill.dump of object to a NamedTemporaryFile.
    Loads with "dill.temp.load".  Returns the filehandle.

        >>> dumpfile = dill.temp.dump([1, 2, 3, 4, 5])
        >>> dill.temp.load(dumpfile)
        [1, 2, 3, 4, 5]

    Optional kwds:
        If 'suffix' is specified, the file name will end with that suffix,
        otherwise there will be no suffix.

        If 'prefix' is specified, the file name will begin with that prefix,
        otherwise a default prefix is used.

        If 'dir' is specified, the file will be created in that directory,
        otherwise a default directory is used.

        If 'text' is specified and true, the file is opened in text
        mode.  Else (the default) the file is opened in binary mode.  On
        some operating systems, this makes no difference.

    NOTE: Keep the return value for as long as you want your file to exist!
    """
    import dill as pickle
    import tempfile
    file = tempfile.NamedTemporaryFile(**kwds)
    pickle.dump(object, file)
    file.flush()
    return file
def build_regression(transformer, limit=1000000, db_name=db.DB_NAME):
    conn = sqlite3.connect(db_name)
    c = conn.cursor()
    c.execute("select * from match_data limit " + str(limit))
    alldata = c.fetchall()

    train_data1 = map(itemgetter(slice(1, 6)), alldata)
    train_data2 = map(itemgetter(slice(6, 11)), alldata)
    train_wins = map(itemgetter(11), alldata)
    del alldata

    params = [(i, j) for i in range(0, 6) for j in range(0, 6) if i + j >= 4 and i <= j]
    clf = {k: LogisticRegression(C=20. / (k[0] + k[1]) ** 4) for k in params}
    for (i, j) in params:
        train1 = []
        train2 = []
        for row_num in range(len(train_data1)):
            train1.append(sorted(random.sample(train_data1[row_num], i)))
            train2.append(sorted(random.sample(train_data2[row_num], j)))
        clf[(i, j)].fit(map(matrixitemgetter(0), transformer.transform(train1, train2)),
                        train_wins)
        print str(i) + "," + str(j)

    # dill pickles should be written in binary mode
    with open("clf_all-" + str(int(time())) + ".dill", "wb") as f:
        dill.dump(clf, f)
    with open("transf-" + str(int(time())) + ".dill", "wb") as f:
        dill.dump(transformer, f)
    print 'transformer and classifier saved'
def save(self, filename):
    # save tf graph
    save_path = self.saver.save(self.session, filename)

    # save class parameters
    filename_class = filename + '.pickle'
    pickle.dump([self.model, self.images], open(filename_class, 'wb'))

    print("WormVision saved to files: %s, %s" % (save_path, filename_class))
def save(self, filename):
    """
    Save model to pickle file. External feature function is not stored.
    """
    import dill

    tmpmodelparams = self.modelparams.copy()
    # fv_extern_src = None
    fv_extern_name = None
    # try:
    #     fv_extern_src = dill.source.getsource(tmpmodelparams['fv_extern'])
    #     tmpmodelparams.pop('fv_extern')
    # except:
    #     pass

    # fv_extern_name = dill.source.getname(tmpmodelparams['fv_extern'])
    if "fv_extern" in tmpmodelparams:
        tmpmodelparams.pop("fv_extern")

    sv = {
        "modelparams": tmpmodelparams,
        "mdl": self.mdl,
        # 'fv_extern_src': fv_extern_src,
        # 'fv_extern_src_name': fv_extern_src_name,
        # 'fv_extern_name': fv_extern_src_name,
        #
    }
    sss = dill.dumps(self.modelparams)
    logger.debug("pickled " + str(sss))

    dill.dump(sv, open(filename, "wb"))
def main():
    data = pickle.load(open("reuters_raw.pickle", "rb"))
    corpus = list()
    titles = list()
    topics = list()
    days = list()
    doc_grp_id = list()
    start_date = data[0][0]["date"][0].split("-")[:-1]
    cumulative_months, Months_lists = get_months()
    j = 1
    for article in data:
        if "text" in article[0] and "topics" in article[0]:
            topic = article[0]["topics"]
            if len(topic) != 1:
                continue
            for piece in article[0]["text"]:
                if isinstance(piece, dict) and "body" in piece:
                    # print "hell"
                    titles.append(piece["title"][0])
                    corpus.append("".join(piece["body"]))
                    topics.append(topic[0]["d"][0])
                    date = article[0]["date"][0].split("-")[:-1]
                    days.append(date_convert(date, start_date,
                                             cumulative_months, Months_lists))
                    doc_grp_id.append(j)
            j += 1
    print len(corpus)
    pickle.dump((corpus, topics, titles, days, doc_grp_id),
                open("reuters.pickle", "wb"))
def createFileList(self):
    """SRTM data is split into different directories, get a list of all of
    them and create a dictionary for easy lookup."""
    if self.protocol == "ftp":
        ftp = ftplib.FTP(self.server)
        try:
            ftp.login()
            ftp.cwd(self.directory)
            continents = ftp.nlst()
            for continent in continents:
                print "Downloading file list for", continent
                ftp.cwd(self.directory + "/" + continent)
                files = ftp.nlst()
                for filename in files:
                    self.filelist[self.parseFilename(filename)] = (
                        continent, filename)
        finally:
            ftp.close()
        # Add meta info
        self.filelist["server"] = self.server
        self.filelist["directory"] = self.directory
        with open(self.filelist_file, 'wb') as output:
            pickle.dump(self.filelist, output)
    else:
        self.createFileListHTTP()
def do_evaluation(LAss, opt, X, Z, TX, TZ, test_labels, train_labels, counter):
    """
    Evaluates opt on a certain parameter set.

    Parameters
    ----------
    LAss : ValidationLabAssistant
        The LAss to use for updates.
    opt : string
        The name of the experiment to use here.
    X, Z : matrix
        Feature and Target matrices of the training set, one-hot encoded.
    TX, TZ : matrix
        Feature and Target matrices of the test set, one-hot encoded.
    """
    to_eval = LAss.get_next_candidate(opt)
    step_rate = to_eval.params["step_rate"]
    momentum = to_eval.params["momentum"]
    decay = to_eval.params["decay"]
    c_wd = to_eval.params["c_wd"]
    print opt, step_rate, momentum, decay, c_wd
    with open("apsis_pars_" + opt + str(counter) + ".pkl", 'wb') as fp:
        dill.dump((LAss, opt, step_rate, momentum, decay, c_wd, counter, 0, 0), fp)
    result, n_iter = do_one_eval(X, Z, TX, TZ, test_labels, train_labels,
                                 step_rate, momentum, decay, c_wd, counter, opt)
    to_eval.result = result
    LAss.update(opt, to_eval)
    with open("apsis_pars_" + opt + str(counter) + ".pkl", 'wb') as fp:
        dill.dump((LAss, opt, step_rate, momentum, decay, c_wd, counter, n_iter, result), fp)
def create_reverse_index(self, documents_filename, common_words_filename):
    # Load reverse index if already exists, create it (and save it) otherwise.
    reverse_index_file = self.save_folder_path + self.ponderation_name + '.rev'
    if os.path.isfile(reverse_index_file):
        print 'Loading reverse index...',
        with open(reverse_index_file, 'rb') as in_strm:
            reverse_index = dill.load(in_strm)
        print 'done'
    else:
        print 'Loading raw documents...',
        # Parse the documents
        Parser = Parse_cacm('sources/cacm.all', 'sources/common_words')
        index = Parser.parse_file()
        print 'done'
        print 'Creating reverse index...',
        reverse_index = self.ponderation_method(index)
        reverse_index.other_infos['ponderation_method'] = self.ponderation_name
        reverse_index.other_infos['number_of_documents'] = len(index)
        with open(reverse_index_file, 'wb') as output:
            dill.dump(reverse_index, output, dill.HIGHEST_PROTOCOL)
        print 'done'
    return reverse_index
def __init__(self, process_obj):
    # create pipe for communication with child
    r, w = os.pipe()

    # get handle for read end of the pipe and make it inheritable
    rhandle = msvcrt.get_osfhandle(r)
    win32.SetHandleInformation(rhandle, win32.HANDLE_FLAG_INHERIT,
                               win32.HANDLE_FLAG_INHERIT)

    # start process
    cmd = getCommandLine() + [rhandle]
    cmd = " ".join('"%s"' % x for x in cmd)
    hp, ht, pid, tid = _subprocess.CreateProcess(
        sys.executable, cmd, None, None, 1, 0, None, None, None)
    os.close(r)
    ht.Close()

    # set attributes of self
    self.pid = pid
    self.returncode = None
    self._handle = hp

    # send information to child
    prep_data = getPreparationData(process_obj._name)
    to_child = os.fdopen(w, "wb")
    tls.is_spawning = True
    try:
        dump(prep_data, to_child, HIGHEST_PROTOCOL)
        dump(process_obj, to_child, HIGHEST_PROTOCOL)
    finally:
        tls.is_spawning = False
        to_child.close()
def write(obj, filename):
    os.chdir(os.path.abspath(os.path.join(os.path.dirname(__file__))))
    filename = os.path.join(os.getcwd(), filename)
    with open(filename, 'wb') as output_:
        dill.dump(obj, output_)
    print "Object saved to file: ", filename
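# Hedged read counterpart to write(), resolving the filename relative to this
# module in the same way; a sketch, not necessarily the project's own loader.
def read_sketch(filename):
    os.chdir(os.path.abspath(os.path.join(os.path.dirname(__file__))))
    filename = os.path.join(os.getcwd(), filename)
    with open(filename, 'rb') as input_:
        return dill.load(input_)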
def dump_trials(trials, path):
    print('Size of object: ' + str(len(trials)))
    print('Dumping')
    file = open(path, 'wb')
    cPickle.dump(trials, file)
    file.close()
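# Matching loader sketch for dump_trials(); assumes the same cPickle import
# used above and is not necessarily the project's own helper.
def load_trials_sketch(path):
    with open(path, 'rb') as f:
        return cPickle.load(f)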
def hotstart(self, FU, model, tTransformer1, tTransformer2=None, m_root='../models'):
    """Aggregate hotstart chunk & train FeatureUnion and Model.
    Requires pre-fit target transformer"""
    self.m_root = m_root

    print 'Extracting...'
    train = []
    for i in self.hotstart_idx:
        filename = self.chunklist[i]
        zf = zipfile.ZipFile('{}/{}'.format(self.d_root, filename))
        train += json.loads(zf.read(zf.namelist()[0]))['results']
    self.total_records += len(train)

    print 'Transforming Target...'
    # grab hotstart target
    self.tTransformer1 = tTransformer1
    self.tTransformer2 = tTransformer2
    if tTransformer2 is not None:
        target = self.tTransformer1.transform(train)
        target = self.tTransformer2.transform(target)
    else:
        target = self.tTransformer1.transform(train)

    print 'Extracting Features...'
    # fit, then transform features
    self.FU_fitted = FU.fit(train, 0)
    features = self.FU_fitted.transform(train)

    # initialize and fit the model
    print "Modeling..."
    self.model = model
    self.model.fit(features, target)

    # pickle-it
    with open('{}/{}_hotstart.pkl'.format(m_root, self.modelname), 'wb') as output:
        pickle.dump(self.model, output)

    print 'Done! {} records processed'.format(self.total_records)
    print 'Pickled as {}_hotstart'.format(self.modelname)
def persist(self):
    pkl_file_path = os.path.join(self.store_dir, "test_stored_env.pkl")
    dill.settings['byref'] = True
    # dill needs a binary file handle; 'w+' (text mode) fails under Python 3
    with open(pkl_file_path, 'wb') as f:
        dill.detect.trace(True)
        dill.dump(self, f)
def save_default_values(dsp, path):
    """
    Write Dispatcher default values in Python pickle format.

    Pickles are a serialized byte stream of a Python object.
    This format will preserve Python objects used as nodes or edges.

    :param dsp:
        A dispatcher that identifies the model adopted.
    :type dsp: schedula.Dispatcher

    :param path:
        File or filename to write.
        File names ending in .gz or .bz2 will be compressed.
    :type path: str, file

    .. testsetup::
        >>> from tempfile import mkstemp
        >>> file_name = mkstemp()[1]

    Example::

        >>> from schedula import Dispatcher
        >>> dsp = Dispatcher()
        >>> dsp.add_data('a', default_value=1)
        'a'
        >>> dsp.add_function(function=max, inputs=['a', 'b'], outputs=['c'])
        'max'
        >>> save_default_values(dsp, file_name)
    """
    import dill
    with open(path, 'wb') as f:
        dill.dump(dsp.default_values, f)
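# Hedged sketch of the inverse operation: read the pickled default values back.
# How the returned mapping is re-attached to a Dispatcher depends on schedula
# internals not shown here, so this sketch only loads and returns it.
def load_default_values_sketch(path):
    import dill
    with open(path, 'rb') as f:
        return dill.load(f)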
def send_tasks(server, futures_chunk):
    print('Sending tasks to %s' % server)
    with open('%s_tasks.pkl' % server, 'wb') as taskfile:
        pickle.dump(futures_chunk, taskfile)
    # Copy tasks file to server
    subprocess.call(['scp', '%s_tasks.pkl' % server, '%s:' % server])
    os.remove('%s_tasks.pkl' % server)
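# Hedged sketch of the receiving side: scp above copies '<server>_tasks.pkl'
# into the remote home directory, so a process there could unpickle it like
# this. Function name and file location are assumptions, not the project's API.
def receive_tasks_sketch(server):
    with open('%s_tasks.pkl' % server, 'rb') as taskfile:
        return pickle.load(taskfile)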
def __init__(self, args, siteInfo):
    self.args = args
    self.ontology = Ontology()
    self.histMark = siteInfo.histMark
    self.assayType = siteInfo.assayType

    def dd():
        return defaultdict(None)

    self.byAssemblyAssays = defaultdict(dd)
    allAssays = ["BothDNaseAnd" + self.histMark, self.histMark, "DNase"]
    print("WebEpigenomesLoader:", self.histMark, self.assayType)

    if 0:
        self._generate(allAssays)
        fn = "webEpigenomesLoader.byAssemblyAssays.{histMark}.dill".format(histMark=self.histMark)
        # outFnp = "/data/projects/encode/encyclopedia_v3/
        outFnp = os.path.join(os.path.dirname(__file__), "../../../", fn)
        with open(outFnp, 'wb') as f:
            dill.dump(self.byAssemblyAssays, f)
        print("wrote", outFnp)
    else:
        fn = "webEpigenomesLoader.byAssemblyAssays.{histMark}.dill".format(histMark=self.histMark)
        # outFnp = "/data/projects/encode/encyclopedia_v3/
        outFnp = os.path.join(os.path.dirname(__file__), "../../../", fn)
        with open(outFnp, 'rb') as f:
            self.byAssemblyAssays = dill.load(f)
        print("read", outFnp)
def sub():
    while 1:
        X, data_cor_cid = main()
        if data_cor_cid > 50:
            pickle.dump(X, open('Y.pickle', 'wb'))
            print X, data_cor_cid
            break
def mh_wrapper(pc, expt, data_params, inference_params, return_stats=False,
               results_folder='./', log_folder='./', specified_start=None):
    print(datetime.datetime.now())
    print_save_params(data_params, inference_params, results_folder)
    data = wrapper.get_data(pc, expt, data_params)
    data = wrapper.process_data(data, inference_params)
    # 1/0  # leftover debugging halt; commented out so the function can run

    if inference_params.meta_noise_type == "beta":
        data['metas'] = [np.clip(m, .01, .99) for m in data['metas']]

    if expt == "generate":
        data.save_all(results_folder)
        # dill needs a binary file handle
        dill.dump(data, open(results_folder + 'data.pkl', 'wb'))

    if pc == "mac":
        parallel = False
    else:
        parallel = True
    lview = setup_parallel(parallel)

    if inference_params.separate:
        init_state = wrapper.make_init_for_mh(inference_params.vars_to_init,
                                              data, specified_start)
        q_results = wrapper.mh_separate(data['beliefs'], data['metas'],
                                        inference_params, lview, init_state,
                                        results_folder, log_folder, return_stats)
        indiv_results = None
    else:
        if inference_params.num_chains > 1:
            sys.exit("Multiple chains not implemented for non-separate qs")
        results = wrapper.mh_across(data['beliefs'], data['metas'],
                                    inference_params, lview, results_folder,
                                    log_folder, return_stats)
        q_results, indiv_results = results

    print(datetime.datetime.now())
    return q_results, indiv_results, data
def add_tournament_replay(self, replay_file_path):
    # Ignore duplicate replays
    replay_id = hashlib.sha1(
        open(replay_file_path, mode='rb').read()
    ).hexdigest()
    if replay_id in self.replay_ids:
        self.logger.warning('Replay already exists in database - {}'.format(replay_file_path))
        return

    replay = self.parser.load_replay(replay_file_path)
    if not replay:
        return

    self.logger.warning('Adding - {}'.format(replay_file_path))
    self.replay_ids.add(replay_id)

    # Add new information to the tree
    for player in replay.players:
        node_kwargs = {
            'player_name': player.name,
            'player_race': player.play_race,
            'player_url': player.url,
            'hotkey_info': self.parser.extract_hotkey_info(replay, player),
        }
        node = Node(**node_kwargs)
        self.tree_nodes.append(node)
        self.tree.add(node)

    # Serialize
    with open(constants.PATH_REPLAY_IDS, mode='wb') as f:
        pickle.dump(self.replay_ids, f)
    with open(constants.PATH_TREE_NODES, mode='wb') as f:
        pickle.dump(self.tree_nodes, f)
def main():
    if len(sys.argv) != 3:
        usage()
    gtf, fa = sys.argv[1:3]

    try:
        # dill pickles should be read/written in binary mode
        with open('gencode_transcript_ids.pkl', 'rb') as pklf:
            transcript_ids = dill.load(pklf)
    except IOError:
        transcript_ids = set()
        gtf_file = HTSeq.GFF_Reader(gtf)
        for feature in gtf_file:
            if feature.type == 'transcript':
                transcript_ids.add(feature.attr['transcript_id'])
        with open('gencode_transcript_ids.pkl', 'wb') as pklf:
            dill.dump(transcript_ids, pklf)

    print '\n\n### Finished reading in GTF ###\n\n'

    fa_file = HTSeq.FastaReader(fa)
    fa_fout = open('output.fa', 'w')
    for fa in fa_file:
        ids = fa.name.split('|')
        if ids[0] in transcript_ids:
            fa.write_to_fasta_file(fa_fout)
    fa_fout.close()
def get_price_history(pris_id, produkt_id):
    fname = price_history_path + 'price_history_{0}_{1}.dill'.format(str(pris_id), str(produkt_id))
    if not os.path.exists(fname):
        delayer.delay()
        t = str(int(float(time.time()) * 1000))  # time stamp
        ID = str(int(int(produkt_id) / 79))[:3]  # bullshit ID
        url = ('https://www.prisjakt.nu/ajax/jsonajaxserver.php?m=get_prod_prishist&p={"pris_id":'
               + str(pris_id) + ',"produkt_id":' + str(produkt_id) + '}&t=' + t + '&id=' + ID)
        goon = True
        n_tries = 0
        while goon and n_tries < 10:
            try:
                r = requests.get(url)
                goon = False
            except:
                print 'Error number {0} in price history page'.format(str(n_tries + 1))
                sleep(30)
                n_tries = n_tries + 1
        with open(fname, 'wb') as out_file:
            dill.dump(r, out_file)
    else:
        with open(fname, 'rb') as in_file:
            r = dill.load(in_file)
    return r
def commit():
    """
    Writes the modified state to the currently being modified save file.
    """
    print('committing changes')
    with open(os.path.join('save', save + '.sav'), 'wb') as saveFile:
        dill.dump(state, saveFile)
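# Hedged counterpart to commit(): reload the state from the same save file.
# It assumes the module-level `save` name used above and returns the state
# instead of assigning the global; a sketch, not the project's own loader.
def load_state_sketch():
    with open(os.path.join('save', save + '.sav'), 'rb') as saveFile:
        return dill.load(saveFile)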
def get_summaries(title_url_map, out_name, use_pickled=False, archived=False, update_old=False, save_every=5, sleep=0): if use_pickled and os.path.exists(out_name): with open(out_name, 'rb') as f1: book_summaries = pickle.load(f1) print('loaded {} existing summaries, resuming'.format( len(book_summaries))) done = set([x.title for x in book_summaries]) else: book_summaries = [] done = set() for title, url in title_url_map.items(): title = title.replace("DeerSlayer", 'Deerslayer', 1) if title in done: continue if sleep: time.sleep(sleep) author = '' # TODO: figure this out archived_local = archived if archived: orig_url = url url = get_archived(url, update_old) print('processing', title, url) soup = get_soup(url, sleep=SLEEP) table = soup.find('div', id='block-booknavigation-3') or soup.find( 'div', id='block-block-4') # process plot summary plot_summ = None plot_cell = table.find('a', href=RE_PLOT_LINK) if plot_cell: plot_title = plot_cell.get_text() href = plot_cell['href'] if archived: plot_link = get_orig_url(href) plot_link = get_archived(plot_link, update_old) if 'archive.org' not in plot_link: # failed to retrieve archived version # archived versions of 'the-mayor-of-casterbridge' seem to be corrupted time.sleep(5.0) archived_local = False else: plot_link = urllib.parse.urljoin(url, href) if 'Chapter' not in plot_title: plot_summ = process_plot(plot_link) if not plot_summ: print(' no plot summary found', plot_link) # process section summaries cells = table.find_all('a', href=RE_SUMM_LINK) if title == "The Brothers Karamazov": cells = sort_cells(cells) section_summs = [] if not cells: print(' no section links found for', url) continue seen_sects = set() for c in cells: section_title = get_clean_text(c) section_title_chap = section_title.rsplit(':', 1)[-1] if section_title_chap in seen_sects: print(' seen {} already, skipped'.format(section_title_chap)) continue if re.match(RE_PLOT, section_title): continue if archived and archived_local: link_summ = get_orig_url(c['href']) link_summ = get_archived(link_summ, update_old) else: link_summ = urllib.parse.urljoin(url, c['href']) try: page_summs = process_story(link_summ) except AttributeError: # page failed to load, try again print(' retrying after 5 seconds...') time.sleep(5.0) page_summs = process_story(link_summ) if page_summs: section_summs.extend(page_summs) seen_sects.add(section_title_chap) if not section_summs: print(' could not find summaries for {}'.format(title)) continue book_summ = BookSummary(title=title, author=author, genre=None, plot_overview=plot_summ, source='novelguide', section_summaries=section_summs) book_summaries.append(book_summ) num_books = len(book_summaries) if num_books > 1 and num_books % save_every == 0: with open(out_name, 'wb') as f: pickle.dump(book_summaries, f) print("Done scraping {} books".format(num_books)) print('Scraped {} books from novelguide'.format(len(book_summaries))) with open(out_name, 'wb') as f: pickle.dump(book_summaries, f) print('wrote to', out_name) return book_summaries
other_arguments = other_default_arguments()
# tune_settings_dict = tuning_settings([],[],[],[])
tune_settings_dict = tuning_settings(dual_args_list, [], adapt_cov_arguments, other_arguments)
tune_dict = tuneinput_class(input_dict).singleton_tune_dict()

sampler1 = mcmc_sampler(tune_dict=tune_dict, mcmc_settings_dict=mcmc_meta,
                        tune_settings_dict=tune_settings_dict)

store_name = 'normal_fc1_sampler.pkl'
sampled = False
if sampled:
    sampler1 = pickle.load(open(store_name, 'rb'))
else:
    sampler1.start_sampling()
    with open(store_name, 'wb') as f:
        pickle.dump(sampler1, f)

# out = sampler1.start_sampling()

mcmc_samples_hidden_in = sampler1.get_samples_alt(prior_obj_name="hidden_in", permuted=False)
print(mcmc_samples_hidden_in["samples"].shape)
print(mcmc_samples_hidden_in["samples"][0, 10, 5])
print(mcmc_samples_hidden_in["samples"][1, 10, 5])
# exit()

mcmc_samples_hidden_out = sampler1.get_samples_alt(prior_obj_name="hidden_out", permuted=False)
# print(mcmc_samples_beta["indices_dict"])
# exit()
samples = mcmc_samples_hidden_in["samples"]
def matching(dillpath, n_iter): # データのロード with open("tmp/dills/" + dillpath + "parsed_test.dill", "rb") as f: prunned = dill.load(f) keys = prunned.keys() size = 0 # データの次元. 2*オブジェクト数 goal = 0 # 終了状態のステップ数 datas = {} for filename in keys: datas[filename] = [] goal = prunned[filename][-1] logTestName = dillpath[11] filepath = "tmp/log_test_" + logTestName + "/" + filename + ".csv" with open(filepath, "r", encoding="utf-8") as f: while True: line = f.readline().split(",") if len(line) < 2: break if size == 0: size = len(line[5:-1]) datas[filename].append([float(l) for l in line[5:-1]]) # 共通境界推定 output = {} before = {} for filename in keys: output[filename] = [] before[filename] = prunned[filename][0] while True: # 現段階の before を保存 for filename in keys: output[filename].append(before[filename]) after = {} for fn in keys: # before+e より大きい最小の step # before+e 以降に境界がないなら終了状態にする later = [s for s in prunned[fn] if s > before[fn]+e \ and isDefferent(datas[fn][before[fn]], datas[fn][s])] if len(later) > 0: after[fn] = later[0] else: after[fn] = prunned[fn][-1] for filename in before.keys(): print(str(before[filename]) + "\t--> " + str(after[filename])) # sleep(10) # 終了条件 flagList = [a == goal for a in after.values()] flag = reduce(lambda x, y: x and y, flagList) if flag == True: # 最終結果を出力する for filename in keys: output[filename].append(after[filename]) break # --------------------------------------------------- # 要するに,この部分を一回しかやらないのが間違ってる # ・サンプリングにより predict を取得 # ・predict を after にしてもう一度サンプリング # ・predict = after になるまで繰り返す # ・predict を output に追加 # ・before <- predict while True: # サンプリング学習を n_iter 回行う modelList = [] for _ in range(n_iter): # size 個サンプリングして連立方程式を解く keyList = list(keys) np.random.shuffle(keyList) X = [] y = [] for k in keyList[:size]: X.append(datas[k][before[k]]) y.append(datas[k][after[k]]) # ここの転置忘れてた X = np.array(X).T y = np.array(y).T Xinv = np.linalg.inv(X) A = y.dot(Xinv) # 解いた結果の A で全データに対して再現精度を求める res = 0 for k in keys: b = np.array(datas[k][before[k]]) a = np.array(datas[k][after[k]]) r = A.dot(b) res += np.linalg.norm(r - a) modelList.append((A, 1.0 / res)) sumexp = sum([m[1] for m in modelList]) # 次を推定する predict = {} for filename in keys: predict[filename] = [] for m in modelList: beforeData = datas[filename][before[filename]] beforeData = np.array(beforeData) predict[filename].append(m[0].dot(beforeData)) predict[filename][-1] *= m[1] / sumexp predict[filename] = sum(predict[filename]) # 推定結果に最も近い状態を datas から取得 selected = {} for filename in keys: # 各ステップの状態との距離を計算する p = np.array(predict[filename]) d = datas[filename] distList = [np.linalg.norm(np.array(l) - p) for l in d] # before ステップ以降のみを対象にする distList = distList[before[filename] + e:] # after を選ぶ段階で else によって 499 になっている場合 # [before+e:] に要素が存在しない.その時は # そのまま before を返す if len(distList) == 0: selected[filename] = before[filename] continue selected[filename] = distList.index(min(distList)) # before+e ステップ分抜かしているので足しておく selected[filename] += before[filename] + e # after == selected なら break if after == selected: break # after = predict にして元に戻す after = selected # --------------------------------------------------- # before <- selected before = selected with open("tmp/dills/" + dillpath + "matching.dill", "wb") as f: dill.dump(output, f) return output
f.addSentence(["今日", "も", "また", "人", "が", "死んだよ"])
f.addSentence(["今日", "も", "また", "雨", "が", "降ったよ"])
f.toPrint()
f.eliminateSentence(["今日", "も", "また", "雨", "が", "降ったよ"])
f.toPrint()
test = ["今日", "も", "また", "雨", "が", "降ったよ"]
for i in range(1, 11):
    print(f.changeBoundary(test, i))
"""

data = {}
data["1"] = ["りんごぶどうみかんばななもも"]
data["2"] = ["ももみかんばななりんごぶどう"]
data["3"] = ["ぶどうばななみかんりんごもも"]
data["4"] = ["ばななりんごみかんももぶどう"]
data["5"] = ["みかんももばななりんごぶどう"]
# print(f.reverseSentences(data))

for i in range(1):
    with open("tmp/RefactedRest_result.dill", "wb") as g:
        ptime = datetime.now().timestamp()
        dill.dump(f.executeParsing(data, 300), g)
        ptime = datetime.now().timestamp() - ptime

f.toPrint()
res = f.debug_result()
for r in res:
    print(r + "\t\t: " + str(res[r]))
def main(**kwargs):
    logger.info("Your params:")
    logger.info(kwargs)

    # check compatibility if training is continued from previously saved model
    if kwargs['init_from'] is not None:
        logger.info("Check if I can restore model from {0}".format(kwargs['init_from']))
        # check if all necessary files exist
        assert os.path.isdir(kwargs['init_from']), "%s must be a a path" % kwargs['init_from']
        assert os.path.isfile(os.path.join(kwargs['init_from'], "config.pkl")), \
            "config.pkl file does not exist in path %s" % kwargs['init_from']
        assert os.path.isfile(os.path.join(kwargs['init_from'], "textdata.pkl")), \
            "textdata.pkl file does not exist in path %s" % kwargs['init_from']
        ckpt = tf.train.get_checkpoint_state(kwargs['init_from'])
        assert ckpt, "No checkpoint found"
        assert ckpt.model_checkpoint_path, "No model path found in checkpoint"

        # open old config and check if models are compatible
        with open(os.path.join(kwargs['init_from'], 'config.pkl'), 'rb') as f:
            saved_model_args = dill.load(f)
        need_be_same = ["cell_type", "num_hidden", "num_layers", "num_samples", "max_vocab_size"]
        for checkme in need_be_same:
            assert saved_model_args[checkme] == kwargs[checkme], \
                "Command line argument and saved model disagree on '%s' " % checkme

        logger.info("Args checker. Load TextData")
        # open saved TextData
        textdata = TextData.load(os.path.join(kwargs['init_from'], 'textdata.pkl'))
    else:
        textdata = TextData(kwargs['data_path'], max_len=kwargs['max_len'],
                            max_vocab_size=kwargs['max_vocab_size'])

    logger.info("Save config and textdata.")
    with open(os.path.join(kwargs['save_dir'], 'config.pkl'), 'wb') as f:
        dill.dump(kwargs, f)
    TextData.save(textdata, os.path.join(kwargs['save_dir'], 'textdata.pkl'))

    # Make triples.
    logger.info("Making triples")
    triples = textdata.make_triples(textdata.dataset)
    logger.info("Number of triples: {0}".format(len(triples[0])))

    decay_steps = len(triples[0])
    vocab_size = len(textdata.vocab)
    logger.info("actual vocab_size={0}".format(vocab_size))

    model = SkipthoughtModel(kwargs['cell_type'], kwargs['num_hidden'], kwargs['num_layers'],
                             kwargs['embedding_size'], vocab_size, kwargs['learning_rate'],
                             kwargs['decay_rate'], decay_steps, kwargs['grad_clip'],
                             kwargs['num_samples'], kwargs['max_len'])

    with tf.Session() as sess:
        init = tf.initialize_all_variables()
        sess.run(init)
        saver = tf.train.Saver(tf.all_variables(), max_to_keep=20)

        if kwargs['init_from'] is not None:
            saver.restore(sess, ckpt.model_checkpoint_path)
            print("Restored from {0}".format(ckpt.model_checkpoint_path))

        num_batches = len(triples[0]) // kwargs['batch_size']
        loss_history = []
        for e in range(kwargs['num_epochs']):
            it = textdata.triples_data_iterator(triples[0], triples[1], triples[2],
                                                textdata.max_len, kwargs['batch_size'],
                                                shuffle=True)
            for b, batch in enumerate(it):
                train_op, loss, feed_dict = model.train_step(*batch)

                start_time = time.time()
                batch_loss, _ = sess.run([loss, train_op], feed_dict=feed_dict)
                batch_perplexity = math.exp(float(batch_loss)) if batch_loss < 300 else float("inf")
                end_time = time.time()

                loss_history.append(batch_loss)
                if b % kwargs['verbose'] == 0:
                    print("{}/{} (epoch {}), train_loss = {:.3f}, perplexity = {:.3f}, "
                          "time/batch = {:.3f}"
                          .format(e * num_batches + b, kwargs['num_epochs'] * num_batches, e,
                                  batch_loss, batch_perplexity, end_time - start_time))

                if (e * num_batches + b) % kwargs['save_every'] == 0 \
                        or (e == kwargs['num_epochs'] - 1 and b == num_batches - 1):
                    # save for the last result
                    checkpoint_path = os.path.join(kwargs['save_dir'], 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=e * num_batches + b)
                    with open(os.path.join(kwargs['save_dir'], 'loss_history.pkl'), 'wb') as f:
                        dill.dump(loss_history, f)
                    print("model & loss_history saved to {}".format(checkpoint_path))
                   train=False, transform=transform),
    batch_size=args.test_batch_size, shuffle=True, **kwargs)

# Construct fingerprint patterns

# Choose xs
fp_dx = [np.random.rand(1, 1, 28, 28) * args.eps for i in range(args.num_dx)]

# fp_dx = [np.zeros(1,1,28,28)*args.eps for i in range(args.num_dx)]
# for i in range(args.num_dx):
#     k,l = random.randint(0,27), random.randint(0,27)

pickle.dump(fp_dx, open(os.path.join(args.log_dir, "fp_inputs_dx.pkl"), "wb"))

# Target ys
# num_target_classes x num_perturb x num_class
fp_target = -0.2357 * np.ones((args.num_class, args.num_dx, args.num_class))

for j in range(args.num_dx):
    for i in range(args.num_class):
        fp_target[i, j, i] = 0.7

pickle.dump(fp_target, open(os.path.join(args.log_dir, "fp_outputs.pkl"), "wb"))

fp_target = util.np2var(fp_target, args.cuda)

fp = Fingerprints()
def save(file, data, format=None, overwrite=False): """Save `data`. First the function checks if :param:data defines a `save()` method; if so, the method is called as `save(output_path)`. If this is successful, the function terminates. If the call is not successful, or :param:data does not define a `save()` method, then the function attempts to save to the formats defined by `format`. By default, only the 'numpy_repr' representation is saved, if `data` defines a numpy representation. Not only is the numpy representation format more future-proof, it can be an order of magnitude more compact. If the numpy_repr save is unsuccessful (possibly because `data` does not provide a `numpy_repr` method), then `save()` falls back to saving a plain (dill) pickle of 'data'. Parameters ---------- file: str Path name or file object. Note that the file extension is mostly ignored and will be replaced by the one associated with the format. This is to allow saving to multiple formats. data: Python object Data to save format: str The format in which to save the data. Possible values are: - 'npr' (default) Save with the numpy_repr format. This is obtained by calling the method 'nprepr' on the `data`. If this call fails, a warning is issued and the 'dill' format is used. Output file have the extension 'npr'. Objects using this format should implement the `from_nprepr` method. - 'repr' Call `repr` on the data and save the resulting string to file. The save will fail (and fall back to 'dill' format) if the `repr` is simply inherited from object, as simply saving the object address is not useful for reconstructing it. Still, there is no way of ensuring that the `repr` is sufficiently informative to reconstruct the object, so make sure it is before using this format. Output file have the extension 'repr'. Objects using this format should implement the `from_repr` method. - 'dill' A dill pickle. Output file has the extension 'dill' Formats can also be combined as e.g. 'npr+dill'. overwrite: bool If True, allow overwriting previously saved files. Default is false, in which case a number is appended to the filename to make it unique. Returns ------- List of output paths. List because many formats may be specified, leading to multiple outputs. """ if isinstance(format, str): selected_formats = format else: if format is None: typename = find_registered_typename(type(data)) else: if not isinstance(format, type): logger.error("The `format` argument should be either a string " "or type. 
Provided value: {}" "Attempting to infer type from data".format(format)) typename = find_registered_typename(type(data)) typename = find_registered_typename(format) if typename in _format_types: format = _format_types[typename] else: logger.error("Type '{}' has no associated format".format(typename)) format = 'npr' selected_formats = set(format.split('+')) # Check argument - format bad_formats = [f for f in selected_formats if f not in defined_formats] selected_formats = selected_formats.difference(bad_formats) if len(bad_formats) > 0: format_names = ["'" + f + "'" for f in defined_formats] bad_format_names = ["'" + f + "'" for f in bad_formats] formatstr = "format" if len(format_names) > 1: format_names = ", ".join(format_names[:-1]) + " and " + format_names[-1] if len(bad_format_names) > 1: formatstr = "formats" bad_format_names = ", ".join(bad_format_names[:-1]) + " and " + bad_format_names[-1] logger.warning("Unrecognized save {} {}.".format(formatstr, bad_format_names) + "Recognized formats are " + format_names) if len(selected_formats) == 0: logger.warning("Setting the format to {}.".format_names) # We don't want to throw away the result of a long calculation because of a # flag error, so instead we will try to save into every format and let the user # sort out the files later. format = '+'.join(format_names) get_output = None def set_str_file(filename): nonlocal get_output def _get_output(filename, ext, bytes, overwrite): return output(filename, ext, bytes, overwrite) get_output = _get_output # Check argument - file if isinstance(file, io.IOBase): thisfilename = os.path.realpath(file.name) if 'luigi' in os.path.basename(thisfilename): # 'file' is actually a Luigi temporary file luigi = True else: luigi = False filename = thisfilename # thisfilename used to avoid name clashes if not any(c in file.mode for c in ['w', 'x', 'a', '+']): logger.warning("File {} not open for writing; closing and reopening.") file.close() set_str_file(thisfilename) else: def _get_output(filename, ext, bytes, overwrite): # Check that the file object is compatible with the arguments, # and if succesful, just return the file object unmodified. # If it is not successful, revert to opening a file as though # a filename was passed to `save`. # TODO: Put checks in `dummy_file_context` fail = False if (os.path.splitext(os.path.realpath(filename))[0] != os.path.splitext(os.path.realpath(thisfilename))[0]): logger.warning("[iotools.save] Given filename and file object differ.") fail = True thisext = os.path.splitext(thisfilename)[1].strip('.') if not luigi and thisext != ext.strip('.'): # Luigi adds 'luigi' to extensions of temporary files; we # don't want that to trigger closing the file logger.warning("[iotools.save] File object has wrong extension.") fail = True if (bytes and 'b' not in file.mode or not bytes and 'b' in file.mode): if luigi: # Luigi's LocalTarget always saves to bytes, and it's # the Format class that takes care of converting data # (possibly text) to and back from bytes. logger.warning("\n" "WARNING [iotools]: Attempted to save a 'luigi' target with the wrong " "mode (binary or text). Note that Luigi targets " "always use the same mode internally; use the " "`format` argument to convert to/from in your code. " "In particular, LocalTarget writes in binary. 
" "Consequently, the file will not be saved as {}, " "but as {}; specify the correct value to `bytes` " "to avoid this message.\n" .format("bytes" if bytes else "text", "text" if bytes else "bytes")) else: logger.warning("[iotools.save] File object has incorrect byte mode.") fail = True if (overwrite and 'a' in file.mode): # Don't check for `not overwrite`: in that case the damage is already done logger.warning("[iotools.save] File object unable to overwrite.") fail = True if fail: logger.warning("[iotools.save] Closing and reopening file object.") file.close() set_str_file(thisfilename) return output(filename, ext, bytes, overwrite) else: return dummy_file_context(file) get_output = _get_output else: assert isinstance(file, PathTypes) filename = file set_str_file(file) # Ensure target directory exists dirname = os.path.dirname(filename) if dirname != "": os.makedirs(dirname, exist_ok=True) output_paths = [] # If data provides a "save" method, use that # This overrides the "format" argument – only exception is if save fails, # then we reset it to what it was and try the other formats if isinstance(data, ParameterSet): # Special case of data with `save` attribute _selected_formats_back = selected_formats selected_formats = [] # Don't save to another format if successful with get_output(filename, ext="", bytes=False, overwrite=overwrite) as (f, output_path): # Close the file since Parameters only accepts urls as filenames # FIXME: This introduces a race condition; should use `f` to save # This would require fixing the parameters package to # accept file objects in `save()` pass try: logger.info("Saving ParameterSet using its own `save` method...") data.save(output_path, expand_urls=True) except (AttributeError, PermissionError) as e: logger.warning("Calling the data's `save` method failed with '{}'." .format(str(e))) selected_formats = _selected_formats_back else: output_paths.append(output_path) elif hasattr(data, 'save'): _selected_formats_back = selected_formats selected_formats = [] # Don't save to another format if successful # See if this type is in the registered formats, so we can get the # expected extension typename = find_registered_typename(data) # Always returns a type name: if none is found, returns that of data format = _format_types.get(typename, None) if format is None or format not in defined_formats: ext = "" else: ext = defined_formats[format].ext with get_output(filename, ext=ext, bytes=False, overwrite=overwrite) as (f, output_path): # TODO: Use `f` if possible, and only `output_path` if it fails. pass try: logger.info("Saving data using its own `save` method...") data.save(output_path) except (AttributeError, PermissionError) as e: logger.warning("Calling the data's `save` method failed with '{}'." 
.format(str(e))) selected_formats = _selected_formats_back else: output_paths.append(output_path) # Save to all specified formats for name, formatinfo in defined_formats.items(): if name in ('npr', 'repr', 'brepr', 'dill'): # TODO: Define the save functions below at top level of module # and treat these formats as any other # Make sure 'dill' is still used as backup continue if name in selected_formats: if formatinfo.save is None: logger.error("Format '{}' does not define a save function" .format(name)) fail = True else: fail = False ext = formatinfo.ext try: with get_output(filename, ext, formatinfo.bytes, overwrite) as (f, output_path): formatinfo.save(f, data) except IOError: fail = True except Exception as e: logger.error("Silenced uncaught exception during saving process to attempt another format.") logger.error("Silenced exception was: " + str(e)) fail = True else: output_paths.append(output_path) if fail: try: os.remove(output_path) # Ensure there are no leftover files except: pass logger.warning("Unable to save to {} format." .format(name)) if 'dill' not in selected_formats: # Warn the user that we will use another format logger.warning("Will try a plain (dill) pickle dump.") selected_formats.add('dill') # Save data as numpy representation if 'npr' in selected_formats: fail = False ext = defined_formats['npr'].ext try: with get_output(filename, ext, True, overwrite) as (f, output_path): try: logger.info("Saving data to 'npr' format...") np.savez(f, **data.repr_np) except AttributeError: fail = True else: output_paths.append(output_path) except IOError: fail = True if fail: # TODO: Use custom error type try: os.remove(output_path) # Ensure there are no leftover files except: pass logger.warning("Unable to save to numpy representation ('npr') format.") if 'dill' not in selected_formats: # Warn the user that we will use another format logger.warning("Will try a plain (dill) pickle dump.") selected_formats.add('dill') # Save data as representation string ('repr' or 'brepr') for format in [format for format in selected_formats if format in ('repr', 'brepr')]: bytes = (format == 'brepr') fail = False if data.__repr__ is object.__repr__: # Non-informative repr -- abort fail = True else: ext = defined_formats['repr'].ext try: with get_output(filename, ext=ext, bytes=bytes, overwrite=overwrite) as (f, output_path): try: logger.info("Saving data to plain-text 'repr' format'") f.write(repr(data)) except: fail = True else: output_paths.append(output_path) except IOError: fail = True if fail: try: os.remove(output_path) # Ensure there are no leftover files except: pass logger.warning("Unable to save to numpy representation ('npr') format.") if 'dill' not in selected_formats: # Warn the user that we will use another format logger.warning("Will try a plain (dill) pickle dump.") selected_formats.add('dill') # Save data in dill format if 'dill' in selected_formats: ext = defined_formats['dill'].ext try: with get_output(filename, ext, True, overwrite) as (f, output_path): logger.info("Saving data as a dill pickle.") dill.dump(data, f) output_paths.append(output_path) except IOError: # There might be other things to save, so don't terminate # execution because this save failed try: os.remove(output_path) # Ensure there are no leftover files except: pass logger.warning("Unable to save picke at location {}." .format(output_path)) # Return the list of output paths return [Path(path) for path in output_paths]
def handle(self): "Called by TCPServer for each client connection request" try: while True: msg = custompickle.load(self.rfile); #logging.debug("ROProxy {} {:0.20f}".format(msg, time.time())); #First message from client stub, check if object exists or not. if(msg == ROMessages._INIT_): robjName = custompickle.load(self.rfile); #logging.debug("_INIT_ message to look for object {}".format(robjName)); if(ROMgrObj.has(robjName)): self.obj = ROMgrObj.get(robjName, self); #On success, send the id of the proxy. custompickle.dump(id(self), self.wfile); self.wfile.flush(); self._robjName = robjName; else: logging.warning("_INIT_ message object {} not found".format(robjName)); custompickle.dump(ROMessages._NOT_FOUND_, self.wfile); self.wfile.flush(); #Check if the return should be compressed or not. elif(msg != ROMessages._COMPRESS_): #logging.debug("RemoteMethod: {} is not a compress directive.".format(msg)); #Request for an attribute if(msg == ROMessages._GET_ATTRIBUTE_): item = custompickle.load(self.rfile); try: val = self.obj.__getattribute__(item); custompickle.dump(None,self.wfile); custompickle.dump(val, self.wfile); self.wfile.flush(); except Exception as e: #An exception occured. send traceback info the client stub. custompickle.dump(sys.exc_info(), self.wfile);self.wfile.flush(); #Regular client stub messages contain the name of the function to be invoked and any arguments. else: #logging.debug("ROProxy {} reading args time {:0.20f}".format(msg, time.time())); args = custompickle.load(self.rfile); kwargs = custompickle.load(self.rfile); #logging.debug("ROProxy {} read args time {:0.20f}".format(msg, time.time())); #Execute the function locally and send back any results/exceptions. try: #Execute the local function, store the results. func = self.obj.__getattribute__(msg); if(inspect.ismethod(func)): result = func(*args, **kwargs); args = kwargs = None; else: #This is probably a property, in which case we already have the value, return it. result = func; #logging.debug("ROProxy {} local result time {:0.20f}".format(msg, time.time())); #No exception to report. custompickle.dump(None,self.wfile);#self.wfile.flush(); #logging.debug("ROProxy {} exception send time {:0.20f}".format(msg, time.time())); #Return the results. custompickle.dump(result, self.wfile); self.wfile.flush(); #logging.debug("ROProxy {} result send time {:0.20f}".format(msg, time.time())); #Hand shake to make sure this function scope is active till the other side has setup remote object stubs if any #the contents of this message is irrelevant to us. #NOT REQUIRED: this object reference (result) is alive in this space till next remote function call reaches it. #custompickle.load(self.rfile); except Exception as e: #An exception occured. send traceback info the client stub. custompickle.dump(sys.exc_info(), self.wfile);self.wfile.flush(); else: msg = custompickle.load(self.rfile); #logging.debug("RemoteMethod : request for compressing {}".format(msg)); #Request for an attribute if(msg == ROMessages._GET_ATTRIBUTE_): item = custompickle.load(self.rfile); try: val = self.obj.__getattribute__(item); custompickle.dump(None, self.wfile); self.wfile.flush(); AConfig.NTWKCHANNEL.transmit(val, self.wfile); except Exception as e: #An exception occured. send traceback info the client stub. custompickle.dump(sys.exc_info(), self.wfile);self.wfile.flush(); #Regular client stub messages contain the name of the function to be invoked and any arguments. 
else: #logging.debug("ROProxy {} reading args time {:0.20f}".format(msg, time.time())); args = custompickle.load(self.rfile); kwargs = custompickle.load(self.rfile); #logging.debug("ROProxy {} read args time {:0.20f}".format(msg, time.time())); #Execute the function locally and send back any results/exceptions. try: #Execute the local function, store the results. func = self.obj.__getattribute__(msg); if(inspect.ismethod(func)): result = func(*args, **kwargs); args = kwargs = None; else: #This is probably a property, in which case we already have the value, return it. result = func; #logging.debug("ROProxy {} local result time {:0.20f}".format(msg, time.time())); #No exception to report. custompickle.dump(None,self.wfile);self.wfile.flush(); #logging.debug("ROProxy {} exception send time {:0.20f}".format(msg, time.time())); #Return the results. AConfig.NTWKCHANNEL.transmit(result, self.wfile); #logging.debug("ROProxy {} result send time {:0.20f}".format(msg, time.time())); #Hand shake to make sure this function scope is active till the other side has setup remote object stubs if any #the contents of this message is irrelevant to us. #NOT REQUIRED: this object reference (result) is alive in this space till next remote function call reaches it. #custompickle.load(self.rfile); except Exception as e: #An exception occured. send traceback info the client stub. custompickle.dump(sys.exc_info(), self.wfile);self.wfile.flush(); #logging.debug("ROProxy {} exit time {:0.20f}".format(msg, time.time())); except EOFError: pass;
def save(self, fnm):
    with open(fnm, "wb") as f:
        pickle.dump((self.serialize(), self.stats), f)
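# Hedged load counterpart for the save() above: it unpickles the
# (serialized, stats) pair, but rebuilding the object from `serialized`
# depends on a deserialize routine that is not shown here.
def load_sketch(fnm):
    with open(fnm, "rb") as f:
        serialized, stats = pickle.load(f)
    return serialized, stats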
args = parser.parse_args()
catalog = load_catalog(CATALOG_NAME)
if args.full:
    title_set = None
else:
    print('limiting to books from', CATALOG_NAME)
    title_set = set(catalog.keys())

if args.archived_list:
    books_list = get_archived(BOOKS_LIST)
else:
    books_list = BOOKS_LIST
title_url_map = get_title_url_map(books_list, title_set=title_set)
print('{} book pages total'.format(len(title_url_map)))

book_summaries = get_summaries(title_url_map, args.out_name, args.use_pickled,
                               args.archived, args.update_old, args.save_every,
                               args.sleep)
# with open(args.out_name, 'rb') as f:
#     book_summaries = pickle.load(f)

book_summaries_overlap = gen_gutenberg_overlap(book_summaries, catalog, filter_plays=True)
book_summaries_overlap = manual_fix(book_summaries_overlap)
book_summaries_overlap = manual_fix_individual(book_summaries_overlap)

with open(args.out_name_overlap, 'wb') as f:
    pickle.dump(book_summaries_overlap, f)
print('wrote to {}'.format(args.out_name_overlap))
def main(): """Runs the experiment.""" parser = argparse.ArgumentParser( description='Set up searching for sub-types to detect.') # positional command line arguments parser.add_argument('cohort', type=str, help='a TCGA cohort') parser.add_argument('classif', type=str, help='a classifier in HetMan.predict.classifiers') parser.add_argument('base_gene', type=str, help='a gene to cross with respect to') # optional command line arguments controlling the thresholds for which # individual mutations and how many genes' mutations are considered parser.add_argument('--freq_cutoff', type=int, default=10, help='sub-type sample frequency threshold') parser.add_argument('--max_genes', type=int, default=20, help='maximum number of mutated genes to consider') # optional command line arguments for what kinds of mutation sub-types to # look for in terms of properties and number of mutations to combine parser.add_argument( '--mut_levels', type=str, nargs='+', default=['Form_base', 'Exon', 'Protein'], help='the mutation property levels to consider in addition to `Genes`') parser.add_argument( '--comb_size', type=int, default=3, help='maximum number of individual mutations to combine' 'when searching for mutation sub-types') # optional command line argument controlling verbosity parser.add_argument('--verbose', '-v', action='store_true', help='turns on diagnostic messages') # parse the command line arguments, get the directory where found sub-types # will be saved for future use args = parser.parse_args() out_path = os.path.join(base_dir, 'output', args.cohort, args.classif, 'add', args.base_gene) if args.verbose: print("Looking for mutation sub-types in cohort {} composed of at " "most {} individual mutations with at least {} " "samples in total.\n".format(args.cohort, args.comb_size, args.freq_cutoff)) # log into Synapse using locally-stored credentials syn = synapseclient.Synapse() syn.cache.cache_root_dir = ("/home/exacloud/lustre1/CompBio/" "mgrzad/input-data/synapse") syn.login() # load the expression matrix for the given cohort from Broad Firehose, # load the MC3 variant call set from Synapse, find the mutations for the # samples that are in both datasets expr_data = get_expr_firehose(args.cohort, firehose_dir) mc3_data = get_variants_mc3(syn) expr_mc3 = mc3_data.loc[mc3_data['Sample'].isin(expr_data.index), :] gene_mc3 = expr_mc3.loc[expr_mc3['Gene'] == args.base_gene, :] expr_mc3 = expr_mc3.loc[~expr_mc3['Sample'].isin(gene_mc3['Sample']), :] # get the genes whose mutations appear in enough samples to pass the # frequency threshold gene_counts = expr_mc3.groupby(by='Gene').Sample.nunique() count_cutoff = int(args.freq_cutoff / args.comb_size) common_genes = set(gene_counts.index[gene_counts >= count_cutoff]) if args.verbose: print("Found {} candidate genes with at least {} potential " "mutated samples.".format(len(common_genes), count_cutoff)) if len(common_genes) >= args.max_genes: gene_counts = gene_counts[common_genes].sort_values(ascending=False) common_genes = set(gene_counts[:args.max_genes].index) if args.verbose: print("Too many genes found, culling list to {} genes which each " "have at least {} mutated samples.".format( args.max_genes, min(gene_counts[common_genes]))) cdata = VariantCohort(cohort=args.cohort, mut_genes=common_genes, mut_levels=['Gene'] + args.mut_levels, expr_source='Firehose', data_dir=firehose_dir, cv_prop=1.0, syn=syn) # intializes the list of found sub-types and the list of samples each # sub-type appears in use_mtypes = set() use_sampsets = set() search_level = 1 
    break_status = False

    # while we have not reached the limit of sub-type enumeration and have not
    # run out of property level combinations to test...
    while (len(use_mtypes) < 6000
           and not break_status
           and search_level <= 2 ** len(args.mut_levels)):

        # try a list of property level combinations and number of individual
        # variants to combine, where the complexity of the level combination
        # plus the variant count is held constant
        for lvl_combn, comb_size in zip(
                rev_powerset_slice(args.mut_levels, search_level),
                range(1, min(search_level + 1, args.comb_size + 1))):
            use_lvls = ['Gene'] + list(lvl_combn)

            if args.verbose:
                print("\nLooking for sub-types that are combinations "
                      "of {} mutation(s) at levels {}...\n".format(
                          comb_size, use_lvls))

            # enumerates the sub-types consisting of a combination of the
            # given number of individual mutations at the given property
            # levels
            sub_mtypes = cdata.train_mut.combtypes(
                comb_sizes=(comb_size, ), sub_levels=use_lvls,
                min_type_size=args.freq_cutoff)

            # finds the samples belonging to each enumerated sub-type that
            # hasn't already been found
            mtype_sampsets = {
                mtype: frozenset(mtype.get_samples(cdata.train_mut))
                for mtype in sub_mtypes - use_mtypes
                }

            # removes the sub-types with so many mutated samples that there
            # are not enough negatively-labelled samples for classification
            mtype_sampsets = {
                mtype: sampset for mtype, sampset in mtype_sampsets.items()
                if len(sampset) <= (len(cdata.samples) - args.freq_cutoff)
                }

            sub_mtypes = sorted(list(mtype_sampsets))
            if args.verbose:
                print("Found {} new sub-types!\n".format(len(sub_mtypes)))

            # if the list of remaining sub-types isn't too long...
            if len(sub_mtypes) < 5000:
                add_mtypes = set()

                for i, mtype in enumerate(sub_mtypes):
                    if args.verbose and (i % 200) == 100:
                        print("\nchecked {} sub-types\n".format(i))

                    # ...we remove each one whose set of mutated samples is
                    # identical to that of a sub-type that was already found
                    if mtype_sampsets[mtype] in use_sampsets:
                        if args.verbose:
                            print("Removing functionally duplicate "
                                  "MuType {}".format(mtype))

                    else:
                        add_mtypes.update({mtype})
                        use_sampsets.update({mtype_sampsets[mtype]})

                use_mtypes |= add_mtypes

            elif len(sub_mtypes) > 60000:
                break_status = True

        search_level += 1

    if args.verbose:
        print("\nFound {} total sub-types!".format(len(use_mtypes)))

    # save the list of found non-duplicate sub-types to file
    pickle.dump(sorted(list(use_mtypes)),
                open(os.path.join(out_path, 'tmp/mtype_list.p'), 'wb'))
return train_count, test_count, correct_count, output if __name__ == '__main__': opts = utils.getopt_for_naocanzhujiao(sys.argv[1:]) if '-g' in opts: print('Group L01: Azeri') print('Guo Yanzhe, 2571732') print('Zhai Fangzhou, 2566641') print('Zhu Dawei, 2549931') exit(0) if '-tr' in opts: ''' update train file path ''' train_file, test_file = dill.load(open(config.config_file, 'rb')) dill.dump((opts['-tr'], test_file), open(config.config_file, 'wb')) if '-te' in opts: ''' update test file path ''' train_file, test_file = dill.load(open(config.config_file, 'rb')) dill.dump((train_file, opts['-te']), open(config.config_file, 'wb')) if '-a' in opts: ''' perform task 1 and evaluate accuracy ''' # load data train_file, test_file = dill.load(open(config.config_file, 'rb')) train_data = utils.load_data(train_file) test_data = utils.load_data(test_file) ''' perform inflection ''' tr_c, te_c, co_c, _ = batch_inflect(train_data, test_data) ''' output accuracy ''' print('trained on: ' + train_file) print('- training instances : ' + str(tr_c))
def main(): parser = argparse.ArgumentParser( 'merge_test', description="Concatenates all of the output of an experiment.") # collect command line arguments parser.add_argument('use_dir', type=str) args = parser.parse_args() # load list of subgrouping tasks for this experiment with open(os.path.join(args.use_dir, 'setup', "muts-list.p"), 'rb') as f: muts_list = pickle.load(f) # concatenate cohort mutated statuses for each subgrouping pheno_dict = dict() for pheno_file in Path(args.use_dir, 'merge').glob("out-pheno_*.p.gz"): with bz2.BZ2File(pheno_file, 'r') as fl: pheno_dict.update(pickle.load(fl)) assert sorted(muts_list) == sorted(pheno_dict.keys()), ( "Tested mutations missing from list of mutations' sample statuses!") assert len({ len(phns) for phns in pheno_dict.values() }) == 1, ("Inconsistent number of samples across mutation phenotype data!") with bz2.BZ2File(os.path.join(args.use_dir, "out-pheno.p.gz"), 'w') as fl: pickle.dump(pheno_dict, fl, protocol=-1) # concatenate coefficient values for each subgrouping classification model coef_df = pd.DataFrame() for coef_file in Path(args.use_dir, 'merge').glob("out-coef_*.p.gz"): with bz2.BZ2File(coef_file, 'r') as fl: coef_data = pickle.load(fl) coef_df = pd.concat([coef_df, coef_data.sort_index(axis=1)]) assert sorted(muts_list) == sorted(coef_df.index), ( "Tested mutations missing from merged classifier coefficients!") with bz2.BZ2File(os.path.join(args.use_dir, "out-coef.p.gz"), 'w') as fl: pickle.dump(coef_df, fl, protocol=-1) # concatenate predicted labels made by each subgrouping model pred_df = pd.DataFrame() for pred_file in Path(args.use_dir, 'merge').glob("out-pred_*.p.gz"): with bz2.BZ2File(pred_file, 'r') as fl: pred_data = pickle.load(fl) pred_df = pd.concat([pred_df, pred_data]) assert sorted(muts_list) == sorted(pred_df.index), ( "Tested mutations missing from merged classifier predictions!") with bz2.BZ2File(os.path.join(args.use_dir, "out-pred.p.gz"), 'w') as fl: pickle.dump(pred_df, fl, protocol=-1) # concatenate subgrouping model tuning performances tune_dfs = [pd.DataFrame() for _ in range(3)] + [None] for tune_file in Path(args.use_dir, 'merge').glob("out-tune_*.p.gz"): with bz2.BZ2File(tune_file, 'r') as fl: tune_data = pickle.load(fl) if tune_dfs[3] is None: tune_dfs[3] = tune_data[3] else: assert tune_dfs[3] == tune_data[3], ( "Inconsistent mutation classifiers between gather tasks!") for i in range(3): tune_dfs[i] = pd.concat([tune_dfs[i], tune_data[i]]) for i in range(3): assert sorted(muts_list) == sorted(tune_dfs[i].index), ( "Tested mutations missing from merged tuning statistics!") with bz2.BZ2File(os.path.join(args.use_dir, "out-tune.p.gz"), 'w') as fl: pickle.dump(tune_dfs, fl, protocol=-1) # concatenate subgrouping model testing performances auc_df = pd.DataFrame() for auc_file in Path(args.use_dir, 'merge').glob("out-aucs_*.p.gz"): with bz2.BZ2File(auc_file, 'r') as fl: auc_data = pickle.load(fl) auc_df = pd.concat([auc_df, pd.DataFrame(auc_data)]) assert sorted(muts_list) == sorted(auc_df.index), ( "Tested mutations missing from merged classifier accuracies!") with bz2.BZ2File(os.path.join(args.use_dir, "out-aucs.p.gz"), 'w') as fl: pickle.dump(auc_df, fl, protocol=-1) # concatenate subgrouping model sub-sampled testing performances conf_list = pd.Series(dtype='object') for conf_file in Path(args.use_dir, 'merge').glob("out-conf_*.p.gz"): with bz2.BZ2File(conf_file, 'r') as fl: conf_data = pickle.load(fl) conf_list = conf_list.append(conf_data) assert sorted(muts_list) == sorted(conf_list.index), ( 
"Tested mutations missing from merged subsampled accuracies!") with bz2.BZ2File(os.path.join(args.use_dir, "out-conf.p.gz"), 'w') as fl: pickle.dump(conf_list, fl, protocol=-1) # concatenate model performances when transferred to other cohorts trnsf_preds = pd.DataFrame() for trnsf_file in Path(args.use_dir, 'merge').glob("trnsf-vals_*.p.gz"): with bz2.BZ2File(trnsf_file, 'r') as fl: trnsf_vals = pickle.load(fl) trnsf_preds = pd.concat([trnsf_preds, trnsf_vals]) assert sorted(muts_list) == sorted(trnsf_preds.index), ( "Tested mutations missing from merged transfer predictions!") with bz2.BZ2File(os.path.join(args.use_dir, "trnsf-preds.p.gz"), 'w') as fl: pickle.dump(trnsf_preds, fl, protocol=-1) trnsf_dict = dict() for trnsf_file in Path(args.use_dir, 'merge').glob("out-trnsf_*.p.gz"): with bz2.BZ2File(trnsf_file, 'r') as fl: trnsf_data = pickle.load(fl) for coh, trnsf_out in trnsf_data.items(): if coh not in trnsf_dict: trnsf_dict[coh] = { 'Samps': None, 'Pheno': dict(), 'AUC': pd.DataFrame() } if trnsf_dict[coh]['Samps'] is None: trnsf_dict[coh]['Samps'] = trnsf_out['Samps'] else: assert trnsf_dict[coh]['Samps'] == trnsf_out['Samps'], ( "Mismatching sample sets in tranfer cohort `{}`!".format( coh)) if coh != 'CCLE': trnsf_dict[coh]['Pheno'].update(trnsf_out['Pheno']) trnsf_dict[coh]['AUC'] = pd.concat( [trnsf_dict[coh]['AUC'], pd.DataFrame(trnsf_out['AUC'])]) with bz2.BZ2File(os.path.join(args.use_dir, "out-trnsf.p.gz"), 'w') as fl: pickle.dump(trnsf_dict, fl, protocol=-1)
def _hyperparameter_optimization(self, num_iterations=30, save_results=True, display_plot=False, batch_size=20, n_random_starts=10, use_TPU=False, transfer_model='Inception', cutoff_regularization=False, min_accuracy=None): """ min_accuracy: minimum value of categorical accuracy we want after 1 iteration num_iterations: number of hyperparameter combinations we try n_random_starts: number of random combinations of hyperparameters first tried """ self.min_accuracy = min_accuracy self.batch_size = batch_size self.use_TPU = use_TPU self.transfer_model = transfer_model self.cutoff_regularization = cutoff_regularization #import scikit-optimize libraries from skopt import gp_minimize from skopt.space import Real, Categorical, Integer from skopt.plots import plot_convergence from skopt.utils import use_named_args #declare the hyperparameters search space dim_epochs = Integer(low=1, high=10, name='epochs') dim_hidden_size = Integer(low=6, high=2048, name='hidden_size') dim_learning_rate = Real(low=1e-6, high=1e-2, prior='log-uniform', name='learning_rate') dim_dropout = Real(low=0, high=0.9, name='dropout') dim_fine_tuning = Categorical(categories=[True, False], name='fine_tuning') dim_nb_layers = Integer(low=1, high=3, name='nb_layers') dim_activation = Categorical(categories=['relu', 'tanh'], name='activation') dim_include_class_weight = Categorical(categories=[True, False], name='include_class_weight') dimensions = [ dim_epochs, dim_hidden_size, dim_learning_rate, dim_dropout, dim_fine_tuning, dim_nb_layers, dim_activation, dim_include_class_weight ] #read default parameters from last optimization try: with open( parentdir + '/data/trained_model/hyperparameters_search.pickle', 'rb') as f: sr = dill.load(f) default_parameters = sr.x print('parameters of previous optimization loaded!') except: #fall back default values default_parameters = [5, 1024, 1e-4, 0, True, 1, 'relu', True] self.number_iterations = 0 #declare the fitness function @use_named_args(dimensions=dimensions) def fitness(epochs, hidden_size, learning_rate, dropout, fine_tuning, nb_layers, activation, include_class_weight): self.number_iterations += 1 #print the hyper-parameters print('epochs:', epochs) print('hidden_size:', hidden_size) print('learning rate:', learning_rate) print('dropout:', dropout) print('fine_tuning:', fine_tuning) print('nb_layers:', nb_layers) print('activation:', activation) print('include_class_weight', include_class_weight) print() #fit the model self.fit(epochs=epochs, hidden_size=hidden_size, learning_rate=learning_rate, dropout=dropout, fine_tuning=fine_tuning, nb_layers=nb_layers, activation=activation, include_class_weight=include_class_weight, batch_size=self.batch_size, use_TPU=self.use_TPU, transfer_model=self.transfer_model, min_accuracy=self.min_accuracy, cutoff_regularization=self.cutoff_regularization) #extract fitness fitness = self.fitness print('CALCULATED FITNESS AT ITERATION', self.number_iterations, 'OF:', fitness) print() del self.model K.clear_session() return -1 * fitness # optimization self.search_result = gp_minimize( func=fitness, dimensions=dimensions, acq_func='EI', # Expected Improvement. 
        n_calls=num_iterations,
        n_random_starts=n_random_starts,
        x0=default_parameters)

    if save_results:
        if not os.path.exists(parentdir + '/data/trained_models'):
            os.makedirs(parentdir + '/data/trained_models')

        with open(
                parentdir
                + '/data/trained_models/hyperparameters_dimensions.pickle',
                'wb') as f:
            dill.dump(dimensions, f)

        with open(
                parentdir
                + '/data/trained_models/hyperparameters_search.pickle',
                'wb') as f:
            dill.dump(self.search_result.x, f)

        print("Hyperparameter search saved!")

    if display_plot:
        plot_convergence(self.search_result)

    #build results dictionary
    results_dict = {
        dimensions[i].name: self.search_result.x[i]
        for i in range(len(dimensions))
    }

    print('Optimal hyperparameters found:')
    print(results_dict)
    print()
    print('Optimal fitness value:', -float(self.search_result.fun))
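# Illustrative reload sketch (an assumption, not part of the original class):
# the block above saves self.search_result.x, i.e. a plain list of the best
# hyperparameter values, so a later run could seed gp_minimize's x0 with it
# roughly like this. The path mirrors the save path used above; 'parentdir'
# is the same module-level variable the class relies on.
import os
import dill

search_path = (parentdir
               + '/data/trained_models/hyperparameters_search.pickle')
if os.path.exists(search_path):
    with open(search_path, 'rb') as f:
        default_parameters = dill.load(f)
else:
    # same fallback defaults as in the optimization routine
    default_parameters = [5, 1024, 1e-4, 0, True, 1, 'relu', True]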
def saveProgress(self): saveFile = open("./" + self.player.name.lower() + ".txt", 'wb') print "Saving . . . " world = { "name": self.name, "description": self.description, "areas": [], "player": { "name": self.player.name, "description": self.player.description, "currentArea": self.player.currentArea.name, "health": self.player.health, "score": self.player.score, "inventory": [] } } #save items in players inventory for item in self.player.inventory: currItem = { "name": item.name, "description": item.description, "area": None, "moveable": item.moveable, "onSuccess": item.onSuccess, "onFailure": item.onFailure, "detailedDescription": item.detailedDescription, "onSuccessScripts": [], "onFailureScripts": [], "onUse": item.onUse, "onUseScripts": [] } #saving item scripts for items in players inventory for script in item.onSuccessScripts: serScript = pickle.dumps(script) currScript = {"script": serScript} currItem["onSuccessScripts"].append(currScript) for script in item.onFailureScripts: serScript = pickle.dumps(script) currScript = {"script": serScript} currItem["onFailureScripts"].append(currScript) for script in item.onUseScripts: serScript = pickle.dumps(script) currScript = {"script": serScript} currItem["onUseScripts"].append(currScript) world["player"]["inventory"].append(currItem) #save all the areas their transitions, and their items. for area in self.areas: currArea = { "name": area.name, "description": area.description, "transitions": [], "items": [] } for transition in area.transitions: currTransition = { "name": transition.name, "direction": transition.direction, "isPassable": transition.isPassable, "onSuccess": transition.onSuccess, "onFailure": transition.onFailure, "destination": transition.destination.name, "area": transition.area.name, "description": transition.description, "detailedDescription": transition.detailedDescription, "openedDescription": transition.openedDescription, "onSuccessScripts": [], "onFailureScripts": [], "onOpenScripts": [], "requirements": [] } #serializing scripts with pickle. 
            for script in transition.onSuccessScripts:
                serScript = pickle.dumps(script)
                currScript = {"script": serScript}
                currTransition["onSuccessScripts"].append(currScript)

            for script in transition.onFailureScripts:
                serScript = pickle.dumps(script)
                currScript = {"script": serScript}
                currTransition["onFailureScripts"].append(currScript)

            for script in transition.onOpenScripts:
                serScript = pickle.dumps(script)
                currScript = {"script": serScript}
                currTransition["onOpenScripts"].append(currScript)

            #saving transition requirements
            for requirement in transition.requirements:
                currRequirement = {"requirement": requirement}
                currTransition["requirements"].append(currRequirement)

            currArea["transitions"].append(currTransition)

        for item in area.items:
            currItem = {
                "name": item.name,
                "description": item.description,
                "detailedDescription": item.detailedDescription,
                "area": area.name,
                "moveable": item.moveable,
                "onSuccess": item.onSuccess,
                "onFailure": item.onFailure,
                "onSuccessScripts": [],
                "onFailureScripts": [],
                "onUse": item.onUse,
                "onUseScripts": []
            }

            for script in item.onSuccessScripts:
                serScript = pickle.dumps(script)
                currScript = {"script": serScript}
                currItem["onSuccessScripts"].append(currScript)

            for script in item.onFailureScripts:
                serScript = pickle.dumps(script)
                currScript = {"script": serScript}
                currItem["onFailureScripts"].append(currScript)

            for script in item.onUseScripts:
                serScript = pickle.dumps(script)
                currScript = {"script": serScript}
                currItem["onUseScripts"].append(currScript)

            currArea["items"].append(currItem)

        world["areas"].append(currArea)

    #write the assembled world state to the save file
    pickle.dump(world, saveFile)
    #world = json.dumps(world,indent=4, separators=(',', ': '))
    saveFile.close()
    print "Progress saved in " + self.player.name + ".txt"
def lowercase(text): return text.lower() def expand_contractions(text): text = text.split() return ' '.join(list(map(lambda word: contractions[word] if word in contractions_keys else word, text))) def remove_symbols_punctuation(text): text = re.sub(delete_re_symbols.pattern, '', text) text = re.sub(replace_re_by_space.pattern, ' ', text) return text def remove_stop_words(text): text = text.split() filtered_sentence = [w for w in text if not w in stop_words] return filtered_sentence def text_lemmatization(text): wordnet_lemmatizer = WordNetLemmatizer() text = list(map(lambda word: wordnet_lemmatizer.lemmatize(word), text)) return text text = expand_contractions(text) text = lowercase(text) text = remove_symbols_punctuation(text) text = remove_stop_words(text) text = text_lemmatization(text) return ' '.join(text) dill.dump(process_text, open('serialized/process_text.sav', 'wb'))
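# Illustrative reload sketch (an assumption, not part of the original module):
# dill can serialize the process_text function object itself, so the saved
# file can be opened elsewhere and called directly. Note that the helpers it
# uses still expect their module-level dependencies (stop_words, contractions,
# the regex patterns) to be importable or re-created in the loading
# environment. The sample sentence is made up for demonstration.
import dill

with open('serialized/process_text.sav', 'rb') as f:
    process_text_loaded = dill.load(f)

print(process_text_loaded("I can't believe it's already working!"))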
gt_entry, pred_entry, ) ######################################################## fp = predict(gt_entry['gt_boxes'], gt_entry['gt_classes']) fp_pred = fp[all_rels[:, 0], all_rels[:, 1]] pred_cls_scores = fp_pred.max(1) pred_cls_inds = np.argsort(-pred_cls_scores) pred_cls_inds = pred_cls_inds[pred_cls_scores[pred_cls_inds] > 0][:100] pred_entry['pred_rel_inds'] = all_rels[pred_cls_inds] pred_entry['rel_scores'] = fp_pred[pred_cls_inds] pred_entry['pred_classes'] = gt_entry['gt_classes'] pred_entry['obj_scores'] = np.ones(pred_entry['pred_classes'].shape[0]) all_pred_entries['predcls'].append(pred_entry) evaluator['predcls'].evaluate_scene_graph_entry( gt_entry, pred_entry, ) img_offset += img_ids.max() + 1 evaluator['predcls'].print_stats() evaluator['sgcls'].print_stats() for mode, entries in all_pred_entries.items(): with open('caches/freqbaseline-{}-{}.pkl'.format('overlap' if MUST_OVERLAP else 'nonoverlap', mode), 'wb') as f: pkl.dump(entries, f)
# Load the diabetes dataset diabetes = datasets.load_diabetes() # ONLY USING 1 FEATURE FOR THIS EXAMPLE! # Use only one feature diabetes_X = diabetes.data[:, np.newaxis, 2] # Split the data into training/testing sets diabetes_X_train = diabetes_X[:-20] diabetes_X_test = diabetes_X[-20:] # Split the targets into training/testing sets diabetes_y_train = diabetes.target[:-20] diabetes_y_test = diabetes.target[-20:] # Create linear regression model model = linear_model.LinearRegression() # Train the model using the training sets model.fit(diabetes_X_train, diabetes_y_train) import dill as pickle pio_bundle = PioBundle(model) pio_bundle_pkl_path = 'pio_bundle.pkl' with open(pio_bundle_pkl_path, 'wb') as fh: pickle.dump(pio_bundle, fh)
def save(self, file_name):
    #dill writes a binary stream, so the file must be opened in 'wb' mode
    with open(file_name, 'wb') as f:
        dill.dump(self, f)
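# Illustrative counterpart to save() above (an assumption, not original code):
# a loader only needs to open the same file in binary mode and let dill
# rebuild the object.
import dill

def load(file_name):
    with open(file_name, 'rb') as f:
        return dill.load(f)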
def write_state(self, file_no): self.file_no = file_no files = self.get_file_names(self.file_no) with open((files['PATH_TO_DOC_WRITTEN']), 'wb') as f: dill.dump(self.doc_written, f) # PATH_TO_DOC_COUNT = ROOT /'doc_count_{}'.format(file_no) dill.dump(self.doc_count, open(files['PATH_TO_DOC_COUNT'], 'wb')) # PATH_TO_INLINKS = ROOT /'INLINKS_{}'.format(file_no) dill.dump(self.inlinks, open(files['PATH_TO_INLINKS'], 'wb')) # PATH_TO_OUTLINKS = ROOT /'OUTLINKS_{}'.format(file_no) dill.dump(self.oulinks, open(files['PATH_TO_OUTLINKS'], 'wb')) # PATH_TO_TRAVERSED = ROOT /'traversed_{}'.format(file_no) dill.dump(self.traversed, open(files['PATH_TO_TRAVERSED'], 'wb')) # PATH_TO_VISITED = ROOT / 'visited_{}'.format(file_no) dill.dump(self.visited, open(files['PATH_TO_VISITED'], 'wb')) # PATH_TO_ROBOT_DIC = ROOT / 'robot_dic_{}'.format(file_no) dill.dump(self.robot_dict, open(files['PATH_TO_ROBOT_DIC'], 'wb')) # PATH_TO_FRONTIER = ROOT / 'frontier_{}'.format(file_no) dill.dump(self.frontier, open(files['PATH_TO_FRONTIER'], 'wb')) # PATH_TO_AUX_FRONTIER = ROOT / 'aux_frontier_{}'.format(file_no) dill.dump(self.aux_ftier, open(files['PATH_TO_AUX_FRONTIER'], 'wb')) # PATH_TO_FILE_NUM = ROOT / 'file_num' dill.dump(file_no, open(PATH_TO_FILE_NUM, 'wb'))
pbar = tqdm(total=len(tok_sentence)) for list_tok in tok_sentence: tmp = [] list_tok = sub_space(list_tok) list_tok = sub_lol(list_tok) list_tok = pad_sentence(list_tok) list_tok = list(map(lambda word: clean.stripping(word), list_tok)) for tok in list_tok: if tok in itos: tmp.append(stoi[tok]) else: if tok not in unknown_words: unknown_words[tok] = 1 else: unknown_words[tok] += 1 tmp.append(stoi[unk_token]) new_int_sentence.append(tmp) pbar.update(1) pbar.close() return new_int_sentence pos_int = np.array(sen2int(pos_tok)) neg_int = np.array(sen2int(neg_tok)) # print(pos_int.shape, neg_int.shape) np.save(f'../dataset/{pos_name}_int.npy', pos_int) np.save(f'../dataset/{neg_name}_int.npy', neg_int) # print(unknown_words, len(unknown_words)) pickle.dump(unknown_words, open(f'../dataset/unknown.pkl', 'wb'))
        if line[3] not in datas.keys():
            datas[line[3]] = []
        datas[line[3]].append(line[0])
        line = f.readline().split(",")

for d in datas:
    print("TYPE:" + str(d))
    temp = ""
    for i, n in enumerate(datas[d]):
        temp = temp + n + ","
        if i % 10 == 0 and i != 0:
            print(temp + "\n")
            temp = ""
    print(temp + "\n")

# names are now grouped by type, so save the mapping for later use
with open("type_names_dict.dill", "wb") as f:
    dill.dump(datas, f)

with open("type_names_dict.dill", "rb") as f:
    datas = dill.load(f)

# build a model for each type
phist = PHist()
phist.fit(datas)

# check how accurate the predictions are
with open("result/result" + str(int(datetime.now().timestamp())) + ".txt",
          "w", encoding="utf-8") as res:
    with open("pokemon.csv", "r", encoding="utf-8") as f:
        line = f.readline().split(",")
        count = 0
        success_fir = 0
        success_sec = 0
        success_rev = 0
for run in range(config.n_runs): ## use 33% for training and 67 % for validation ## so we switch trainInd and validInd for fold, (validInd, trainInd) in enumerate(skf[run]): print("Run: %d, Fold: %d" % (run + 1, fold + 1)) path = "%s/Run%d/Fold%d" % (config.feat_folder, run + 1, fold + 1) ######################### ## get word count feat ## ######################### for feat_name in feat_names: X_train = dfTrain[feat_name].values[trainInd] X_valid = dfTrain[feat_name].values[validInd] with open("%s/train.%s.feat.pkl" % (path, feat_name), "wb") as f: dill.dump(X_train, f, -1) with open("%s/valid.%s.feat.pkl" % (path, feat_name), "wb") as f: dill.dump(X_valid, f, -1) print("Done.") # print("For training and testing...") # path = "%s/All" % config.feat_folder # ## use full version for X_train # extract_feat(dfTest) # for feat_name in feat_names: # X_train = dfTrain[feat_name].values # X_test = dfTest[feat_name].values # with open("%s/train.%s.feat.pkl" % (path, feat_name), "wb") as f: # dill.dump(X_train, f, -1) # with open("%s/test.%s.feat.pkl" % (path, feat_name), "wb") as f:
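# Illustrative read-back sketch (an assumption, not part of the original
# script): each Run/Fold directory written above holds one pickle per
# feature, so a downstream training step could reload a single feature like
# this. The run/fold indices are placeholders; config and feat_names are the
# same objects used above.
import dill

run, fold = 1, 1
path = "%s/Run%d/Fold%d" % (config.feat_folder, run, fold)
feat_name = feat_names[0]
with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
    X_train = dill.load(f)
with open("%s/valid.%s.feat.pkl" % (path, feat_name), "rb") as f:
    X_valid = dill.load(f)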
def main(): import sys import os from datetime import datetime from joblib import Parallel, delayed import tempfile import dill from pynets.stats.utils import make_subject_dict, cleanNullTerms, \ get_ensembles_top, get_ensembles_embedding, \ build_grid from colorama import Fore, Style try: import pynets except ImportError: print( "PyNets not installed! Ensure that you are referencing the correct" " site-packages and using Python3.6+") if len(sys.argv) < 1: print("\nMissing command-line inputs! See help options with the -h" " flag.\n") sys.exit(1) # Parse inputs base_dir = '/scratch/04171/dpisner/HNU/HNU_outs/triple' #base_dir = '/scratch/04171/dpisner/HNU/HNU_outs/outputs_language' thr_type = "MST" icc = True disc = False int_consist = False modality = 'dwi' embedding_types = ['OMNI'] #rsns = ['language'] rsns = ['kmeans', 'triple'] template = 'CN200' # template = 'MNI152_T1' mets = [ "global_efficiency", "average_shortest_path_length", "degree_assortativity_coefficient", "average_betweenness_centrality", "average_eigenvector_centrality", "smallworldness", "modularity" ] metaparams_func = ["rsn", "res", "model", 'hpass', 'extract', 'smooth'] metaparams_dwi = ["rsn", "res", "model", 'directget', 'minlength', 'tol'] sessions = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'] #### print(f"{Fore.LIGHTBLUE_EX}\nBenchmarking API\n") print(Style.RESET_ALL) print(f"{Fore.LIGHTGREEN_EX}Gathering sampled data...") print(Style.RESET_ALL) for embedding_type in embedding_types: subject_dict_file_path = (f"{base_dir}/pynets_subject_dict_{modality}_" f"{embedding_type}_{template}.pkl") subject_mod_grids_file_path = ( f"{base_dir}/pynets_modality_grids_{modality}_" f"{embedding_type}_{template}.pkl") missingness_summary = ( f"{base_dir}/pynets_missingness_summary_{modality}_" f"{embedding_type}_{template}.csv") icc_tmps_dir = f"{base_dir}/icc_tmps/{modality}_" \ f"{embedding_type}" os.makedirs(icc_tmps_dir, exist_ok=True) if not os.path.isfile(subject_dict_file_path): subject_dict, modality_grids, missingness_frames = \ make_subject_dict( [modality], base_dir, thr_type, mets, [embedding_type], template, sessions, rsns ) sub_dict_clean = cleanNullTerms(subject_dict) missingness_frames = [ i for i in missingness_frames if isinstance(i, pd.DataFrame) ] if len(missingness_frames) != 0: if len(missingness_frames) > 0: if len(missingness_frames) > 1: final_missingness_summary = pd.concat( missingness_frames) final_missingness_summary.to_csv(missingness_summary, index=False) final_missingness_summary.id = \ final_missingness_summary.id.astype( 'str').str.split('_', expand=True)[0] elif len(missingness_frames) == 1: final_missingness_summary = missingness_frames[0] final_missingness_summary.to_csv(missingness_summary, index=False) final_missingness_summary.id = \ final_missingness_summary.id.astype( 'str').str.split('_', expand=True)[0] else: final_missingness_summary = pd.Series() else: final_missingness_summary = pd.Series() else: final_missingness_summary = pd.Series() with open(subject_dict_file_path, "wb") as f: dill.dump(sub_dict_clean, f) f.close() with open(subject_mod_grids_file_path, "wb") as f: dill.dump(modality_grids, f) f.close() else: with open(subject_dict_file_path, 'rb') as f: sub_dict_clean = dill.load(f) f.close() with open(subject_mod_grids_file_path, "rb") as f: modality_grids = dill.load(f) f.close() if os.path.isfile(missingness_summary): final_missingness_summary = pd.read_csv(missingness_summary) final_missingness_summary.id = \ final_missingness_summary.id.astype('str').str.split( 
'_', expand=True)[0] else: final_missingness_summary = pd.Series() ids = sub_dict_clean.keys() # print(f"MODALITY: {modality}") metaparams = eval(f"metaparams_{modality}") metaparam_dict = {} # print(f"EMBEDDING TYPE: {embedding_type}") # if os.path.isfile(f"{base_dir}/grid_clean_{modality}_{alg}.csv"): # continue if embedding_type == 'topology': ensembles, df_top = get_ensembles_top(modality, thr_type, f"{base_dir}/pynets") else: ensembles = get_ensembles_embedding(modality, embedding_type, base_dir) grid = build_grid(modality, metaparam_dict, sorted(list(set(metaparams))), ensembles)[1] grid = [ i for i in grid if '200' not in i and '400' not in i and '600' not in i and '800' not in i and 'triple' not in i ] good_grids = [] for grid_param in grid: grid_finds = [] for ID in ids: if ID not in sub_dict_clean.keys(): print(f"ID: {ID} not found...") continue if str(sessions[0]) not in sub_dict_clean[ID].keys(): print(f"Session: {sessions[0]} not found for ID {ID}...") continue if modality not in sub_dict_clean[ID][str(sessions[0])].keys(): print(f"Modality: {modality} not found for ID {ID}, " f"ses-{sessions[0]}...") continue if embedding_type not in \ sub_dict_clean[ID][str(sessions[0])][modality].keys(): print(f"Modality: {modality} not found for ID {ID}, " f"ses-{sessions[0]}, {embedding_type}...") continue if grid_param in \ list(sub_dict_clean[ID][str(sessions[0])][modality][ embedding_type].keys()): grid_finds.append(grid_param) if len(grid_finds) < 0.75 * len(ids): print(f"Less than 75% of {grid_param} found. Removing from " f"grid...") continue else: good_grids.append(grid_param) modality_grids[modality] = good_grids cache_dir = tempfile.mkdtemp() with Parallel(n_jobs=-1, require="sharedmem", backend='threading', verbose=10, max_nbytes='200000M', temp_folder=cache_dir) as parallel: outs = parallel( delayed(benchmark_reproducibility) (base_dir, comb, modality, embedding_type, sub_dict_clean, disc, final_missingness_summary, icc_tmps_dir, icc, mets, ids, template) for comb in grid) # outs = [] # for comb in grid: # outs.append(benchmark_reproducibility(base_dir, comb, modality, # embedding_type, sub_dict_clean, # disc, final_missingness_summary, icc_tmps_dir, icc, # mets, ids)) df_summary = pd.concat( [i for i in outs if i is not None and not i.empty], axis=0) df_summary = df_summary.dropna(axis=0, how='all') print(f"Saving to {base_dir}/grid_clean_{modality}_{embedding_type}_" f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}.csv...") df_summary.to_csv( f"{base_dir}" f"/grid_clean_{modality}_{embedding_type}_" f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}" f".csv", index=False) # int_consist if int_consist is True and embedding_type == 'topology': try: import pingouin as pg except ImportError: print("Cannot evaluate test-retest int_consist. 
pingouin" " must be installed!") df_summary_cronbach = pd.DataFrame( columns=['modality', 'embedding', 'cronbach']) df_summary_cronbach.at[0, "modality"] = modality df_summary_cronbach.at[0, "embedding"] = embedding_type for met in mets: cronbach_ses_list = [] for ses in range(1, 10): id_dict = {} for ID in ids: id_dict[ID] = {} for comb in grid: if modality == 'func': try: extract, hpass, model, res, atlas, \ smooth = comb except BaseException: print(f"Missing {comb}...") extract, hpass, model, res, atlas = comb smooth = '0' comb_tuple = (atlas, extract, hpass, model, res, smooth) else: directget, minlength, model, res, atlas, \ tol = comb comb_tuple = (atlas, directget, minlength, model, res, tol) if comb_tuple in sub_dict_clean[ID][str( ses)][modality][embedding_type].keys(): if isinstance( sub_dict_clean[ID][str(ses)][modality] [embedding_type][comb_tuple], np.ndarray): id_dict[ID][comb] = sub_dict_clean[ID][str( ses)][modality][embedding_type][ comb_tuple][mets.index(met)][0] else: continue else: continue df_wide = pd.DataFrame(id_dict) if df_wide.empty is True: continue else: df_wide = df_wide.add_prefix(f"{met}_comb_") df_wide.replace(0, np.nan, inplace=True) print(df_wide) try: c_alpha = pg.cronbach_alpha(data=df_wide.dropna( axis=1, how='all'), nan_policy='listwise') cronbach_ses_list.append(c_alpha[0]) except BaseException: print('FAILED...') print(df_wide) del df_wide del df_wide df_summary_cronbach.at[0, f"average_cronbach_{met}"] = \ np.nanmean(cronbach_ses_list) print(f"Saving to {base_dir}/grid_clean_{modality}_" f"{embedding_type}_cronbach_" f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}.csv...") df_summary_cronbach.to_csv( f"{base_dir}/grid_clean_{modality}_" f"{embedding_type}_cronbach" f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}" f".csv", index=False) return
help='for multiple scales, eg. [1.0, (1.1, 0.05)]') args = parser.parse_args() scales = ast.literal_eval(args.scales) w, h = model_wh(args.resolution) e = TfPoseEstimator(get_graph_path(args.model), target_size=(w, h)) files_grabbed = glob.glob(os.path.join(args.folder, '*.jpg')) all_humans = dict() for i, file in enumerate(files_grabbed): # estimate human poses from a single image ! image = common.read_imgfile(file, None, None) t = time.time() #humans = e.inference(image, upsample_size=scales) humans = e.inference(image) elapsed = time.time() - t logger.info('inference image #%d: %s in %.4f seconds.' % (i, file, elapsed)) image = TfPoseEstimator.draw_humans(image, humans, imgcopy=False) cv2.imshow('tf-pose-estimation result', image) cv2.waitKey(5) all_humans[file.replace(args.folder, '')] = humans with open(os.path.join(args.folder, 'pose.dil'), 'wb') as f: dill.dump(all_humans, f, protocol=dill.HIGHEST_PROTOCOL) print(all_humans)
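# Illustrative reload sketch (an assumption, not part of the original script):
# the per-image pose estimates dumped above can be reloaded later without
# rerunning the TensorFlow model. The folder path is a placeholder.
import os
import dill

with open(os.path.join('./images', 'pose.dil'), 'rb') as f:
    saved_humans = dill.load(f)

for image_name, humans in saved_humans.items():
    print(image_name, len(humans), 'humans detected')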
df = pd.DataFrame(df) y = df['stars'].tolist() af = pipeline.FeatureUnion([ ('rest', EstTransformer()), ('rshe', SheTransformer()), ('rcat', CatTransformer()), ('ratt', AttTransformer()), ]) all_pipe = pipeline.Pipeline([ ('features', af), ('lasso', linear_model.LinearRegression(fit_intercept=True)) ]) print('fitting') all_pipe.fit(df, y) print('fitting complete') with open('../pickle/all.dill', "wb") as f: dill.dump(all_pipe, f) test = df.sample() test_dict = [test.attributes] pdb.set_trace() #print (att_pipe.predict(test_dict)) #print (test['stars'])
from app.preprocessing.preprocess import Preprocess import dill preprocessed = Preprocess() dill.dump(preprocessed, open('preprocessed.p', 'wb'))
batch_end = (i + 2) * batch_size batch_end = np.min([batch_end, X.shape[0]]) X_batch = X[batch_st:batch_end, :] y_batch = y[batch_st:batch_end, :] for model in range(len(models)): lst_actions[model] = simulate_rounds_stoch(models[model], lst_rewards[model], lst_actions[model], X_batch, y_batch, rnd_seed=batch_st) for model in range(len(models)): dill.dump(models[model], open("model_%d_loc7.dill" % (model), "wb")) #plotting import matplotlib.pyplot as plt from pylab import rcParams def get_mean_reward(reward_lst, batch_size=batch_size): mean_rew = list() for r in range(len(reward_lst)): mean_rew.append(sum(reward_lst[:r + 1]) * 1.0 / ((r + 1) * batch_size)) return mean_rew import scipy.stats as st
def _read(self, file_path: str): if not file_path.endswith('.json'): raise ConfigurationError( f"Don't know how to read filetype of {file_path}") cache_dir = os.path.join('cache', file_path.split("/")[-1]) if self._load_cache: logger.info(f'Trying to load cache from {cache_dir}') if self._save_cache: os.makedirs(cache_dir, exist_ok=True) cnt = 0 with open(file_path, "r") as data_file: json_obj = json.load(data_file) for total_cnt, ex in enumerate(json_obj): cache_filename = f'instance-{total_cnt}.pt' cache_filepath = os.path.join(cache_dir, cache_filename) if self._loading_limit == cnt: break if self._load_cache: try: ins = dill.load(open(cache_filepath, 'rb')) if ins is None and not self._keep_if_unparsable: # skip unparsed examples continue yield ins cnt += 1 continue except Exception as e: # could not load from cache - keep loading without cache pass query_tokens = None if 'query_toks' in ex: # we only have 'query_toks' in example for training/dev sets # fix for examples: we want to use the 'query_toks_no_value' field of the example which anonymizes # values. However, it also anonymizes numbers (e.g. LIMIT 3 -> LIMIT 'value', which is not good # since the official evaluator does expect a number and not a value ex = fix_number_value(ex) # we want the query tokens to be non-ambiguous (i.e. know for each column the table it belongs to, # and for each table alias its explicit name) # we thus remove all aliases and make changes such as: # 'name' -> 'singer@name', # 'singer AS T1' -> 'singer', # 'T1.name' -> 'singer@name' try: query_tokens = disambiguate_items( ex['db_id'], ex['query_toks_no_value'], self._tables_file, allow_aliases=False) except Exception as e: # there are two examples in the train set that are wrongly formatted, skip them print(f"error with {ex['query']}") print(e) ins = self.text_to_instance(utterance=ex['question'], db_id=ex['db_id'], sql=query_tokens) if ins is not None: cnt += 1 if self._save_cache: dill.dump(ins, open(cache_filepath, 'wb')) if ins is not None: yield ins
return l crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=500, all_possible_transitions=True, model_filename=file_name + "-pos-new.model2") crf.fit(X, y) labels = list(crf.classes_) labels.remove('O') y_pred = crf.predict(X_test) e = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels) print(e) sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0])) print( metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3)) # cross_validate f1_scorer = make_scorer(metrics.flat_f1_score, average='macro') scores = cross_validate(crf, X, y, scoring=f1_scorer, cv=5) # save data import dill with open("datatrain.data", "wb") as dill_file: dill.dump(datatofile, dill_file)