def save(self, initial=False, string=None):
    """ Save current model to disk. """
    out_path = os.path.join(self.config['output_dir'],
                            self.config['model_name'])
    logger.info('Saving model to {}'.format(out_path))
    try:
        os.mkdir(out_path)
    except OSError:
        # Directory already exists
        pass

    if initial:
        dump_json(self.config,
                  '{}/{}.json'.format(out_path, self.config['model_name']))
        self.model.save('{}/{}.npz'.format(out_path, self.config['model_name']))
    else:
        if string is None:
            # No suffix string was supplied (a different task in a multitask
            # setup may pass one to control the save suffix), so fall back to
            # the default "<name>.<best_retrieval>.<epoch>" naming scheme.
            model_name = '{}.{:02f}.{}'.format(self.config['model_name'],
                                               self.best_retrieval,
                                               self.epoch + 1)
            self.model.save('{}/{}.npz'.format(out_path, model_name))
            self.prune_saved_models('{}/{}.npz'.format(out_path, model_name))
def correct_confirmed_errors_in_filelist(confirmed_errors_json):
    errors = load_json(confirmed_errors_json)
    corrected = {}
    for wavname in errors:
        corrected[wavname] = errors[wavname]["ground_truth"]
    dump_json(corrected, confirmed_errors_json)
def train_model():
    global epoch_information
    model.train()
    max_epoch = 120
    N = [idx for idx in range(100, 100 + max_epoch)]

    for batch in train_loader:
        optimizer.zero_grad()
        if epoch <= 50 or random.random() >= params.mix_active:
            loss, _ = model(batch)
        else:
            loss, _ = model.forward_active(batch)
        loss.backward()
        optimizer.step()

    model.eval()
    scores = []
    for idx in N:
        scores.append(test_model(id_=idx)[2])
    final_score = sum(scores) / (len(N) + 1e-8)

    file_name = ('model/model_task_4_' + params.file_name + '_e_' +
                 str(epoch) + '.pt')
    heapq.heappush(epoch_information, (final_score, file_name))
    # Keep only the five best-scoring checkpoints on disk.
    remove_filename = heapq.heappop(
        epoch_information)[1] if len(epoch_information) > 5 else None
    if file_name != remove_filename:
        torch.save({'model_state_dict': model.state_dict()}, file_name)
    if remove_filename and remove_filename != file_name:
        # Only delete checkpoints that were actually written to disk.
        os.remove(remove_filename)
    dump_json('model/' + params.file_name + '.json', epoch_information)

    print('Test_Epoch: {} Scores are: {}'.format(epoch, scores))
    model.train()
def run(args):
    parse_str = lambda s: tuple(map(int, s.split(",")))

    nnet = Nnet(**nnet_conf)
    trainer = GE2ETrainer(
        nnet,
        gpuid=parse_str(args.gpu),
        checkpoint=args.checkpoint,
        resume=args.resume,
        **trainer_conf)

    loader_conf = {
        "M": args.M,
        "N": args.N,
        "chunk_size": parse_str(args.chunk_size)
    }
    for conf, fname in zip([nnet_conf, trainer_conf, loader_conf],
                           ["mdl.json", "trainer.json", "loader.json"]):
        dump_json(conf, args.checkpoint, fname)

    train_loader = SpeakerLoader(
        train_dir, **loader_conf, num_steps=args.train_steps)
    dev_loader = SpeakerLoader(
        dev_dir, **loader_conf, num_steps=args.dev_steps)

    trainer.run(train_loader, dev_loader, num_epochs=args.epochs)
async def purge_rr(self):
    if self.purge_rr.current_loop == 0:
        return

    print("Purging rr logs...")
    for log_file in glob.glob(utils.REACTION_ROLE_LOG_DIR + "*.json"):
        # load log
        log = utils.load_json_with_default(log_file, default=False)

        # mark entries for purging
        ch_dels, m_dels = [], []
        for ch_id in log:
            # check channel (ids are stored as strings, see edit_rr_log)
            channel = self.bot.get_channel(int(ch_id))
            if not channel:
                ch_dels.append(ch_id)
                continue

            # check messages in this channel
            for m_id in log[ch_id]:
                try:
                    await channel.fetch_message(int(m_id))
                except discord.NotFound:
                    m_dels.append((ch_id, m_id))

        # do purging
        for tup in m_dels:
            del log[tup[0]][tup[1]]
        for ch in ch_dels:
            del log[ch]

        # save log
        utils.dump_json(log, log_file)
def main():
    args = sys.argv

    # Load data info
    data = load_json(args[1])
    goldfile = data['gold']['file']

    # Iterate over data files
    for key, value in data.items():
        # Check alignment
        if key == "spacy" or key == "stanford":
            if not alligned(value['file'], goldfile):
                continue

        # Preprocess file
        outputfilename = value['file'].replace(
            value['extension'], '-preprocessed' + value['extension'])
        preprocess_file(value['file'], value['annotation_column'],
                        value['header'], args[2], outputfilename)

        # Update data-information of preprocessed file
        value['file'] = outputfilename
        value['annotation_column'] = value['header'][
            value['annotation_column']]

    # Write updated data info to outfile
    dump_json(args[1].replace('.json', '-preprocessed.json'), data)
def train_test_system(features_names_list, data_info_file, system,
                      output_filename):
    """
    Train a classifier for the given system and data information, run it on
    the gold data, and write the predictions to an output file.

    :param features_names_list: list of indications of all feature columns
        that should be used
    :param data_info_file: path to file containing info about all necessary
        data
    :param system: name of the ML algorithm that is passed to the classifier
    :param output_filename: path to conll outputfile

    :type features_names_list: list
    :type data_info_file: string
    :type system: string
    :type output_filename: string
    """
    data = load_json(data_info_file)

    # Train model
    inputfile = data['training']['file']
    annotation_column = data['training']['annotation_column']
    model = TextClassifier(system)
    model.train(inputfile, features_names_list, annotation_column)

    # Classify
    gold_file = data['gold']['file']
    predictions = model.predict(gold_file)

    # Write output
    append_column_and_write_file(output_filename, gold_file, predictions,
                                 'predictions')

    # Update data info
    name = os.path.basename(output_filename[:-6])
    data[name] = {'annotation_column': 'predictions', 'file': output_filename}
    dump_json(data_info_file, data)
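# Hypothetical usage sketch for train_test_system() (not part of the source):
# the feature names, system name, and file paths below are illustrative
# assumptions, not values taken from the original project.
def example_train_test_system_usage():
    train_test_system(
        features_names_list=['token', 'pos_tag', 'prev_token'],  # assumed columns
        data_info_file='data_info.json',                         # assumed path
        system='logreg',                                         # assumed system name
        output_filename='output/predictions.conll')              # assumed path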
def convert_names_filelist_json_to_json(filelist_json, out_json,
                                        wavnames_correspondence_json=None):
    initial_filelist = load_json(filelist_json)
    converted_filelist = convert_names_filelist_dict_to_dict(
        initial_filelist, wavnames_correspondence_json)
    dump_json(converted_filelist, out_json)
def get_cat2():
    # d = parse_cat_1(URL + BJCP_URL)
    # dump_json(d, 'cat.json')
    cats = read_json(data_dir + 'cat.json')
    for title, url in cats.items():
        d = parse_cat_2(URL + url)
        dump_json(d, data_dir + title + '.json')
def merge_paper_info(mission):
    if mission == 'test':
        paper_keywords = load_json('data/test/paper_keywords.json')
        paper_title = load_json('data/test/paper_title.json')
        paper_org = load_json('data/test/paper_org.json')
        papers = [i for i in paper_title]
        papers_info_merge = {}
        for i in papers:
            sentence = ''
            if paper_org[i] != []:
                for j in paper_org[i]:
                    sentence += j
                    sentence += ''
            if paper_keywords[i] != []:
                for j in paper_keywords[i]:
                    sentence += j
                    sentence += ' '
            sentence += paper_title[i]
            papers_info_merge[i] = sentence
        dump_json('data/test/paper_info_merge.json', papers_info_merge)
        print('paper info merge done')
def main():
    """ main function """
    in_dir = "in"
    utils.is_exists_dir(in_dir, True)

    out_dir = "out"
    utils.is_exists_dir(out_dir, True)

    log_dir = 'log'
    utils.is_exists_dir(log_dir, True)

    prefix_out_file = "zudello_intrfc_"
    # extension_for_in_files = 'json'

    log_filename = os.path.join(log_dir, 'error.log')
    logging.basicConfig(filename=log_filename,
                        filemode='a',
                        format='%(asctime)s - %(levelname)s: %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')
    logger = logging.getLogger()

    in_files = os.listdir(in_dir)
    for in_file in in_files:
        print("File: {}".format(in_file))
        json_content = utils.get_json_from_file(os.path.join(in_dir, in_file),
                                                logger)
        if json_content:
            out_interface = proc_json(json_content)
            # for k, v in generator_node(json_content):
            #     print("{} - {}".format(k, v))
            # out_interface = convert_to_interface(json_content)
            out_file_name = os.path.join(out_dir, prefix_out_file + in_file)
            utils.dump_json(out_file_name, out_interface)
async def merge_items():
    # get data
    if not os.path.exists(utils.SUPER_ITEM_FILE):
        _, s_data = await SuperScraper.parse()
    else:
        s_data = utils.load_json_with_default(utils.SUPER_ITEM_FILE,
                                              default=False)

    if not os.path.exists(utils.KEDAMA_ITEM_FILE):
        _, k_data = KedamaScraper.parse()
    else:
        k_data = utils.load_json_with_default(utils.KEDAMA_ITEM_FILE,
                                              default=False)

    if not os.path.exists(utils.MARKET_ITEM_FILE):
        _, h_data = await MarketScraper.scrape()
    else:
        h_data = utils.load_json_with_default(utils.MARKET_ITEM_FILE,
                                              default=False)

    # merge data
    merged_data = []
    for x, y in [('super', s_data), ('kedama', k_data)]:
        for z in y:
            z['type'] = x
            merged_data.append(z)
    for x in h_data:
        h_data[x]['type'] = "market"
        merged_data.append(h_data[x])

    # dump
    utils.dump_json(merged_data, utils.ITEM_FILE)
    return merged_data
def add_question_ids(infile, subject_metadata):
    question_data = open_json(infile)
    max_q = 0
    for q_id in question_data:
        subjects = question_data[q_id]['subjects']
        new_subject_map = [subject_metadata[d]['new_id'] for d in subjects]

        # Keep only the most specific subjects: drop any subject that is an
        # ancestor of another subject attached to the same question.
        child_subjects = []
        for d1 in subjects:
            is_ok = True
            for d2 in subjects:
                if d1 == d2:
                    continue
                if d1 in subject_metadata[d2]['parents']:
                    is_ok = False
                    break
            if is_ok:
                child_subjects.append(d1)

        question_data[q_id]['new_sub_map'] = new_subject_map
        child_subject_map = [
            subject_metadata[d]['new_id'] for d in child_subjects
        ]
        question_data[q_id]['child_map'] = child_subject_map
        question_data[q_id]['childs'] = child_subjects

        # For every remaining (leaf) subject, record the new ids of its
        # ancestor chain.
        child_whole_map = []
        for child in child_subjects:
            parent = subject_metadata[child]['parents']
            parent = [d for d in parent if d]
            parent = [subject_metadata[d]['new_id'] for d in parent]
            child_whole_map.append(parent)
        question_data[q_id]['child_whole_map'] = child_whole_map
        max_q = max(len(child_whole_map), max_q)

    print(max_q)
    dump_json(infile, question_data)
async def get_series_data(self, update, session=None):
    # inits
    name = update['series']
    link = update['series_link']

    DATA = utils.load_json_with_default(utils.SERIES_CACHE, {})
    CONFIG = utils.load_bot_config()

    # refresh data if new series or not updated in a while
    if name in DATA:
        last_check = time.time() - DATA[name].get('last_checked', 0)
    if name not in DATA or last_check > CONFIG[
            'series_refresh_rate'] * 24 * 60 * 60:
        # get data
        html = await get_html(link, session)
        soup = BeautifulSoup(html, 'html.parser')

        s_data = self.parse_series_page(soup, update)
        s_data['link'] = link

        # cache
        DATA[name] = s_data
        DATA[name]['last_checked'] = time.time()
        utils.dump_json(DATA, utils.SERIES_CACHE)

    return DATA[name]
def convert_subjects():
    file_name = 'public_data/metadata/subject_metadata.csv'
    output_data = 'public_data/personal_data/subject_metadata.json'
    data = {}
    cnt = 1
    with open(file_name, 'r') as fp:
        lines = fp.readlines()[1:]
        lines = [line.strip('\n') for line in lines]
        for line in lines:
            words = line.split(',')
            subject_id = int(words[0])
            if words[-2] == 'NULL':
                parent_id = 0
            else:
                parent_id = int(words[-2])
            level = int(words[-1])
            name = ','.join(words[1:-2])
            data[subject_id] = {
                'name': name,
                'level': level,
                'parent_id': parent_id,
                'parents': [parent_id],
                'new_id': cnt
            }
            cnt += 1

    # Expand each subject's 'parents' list into its full ancestor chain,
    # walking parent ids until the root (id <= 0) is reached.
    for subject_id in data:
        while True:
            last_parent = data[subject_id]['parents'][-1]
            if last_parent <= 0:
                break
            data[subject_id]['parents'].append(data[last_parent]['parent_id'])

    dump_json(output_data, data)
    return data
def dropbox_auth_finish():
    try:
        oauth_result = get_dropbox_auth_flow().finish(request.args)
        utils.dump_json(
            _PTH_DB_CLIENT_CONFIG, {
                _DB_KEY_ACCESS_TOKEN: oauth_result.access_token,
                "account_id": oauth_result.account_id,
                "user_id": oauth_result.user_id,
            })
        set_config("dropboxConfigured", True)
        init_dropbox()
        return redirect("admin")
    except BadRequestException as e:
        return handle_exception(e, 400)
    except BadStateException as e:
        # Start the auth flow again.
        return redirect("dropbox_auth_start")
    except CsrfException as e:
        return handle_exception(e, 403)
    except NotApprovedException as e:
        # Not approved? Why not?
        return redirect("admin")
    except ProviderException as e:
        # logger.log("Auth error: %s" % (e,))
        print("Auth error: %s" % (e, ))
        return handle_exception(e, 403)
def update_rent_json(rent_json_file):
    rent_dict = load_json_or_create_if_empty(rent_json_file)

    counter = 0
    for address, attrs_map in rent_dict.items():
        # -------------------------
        # PARSE BOA
        # -------------------------
        # error in loading/parsing boa url
        if attrs_map.get('Error'):
            pass
        # attributes exist already for address
        elif attrs_map:
            pass
        # update rent attributes by rendering boa url and parsing,
        # only when mapping is empty and no error
        else:
            print("Updating rent attributes for address: {}".format(address))
            attrs_map.update(get_rent_attributes_from_boa(address))
            counter += 1
            if counter >= TRIGGER_LIMIT:
                break

        # -------------------------
        # PARSE ZILLOW
        # -------------------------
        if 'zillow' not in attrs_map:
            attrs_map.update(parse_zestimate_attributes(address, ZILLOW_ATTRS))

    dump_json(rent_json_file, rent_dict)
    return rent_dict
def find_errors_in_stress(filelist_json, out_json=None, exclude_json=None):
    wavname_to_text = load_json(filelist_json)
    # Only read the exclusion list when a path is actually given.
    exclude = load_json(exclude_json) if exclude_json else {}
    errors = {}
    for wavname in wavname_to_text:
        if "_processed" in wavname and wavname not in exclude:
            modified_text = []
            sentence_ok = True
            for word in wavname_to_text[wavname].split():
                if stress_ok(word):
                    modified_text.append(word_uppercase_to_plus(word))
                else:
                    sentence_ok = False
                    modified_text.append(word)
                    if wavname in errors:
                        errors[wavname]["errors"].append(word)
                    else:
                        errors[wavname] = {"text": None, "errors": [word]}
            if not sentence_ok:
                errors[wavname]["text"] = " ".join(modified_text)
    if out_json is not None:
        dump_json(errors, out_json)
    print(len(errors), "errors were found in stress marking")
async def merge_auctions():
    # get data
    if not os.path.exists(utils.SUPER_EQUIP_FILE):
        _, s_data = await SuperScraper.parse()
    else:
        s_data = utils.load_json_with_default(utils.SUPER_EQUIP_FILE,
                                              default=False)

    if not os.path.exists(utils.KEDAMA_EQUIP_FILE):
        _, k_data = KedamaScraper.parse()
    else:
        k_data = utils.load_json_with_default(utils.KEDAMA_EQUIP_FILE,
                                              default=False)

    # merge data
    merged_data = []
    for x in s_data:
        x['type'] = "super"
        merged_data.append(x)
    for x in k_data:
        x['type'] = "kedama"
        merged_data.append(x)

    # dump
    utils.dump_json(merged_data, utils.AUCTION_FILE)
    return merged_data
def edit_rr_log(message, message_dict=None, roles=None, emotes=None):
    # load log
    log_file = utils.REACTION_ROLE_LOG_DIR + str(message.guild.id) + ".json"
    log = utils.load_json_with_default(log_file, default={})

    # add default values (includes channel id because we can't fetch a
    # message without it)
    ch_id = str(message.channel.id)
    m_id = str(message.id)

    if ch_id not in log:
        log[ch_id] = {}
    if m_id not in log[ch_id]:
        log[ch_id][m_id] = dict(message={}, roles=[], emotes=[])

    # edit entry
    entry = log[str(message.channel.id)][str(message.id)]
    if message_dict is not None:
        entry['message'] = message_dict
    if roles is not None:
        entry['roles'] = [x.id for x in roles]
    if emotes is not None:
        entry['emotes'] = [str(x) for x in emotes]

    # save log
    utils.dump_json(log, log_file)
    return entry
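# Hypothetical illustration (not part of the source): the nested on-disk
# shape that edit_rr_log() writes and purge_rr() walks. The ids, role ids,
# and emote below are made-up example values.
EXAMPLE_RR_LOG = {
    "123456789012345678": {                        # channel id, stored as str
        "987654321098765432": {                    # message id, stored as str
            "message": {"content": "React to get a role"},
            "roles": [112233445566778899],         # role ids
            "emotes": ["\N{WRENCH}"],              # stringified emotes
        }
    }
}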
def execute(project_pairs):
    """
    Run HDP (via seer) on a single (source, target) project pair; this is
    the worker that gets executed in parallel over many pairs.
    """
    source, target, count = project_pairs
    result = seer(source, target)
    dump_json(result, dir='pickles_downsamp', fname=str(count))
def execute(project_pairs):
    """
    Run TCA+ on a single (source, target) project pair; this is the worker
    that gets executed in parallel over many pairs.
    """
    source, target, count = project_pairs
    result = tca_plus(source, target, verbose=False, n_rep=30)
    dump_json(result, dir='json', fname=str(count))
def init_rent_dict_for_missing_address(address_series):
    rent_json_file = os.path.join(database_path, 'rent.json')
    rent_dict = load_json_or_create_if_empty(rent_json_file)
    for address in address_series:
        if address not in rent_dict:
            rent_dict[address] = {}
    dump_json(rent_json_file, rent_dict)
    return rent_dict
def execute(project_pairs):
    """
    Run TCA+ on a single (source, target) project pair; this is the worker
    that gets executed in parallel over many pairs (see the driver sketch
    below).
    """
    source, target, count = project_pairs
    result = tca_plus(source, target, n_rep=30)
    dump_json(result, dir='json', fname=str(count))
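# Hypothetical driver sketch (not part of the source): one way the execute()
# workers above could be run in parallel over (source, target) project pairs,
# as their docstrings describe. The pair construction and the pool size are
# assumptions.
def example_parallel_execute(sources, targets):
    from itertools import product
    from multiprocessing import Pool

    project_pairs = [(src, tgt, count)
                     for count, (src, tgt) in enumerate(product(sources, targets))]
    with Pool(processes=4) as pool:
        pool.map(execute, project_pairs)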
async def scrape(cls):
    # inits
    CACHE = utils.load_json_with_default(utils.SUPER_CACHE_FILE,
                                         default=cls.DEFAULT_CACHE)

    async with get_session() as session:
        # check for new auctions
        home_html = await get_html(cls.HOME_BASE_LINK, session)
        home_soup = BeautifulSoup(home_html, 'html.parser')

        rows = home_soup.find("tbody").find_all("tr")
        auc_names = [
            r.find("a", href=lambda x: x and "itemlist" in x)['href']
            for r in rows
        ]
        auc_nums = [r.find("td").get_text().zfill(3) for r in rows]
        auc_dates = [r.find_all("td")[1].get_text() for r in rows]
        auc_dates = [cls._to_epoch(x) for x in auc_dates]
        assert len(auc_names) == len(auc_nums) == len(auc_dates)

        # get uncached pages
        new_aucs = []
        for i in range(len(rows)):
            if auc_names[i] not in CACHE['seen']:
                new_aucs.append((auc_nums[i], auc_names[i], auc_dates[i]))

        # create folder for auction page html
        if not os.path.exists(utils.SUPER_HTML_DIR):
            os.makedirs(utils.SUPER_HTML_DIR)

        # pull uncached pages
        for num, name, date in new_aucs:
            out_path = utils.SUPER_HTML_DIR + name + ".html"

            if not os.path.exists(out_path):
                await asyncio.sleep(cls.SCRAPE_DELAY)
                auc_html = await get_html(cls.HOME_BASE_LINK + name, session)
                if "Auction ended" not in auc_html:
                    continue  # ignore ongoing

                with open(out_path, "w", encoding='utf-8') as f:
                    f.write(auc_html)

                tmp = name.replace("itemlist", "")
                CACHE['seen'].append(name)
                CACHE['num_map'][tmp] = num
                CACHE['time_map'][tmp] = date

        # update cache
        if new_aucs:
            CACHE['seen'].sort(reverse=True)
            utils.dump_json(CACHE, utils.SUPER_CACHE_FILE)

        # true if new auctions found
        return bool(new_aucs)
def train_linear_model(df_train_x, df_train_y, df_test_x, df_test_y,
                       model="ridge", over_sample=50, outputdir="."):
    # .to_numpy() replaces the long-deprecated DataFrame.as_matrix()
    train_x = df_train_x.to_numpy()
    train_y = df_train_y.to_numpy()[:, 0]
    test_x = df_test_x.to_numpy()
    test_y = df_test_y.to_numpy()[:, 0]
    print("train x and y shape: ", train_x.shape, train_y.shape)
    print("test x and y shape: ", test_x.shape, test_y.shape)

    train_x, train_y = over_sample_train_data(train_x, train_y,
                                              threshold=4.5,
                                              over_sample=over_sample)

    if model.lower() == "ridge":
        info = train_ridge_linear_model(train_x, train_y, test_x,
                                        sample_weight=None)
    elif model.lower() == "lasso":
        info = train_lasso_model(train_x, train_y, test_x)
    elif model.lower() == "en":
        info = train_EN_model(train_x, train_y, test_x)
    else:
        raise ValueError("Error in model name: %s" % model)

    print("test_y and test_y_pred: ", test_y.shape, info["y"].shape)
    _mse = mean_squared_error(test_y, info["y"])
    _std = np.std(test_y - info["y"])
    print("MSE on test data: %f" % _mse)
    print("std of error on test data: %f" % _std)
    print("np mse: %f" % (((test_y - info["y"])**2).mean()))

    file_prefix = "%s.over_sample_%d" % (model, over_sample)
    figname = os.path.join(outputdir, "%s.png" % file_prefix)
    print("save figure to file: %s" % figname)
    plot_y(train_y, info["train_y"], test_y, info["y"], figname=figname)

    content = {
        "features": list(df_train_x.columns),
        "coef": list(info["coef"]),
        "train_y": list(train_y),
        "train_y_pred": list(info["train_y"]),
        "test_y": list(test_y),
        "test_y_pred": list(info["y"]),
        "MSE": _mse,
        "error_std": _std
    }
    outputfn = os.path.join(outputdir, "%s.json" % file_prefix)
    print("save results to file: %s" % outputfn)
    dump_json(content, outputfn)
def parse_all():
    cats = read_json(data_dir + 'cat.json')
    for cat_title, cat_url in cats.items():
        styles = read_json(data_dir + cat_title + '.json')
        for title, url in styles.items():
            filepath = data_dir + cat_title + '_' + title + '.json'
            if not os.path.exists(filepath):
                style_title, desc = parse_desc(URL + url)
                dump_json(desc, filepath)
def make_dataset(owner, repo, dataset_dir, labels):
    issues = get_all_issues_in_repo(owner, repo, labels=labels)
    commits = get_all_commits_in_repo(owner, repo)
    data = {
        'issues': issues,
        'commits': commits,
    }
    dump_path = os.path.join(dataset_dir, f'{owner}_{repo}.json')
    dump_json(data, dump_path)
def dropbox_deauth():
    # stop updating from dropbox
    _DB_CLIENT.stop()

    # write empty config
    utils.dump_json(_PTH_DB_CLIENT_CONFIG, {})
    set_config(_CONFIG_KEY_DROPBOX_CONFIGURED, False)

    return redirect("admin")
def get_no_stress_filelist(filelist_json, out_json):
    wavname_to_text = load_json(filelist_json)
    no_stress_wavname_to_text = {}
    for wavname in wavname_to_text:
        if "_processed" not in wavname:
            no_stress_wavname_to_text[wavname] = wavname_to_text[wavname]
    print(len(no_stress_wavname_to_text), "wavs without stress out of",
          len(wavname_to_text))
    dump_json(no_stress_wavname_to_text, out_json)
def main():
    config = read_json("config.json")
    if "client_id" not in config:
        raise ValueError("No client id in config.json.")
    if "device_code" not in config:
        config = get_device_code(config)
    if "authorization_code" not in config:
        config = get_authorization_code(config)
    dump_json(config, "config.json")
    print("Authorization ready.")
def _add_instance(server):
    """
    Given a server dict from the layout, merge configuration from the YAML
    into the defaults provided in defaults.py.

    Returns a dict that looks like {label: data}
    """
    try:
        label = server['label']
    except KeyError:
        raise ConfigurationError('Server without a label encountered! '
                                 'Data was:\n'
                                 '{}'.format(utils.dump_yaml(server)))

    # Apply extra config from yaml file
    extra_info = server.get('instance_info', {})
    filtered_info = _filter_info(extra_info)
    instance_data = utils.update_recursive(DEFAULT_INSTANCE, filtered_info)
    log.debug(utils.dump_json(instance_data))

    # Logic for special cases
    az = extra_info.get('availability_zone', 'us-east-1a')
    instance_data['subnet_id'] = \
        '${{terraform_remote_state.subnets.output.{}}}'.format(az)
    instance_data['tags'].update({'label': label})
    instance_data['tags'].update({'id': label + '_${var.run_id}'})

    ebs_data = _get_ebs_block_devices(extra_info)
    log.debug(utils.dump_json(ebs_data))
    instance_data.update(ebs_data)

    provisioners = _get_provisioners(extra_info, label)
    log.debug(utils.dump_json(provisioners))
    instance_data.update(provisioners)

    security_groups = _get_instance_security_groups(extra_info)
    log.debug(utils.dump_json(security_groups))
    instance_data.update(security_groups)

    return {label: instance_data}
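# Hypothetical illustration (not part of the source): a minimal layout entry
# of the shape _add_instance() expects, and how a layout's servers could be
# collected into one {label: data} mapping. All field values are assumptions.
def example_collect_instances():
    layout = [
        {'label': 'web-01',
         'instance_info': {'availability_zone': 'us-east-1a'}},
        {'label': 'db-01',
         'instance_info': {'availability_zone': 'us-east-1b'}},
    ]
    instances = {}
    for server in layout:
        instances.update(_add_instance(server))  # each call returns {label: data}
    return instances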
def migrate(self):
    """
    This will migrate an existing melange database to a new quark database.
    Below melange is referred to as m and quark as q.
    """
    totes = 0.0
    flush_db()
    totes += self.do_and_time("migrate networks, "
                              "subnets, routes, and ips",
                              self.migrate_networks)
    totes += self.do_and_time("migrate ports",
                              self.migrate_interfaces)
    totes += self.do_and_time("associating ips with ports",
                              self.associate_ips_with_ports)
    totes += self.do_and_time("migrate macs and ranges",
                              self.migrate_macs)
    totes += self.do_and_time("migrate policies",
                              self.migrate_policies)
    totes += self.do_and_time("commit changes",
                              self.migrate_commit)
    self.log.info("TOTAL: {0:.2f} seconds.".format(totes))
    dump_json(self.json_data)
def execute(self, command, params):
    """Send a command to the remote server.

    Any path substitutions required for the URL mapped to the command
    should be included in the command parameters.

    Args:
        command - A string specifying the command to execute.
        params - A dictionary of named parameters to send with the command
            as its JSON payload.
    """
    command_info = self._commands[command]
    assert command_info is not None, 'Unrecognised command %s' % command
    data = utils.dump_json(params)
    path = string.Template(command_info[1]).substitute(params)
    url = '%s%s' % (self._url, path)
    return self._request(url, method=command_info[0], data=data)
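# Hypothetical usage sketch (not part of the source): the command name and
# parameter key below are assumptions; the real values depend on how
# self._commands and its URL templates are populated.
def example_execute_usage(client):
    # e.g. _commands might map 'get_status' to ('GET', '/session/$sessionId/status')
    return client.execute('get_status', {'sessionId': 'abc123'})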
    return misfits


def construct_filelist(base, eventlist):
    filelist = []
    for event in eventlist:
        filelist.append(os.path.join(base, "%s.adjoint.misfit.json" % event))
    return filelist


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', action='store', dest='event_file',
                        required=True, help="event list file")
    args = parser.parse_args()

    base = "/lustre/atlas/proj-shared/geo111/rawdata/asdf/adjsrc/sum"
    eventlist = read_txt_into_list(args.event_file)
    print("Number of event: %d" % len(eventlist))
    filelist = construct_filelist(base, eventlist)
    print("filelist: %s" % filelist)

    misfits = sum_adjoint_misfits(filelist)
    outputfile = os.path.join(base, "adjoint_misfit.summary.json")
    print("output json file: %s" % outputfile)
    dump_json(misfits, outputfile)
import utils
from timer import Timer

myTimer = Timer()
myMiner = Miner()

myTimer.start_timer()

print "\nMining for negative patterns\n"
data = utils.load_json("Train/neg_pos.txt")
for index, full_text in data.iteritems():
    myMiner.mine_text(full_text)
utils.dump_json(myMiner.found, "Train/negative_POS_dict.json")
print "\nDone mining for negative patterns\n"

myMiner.reset()

print "\nMining for positive patterns\n"
data = utils.load_json("Train/pos_pos.txt")
for index, full_text in data.iteritems():
    myMiner.mine_text(full_text)
utils.dump_json(myMiner.found, "Train/positive_POS_dict.json")
print "\nDone mining for positive patterns\n"
def write_data(data, dest):
    with open(dest, 'w') as f:
        f.write(utils.dump_json(data))