def add_game(self, game: Game):
    """Adds game to the game pool of the current generation.

    The pool file name is built from the current process id and the current
    generation. If that file does not exist yet, an empty pool file is
    created first. Appending is attempted up to 10 times, pausing 2 seconds
    between attempts.

    Args:
        game: Game to be added to the game pool.

    Raises:
        Exception: If the game could not be appended after all attempts.
            The last underlying error is chained as the cause.
    """
    gen = self.current_generation
    pool_file_path = (self.__current_gen_pool_folder_path + 'pool_'
                      + str(os.getpid()) + '_' + str(gen) + '.npz')

    if not Path(pool_file_path).is_file():
        logging.info(
            "Pool file does not exist. Creating new pool file now...")
        save_as_file(pool_file_path, np.empty((0, 2), dtype='U51'))

    # try 10 times to add the game to the game pool
    trials = 10
    last_error = None
    for attempt in range(1, trials + 1):
        try:
            append_to_file(pool_file_path, game)
            return
        except Exception as error:
            # Broad on purpose: which errors append_to_file can raise is not
            # visible here, and every failure is retried the same way.
            last_error = error
            logging.warning(
                "Couldn't add game to game pool. Trying again...")
            # No point in waiting once the last attempt has failed.
            if attempt < trials:
                time.sleep(2)

    raise Exception(
        "Couldn't add game to game pool '{}' after {} attempts.".format(
            pool_file_path, trials)) from last_error
def _create_submission_file(batch_size, features_lookup, batch_predict_func):
    """Write the submission file: a header line, then one row per prediction.

    Lines of SUBMISSION_INFO_FILE are read in batches of ``batch_size``.
    Each batch is passed to ``batch_predict_func`` together with
    ``features_lookup``; the returned predictions and indices are formatted
    as ``index,prediction`` rows and appended to SUBMISSION_FILE.
    """
    header = ["index,sameArtist\n"]
    append_to_file(header, SUBMISSION_FILE)

    row_template = "{:s},{:f}\n"
    for batch in read_lines_in_batches(SUBMISSION_INFO_FILE, batch_size):
        predictions, example_ids = batch_predict_func(batch, features_lookup)
        rows = []
        for example_id, prediction in zip(example_ids, predictions):
            rows.append(row_template.format(example_id, prediction))
        append_to_file(rows, SUBMISSION_FILE)
def test_append_to_file(self):
    """Appended text must directly follow the text written before it."""
    initial_text = "some wicked cool stuff"
    appended_text = "even more test data"
    expected_content = initial_text + appended_text

    utils.write_to_file(self.test_file, initial_text)
    utils.append_to_file(self.test_file, appended_text)

    with open(self.test_file, "r") as handle:
        actual_content = handle.read()
    self.assertEqual(actual_content, expected_content)
def add_question(question):
    """Build a new question record and append it to the questions storage.

    The id, title and message are taken from ``question``; the view and
    vote counters start at 0 and the image field starts empty.
    """
    # NOTE(review): `date` is defined outside this function; whether it holds
    # the current time at call time cannot be seen from here - confirm.
    record = {
        'id': question['id'],
        'submission_time': date.strftime("%c"),
        'view_number': 0,
        'vote_number': 0,
        'title': question['title'],
        'message': question['message'],
        'image': '',
    }
    utils.append_to_file(questions_data, record, QUESTION_HEADER)
def validate_one_epoch(self, validation_data, epoch):
    """Evaluate the model on the validation data and log the outcome.

    A one-line summary with the accuracy and average loss reported by
    ``evaluator.eval_accuracy_and_loss`` is printed. The metrics returned by
    ``evaluator.eval`` (all but the last one) are formatted with
    ``format_list_to_string`` and appended to the validation log file.

    Returns:
        The first entry of the metrics returned by ``evaluator.eval``.
    """
    summary_accuracy, avg_valid_loss = self.evaluator.eval_accuracy_and_loss(
        self.model, validation_data)
    summary_fields = (self.model_type, epoch + 1,
                      summary_accuracy, avg_valid_loss)
    validation_info = ('%20s [%d] validation accuracy: %.4f; loss: %.4f'
                       % summary_fields)

    metrics = self.evaluator.eval(self.model, self.model_type,
                                  validation_data)
    logged_fields = ['validation', metrics[:-1]]
    full_validation_info = format_list_to_string(logged_fields, '\t')

    # The returned accuracy comes from the full metrics, not from the
    # summary call above.
    accuracy = metrics[0]

    print(validation_info)
    append_to_file(self.valid_log_file, full_validation_info)
    return accuracy
def _average_write_next_batch(lines_gens, weights):
    """Average the next batch of every generator and append it to the output.

    One batch is pulled from each generator in ``lines_gens``. Batches are
    combined position by position: element 0 of the first generator's line
    is used as the example index, and element 1 of every line is converted
    to float and combined into a weighted average using ``weights``.

    A ``StopIteration`` raised by an exhausted generator propagates to the
    caller.
    """
    batches = []
    for lines_gen in lines_gens:
        batches.append(next(lines_gen))

    weight_total = sum(weights)
    row_template = "{:s},{:f}\n"
    averaged_rows = []
    for lines_for_example in zip(*batches):
        example_index = lines_for_example[0][0]
        predictions = [float(line[1]) for line in lines_for_example]
        weighted_sum = sum(
            weight * prediction
            for weight, prediction in zip(weights, predictions))
        averaged_rows.append(
            row_template.format(example_index, weighted_sum / weight_total))

    append_to_file(averaged_rows, SUBMISSION_FILE)
def check_mentions(self):
    """Subscribe or unsubscribe users based on the three latest mentions.

    A mention containing "stop" unsubscribes: with exactly three words the
    third word is removed from the e-mail list, otherwise the author's
    screen name is removed from the Twitter name list. A mention containing
    "sign up" subscribes: with more than three words the fourth word is
    added to the e-mail list, otherwise the author's screen name is added to
    the Twitter name list. Tweepy errors are written to the log.
    """

    def _unsubscribe(list_path, entry):
        # Rewrite the whole list without the first occurrence of `entry`.
        entries = utils.open_file(list_path).split()
        if entry in entries:
            entries.remove(entry)
            utils.write_to_file(list_path, ' '.join(entries))

    def _subscribe(list_path, entry):
        # NOTE(review): the entry is appended as-is; whether append_to_file
        # adds a separator between entries is not visible here - confirm.
        if entry not in utils.open_file(list_path).split():
            utils.append_to_file(list_path, entry)

    try:
        for mention in self.api.mentions_timeline(count=3):
            lowered_text = mention.text.lower()
            words = mention.text.split()

            if "stop" in lowered_text:
                if len(words) == 3:
                    # Unsubscribe for email
                    _unsubscribe(EMAILS, words[2])
                else:
                    # Unsubscribe for Twitter handle
                    _unsubscribe(TWITTER_NAMES, mention.user.screen_name)
            elif "sign up" in lowered_text:
                if len(words) > 3:
                    # Email sign up
                    _subscribe(EMAILS, words[3])
                else:
                    # Twitter handle sign up
                    _subscribe(TWITTER_NAMES, mention.user.screen_name)
    except tweepy.TweepError as error:
        utils.write_to_log(f'Error checking mentions: {error}')
def _average_submission_files():
    """Merge the files listed in FILES_TO_AVG into one averaged submission.

    For every ``file name -> weight`` entry a batch reader is opened on the
    file inside DATA_DIR. After the header line has been written, batches
    are averaged and appended until a reader raises ``StopIteration``.
    """
    entries = list(FILES_TO_AVG.items())
    weights = [weight for _, weight in entries]
    lines_gens = [
        read_lines_in_batches(join(DATA_DIR, file_name),
                              batch_size=BATCH_SIZE)
        for file_name, _ in entries
    ]

    append_to_file(["index,sameArtist\n"], SUBMISSION_FILE)

    try:
        while True:
            _average_write_next_batch(lines_gens, weights)
    except StopIteration:
        # One of the readers is exhausted: nothing left to average.
        return
def add_answer(answer, question_id):
    """Append a new answer for ``question_id`` to the answers storage.

    The new id is 1 when no answers are stored yet, otherwise one more than
    the id of the last stored answer. The vote counter starts at 0 and the
    image field starts empty.
    """
    stored_answers = utils.open_file(answers_data, ANSWER_HEADER)
    if len(stored_answers):
        next_id = int(stored_answers[-1]['id']) + 1
    else:
        next_id = 1

    # NOTE(review): `date` is defined outside this function; whether it holds
    # the current time at call time cannot be seen from here - confirm.
    record = {
        'id': next_id,
        'submission_time': date.strftime("%c"),
        'vote_number': 0,
        'question_id': question_id,
        'message': answer,
        'image': "",
    }
    utils.append_to_file(answers_data, record, ANSWER_HEADER)
def generate_data_from_file(self, file_name, output_folder, should_subtokenize=False):
    """Append the features parsed from ``file_name`` to the output files.

    The tokens, API calls, names and javadocs returned by ``parse_file`` are
    appended to their respective files inside ``output_folder``. Nothing is
    written when any of the four collections is empty.

    Returns:
        The number of token entries written, or 0 when nothing was written.
    """
    names_path = output_folder + '/' + METHOD_NAME_FILE_NAME
    api_path = output_folder + '/' + METHOD_API_FILE_NAME
    tokens_path = output_folder + '/' + METHOD_TOKENS_FILE_NAME
    javadoc_path = output_folder + '/' + JAVADOC_FILE_NAME

    tokens, api_calls, names, javadocs, _unused_body = self.parse_file(
        file_name, should_subtokenize=should_subtokenize)

    # All four feature collections are required; skip the file otherwise.
    extracted = (tokens, api_calls, names, javadocs)
    if any(len(items) == 0 for items in extracted):
        return 0

    destinations = (tokens_path, api_path, names_path, javadoc_path)
    for items, destination in zip(extracted, destinations):
        append_to_file(items, destination)
    return len(tokens)
def migrate_repo(path):
    """Perform migration to repo on given path.

    Rewrites the Travis CI related configuration of the repository at
    ``path`` for GitHub Actions: renders the PyPI publish workflow when the
    upstream ``.travis.yml`` deploys to PyPI, updates badges and links in
    README/CONTRIBUTING, generates ``tests.yml`` and ``run-tests.sh``,
    adjusts ``pytest.ini``, ``MANIFEST.in`` and ``setup.py``, and finally
    deletes ``.travis.yml`` and the ``*.bak`` files.

    Args:
        path: Path of the repository, with or without a trailing slash. Its
            last component is used as the repository name.
    """
    click.secho(f"\n>>> Migrating {path}...", fg="green")

    # Strip trailing slashes before taking the last component; otherwise a
    # path such as "repos/invenio-app/" would yield an empty repo name.
    repo = path.rstrip("/").split("/")[-1]
    repo_underscores = repo.replace("-", "_")
    if path[-1] != "/":
        path += "/"

    # pypi-publish.yml
    travis_url = (
        f"https://raw.githubusercontent.com/inveniosoftware/{repo}/master/.travis.yml"
    )
    travis = read_yaml_from_url(travis_url)
    deploy = (travis.get("deploy") or {}) if travis else {}
    if deploy.get("provider") == "pypi":
        # `distributions` may be absent; treat that as "no compile_catalog"
        # instead of failing on `in None`.
        distributions = deploy.get("distributions") or ""
        has_compile_catalog = "compile_catalog" in distributions
        logging.info(
            f"Has `compile_catalog` in travis.yml?: {has_compile_catalog}")
        render_and_copy_template(
            "pypi-publish.yml",
            {"has_compile_catalog": has_compile_catalog},
            f"{path}.github/workflows",
        )

    # .editorconfig
    # NOTE(review): unlike the other helpers called here, the repo path is
    # prefixed to the first argument and not to ".editorconfig" - confirm
    # against replace_simple's signature.
    replace_simple(path + ".travis.yml", ".github/workflows/*.yml", ".editorconfig")

    # README.rst
    replace_regex(
        r"https:\/\/img\.shields\.io\/travis\/([a-z]*\/[a-z-]*)\.svg",
        "https://github.com/\\1/workflows/CI/badge.svg",
        path + "README.rst",
    )
    replace_regex(
        r"https:\/\/travis-ci\.org\/([a-z]*\/[a-z-]*)",
        "https://github.com/\\1/actions?query=workflow%3ACI",
        path + "README.rst",
    )

    # CONTRIBUTING.rst
    replace_regex(
        r"https:\/\/travis-ci\.(org|com)\/([a-z]*\/[a-z-]*)\/pull_requests",
        "https://github.com/\\2/actions?query=event%3Apull_request",
        path + "CONTRIBUTING.rst",
    )

    # tests.yaml
    build_template(repo, "tests.yml", dest_path=f"{path}.github/workflows")

    # run-tests.sh
    build_template(repo, "run-tests.sh", dest_path=path)

    # pytest.ini
    delete_line("pep8ignore", path + "pytest.ini")
    replace_regex(
        "(addopts =).*",
        f'\\1 --isort --pydocstyle --pycodestyle --doctest-glob="*.rst" --doctest-modules --cov={repo_underscores} --cov-report=term-missing',
        path + "pytest.ini",
    )
    if not file_contains("testpaths", path + "pytest.ini"):
        append_to_file(f"testpaths = tests {repo_underscores}",
                       path + "pytest.ini")

    # Add .github/workflows *.yml to MANIFEST.in
    add_line("recursive-include .github/workflows *.yml\n",
             path + "MANIFEST.in")

    # Delete travis file
    delete_file(path + ".travis.yml")

    # Upgrade from any Sphinx version to 3 in setup.py
    replace_regex(
        # Quote - package name, version (separated with commas) range - Quote
        r"(\"|')(Sphinx.*)(\"|')",
        "Sphinx>=3",
        path + "setup.py",
        # Replace the second matching group only (excludes the outer quotes)
        2)

    # Simplify setup.py test requirements replacing them with pytest-invenio
    replace_list(
        path + "setup.py",
        r"tests_require = (['\"\'[\s*\"(a-z-A-Z><=0-9.\[\]),]*])",
        [
            # Remove packages already installed by pytest-invenio
            "check-manifest",
            "coverage",
            "docker-services-cli",
            "pytest-celery",
            "pytest-cov",
            "pytest-flask",
            "pytest-isort",
            "pytest-pycodestyle",
            "pytest-pydocstyle",
            "pydocstyle",
            "pytest",
            "selenium",
            # pytest-pep8 is replaced by pytest-pycodestyle
            "pytest-pep8",
            # isort is replaced by pytest-isort
            "isort",
        ],
        ["pytest-invenio>=1.4.0"],
        "tests_require",
    )

    # Remove bak files
    delete_file(path + "*.bak")
def write_train_loss(self, epoch, avg_train_loss):
    """Print the average training loss of an epoch and append it to the log.

    The epoch is reported one-based (``epoch + 1``) together with the model
    type; the same line goes to stdout and to the training log file.
    """
    fields = (self.model_type, epoch + 1, avg_train_loss)
    loss_info = '%20s [%d] training loss: %.4f' % fields
    print(loss_info)
    append_to_file(self.train_log_file, loss_info)
def convert_to_lattice(self, wordvec_dict, subword_embedding, dst_dir, log,
                       dec_tree, ignore_time_seg,
                       processed_file_list_path=None, embed_apostrophe=False,
                       keep_pronunciation=True,
                       uniform_subword_durations=False):
    """Convert confusion network object to lattice `.npz` format.

    Every arc becomes one edge vector made of the word embedding followed by
    the arc duration and a confidence value. The result is saved as
    ``<dst_dir>/<self.name>.npz``.

    Args:
        wordvec_dict: Mapping from word to embedding vector. It must contain
            ``'<hes>'``, whose shape is used for zero vectors.
        subword_embedding: Passed to ``utils.get_grapheme_info``.
        dst_dir: Destination directory; created through ``utils.mkdir``.
        log: If true, the confidence stays in the log domain; otherwise the
            exponential of the stored value is used.
        dec_tree: Path to a pickled object providing ``conv_value``, or the
            string ``'NONE'`` to leave confidences unmapped.
        ignore_time_seg: ``False``, or a path loadable with ``np.load`` that
            holds an ``'ignore'`` entry mapping a name to (start, end) pairs.
        processed_file_list_path: Optional file to which the absolute path
            of the written ``.npz`` is appended.
        embed_apostrophe: Passed to ``utils.get_grapheme_info``.
        keep_pronunciation: Passed to ``utils.get_grapheme_info``.
        uniform_subword_durations: Passed to ``utils.get_grapheme_info``.

    Returns:
        The set of words that were not found in ``wordvec_dict``.
    """
    oov = set()
    utils.mkdir(dst_dir)

    # `ignore_time_seg` is either False or a path, hence the explicit
    # comparison with False instead of a truthiness test.
    if ignore_time_seg != False:
        ignore_time_seg_dict = np.load(ignore_time_seg)
        ignore_time_seg_dict = ignore_time_seg_dict['ignore'][()]
    else:
        ignore_time_seg_dict = False

    topo_order = list(range(self.num_sets + 1))
    cum_sum = np.cumsum([0] + self.num_arcs)
    assert cum_sum[-1] == len(self.cn_arcs), "Wrong number of arcs."

    edge_data = []
    start_times = []
    grapheme_data = []
    child_2_parent = {}
    parent_2_child = {}
    ignore = []

    use_dec_tree = dec_tree != 'NONE'
    if use_dec_tree:
        # NOTE: pickle.load can execute arbitrary code; only pass decision
        # tree files from a trusted source.
        with open(dec_tree, 'rb') as dec_tree_file:
            decision_tree = pickle.load(dec_tree_file)

    for i in range(self.num_sets):
        # Set i links node i to node i + 1 through all arcs of that set.
        parent_2_child[i] = {
            i + 1: list(range(cum_sum[i], cum_sum[i + 1]))
        }
        child_2_parent[i + 1] = {
            i: list(range(cum_sum[i], cum_sum[i + 1]))
        }
        for j in range(cum_sum[i], cum_sum[i + 1]):
            edge_info = self.cn_arcs[j]
            word = edge_info[0]

            if word == '!NULL':
                wordvec = np.zeros_like(wordvec_dict['<hes>'])
            elif word in wordvec_dict:
                wordvec = wordvec_dict[word]
            else:
                oov.add(word)
                # Use a zero embedding for out-of-vocabulary words. Without
                # this, `wordvec` would still hold the previous arc's
                # embedding (or be undefined on the very first arc).
                wordvec = np.zeros_like(wordvec_dict['<hes>'])

            # edge_info[3] is exponentiated before the decision tree and for
            # the non-log output, i.e. it is treated as a log-domain value.
            if use_dec_tree:
                conf = decision_tree.conv_value(np.exp(edge_info[3]))
                if log:
                    conf = np.log(conf)
            elif log:
                conf = edge_info[3]
            else:
                conf = np.exp(edge_info[3])

            edge_vec = np.concatenate(
                (wordvec, np.array([edge_info[2] - edge_info[1], conf])),
                axis=0)
            start_times.append(edge_info[1])
            edge_data.append(edge_vec)

            if word in ['<s>', '</s>', '!NULL', '<hes>']:
                ignore.append(j)
            elif ignore_time_seg != False:
                file_name = self.name
                name = file_name.split("_")[0]
                # The second-to-last "_" field of the name is divided by 100
                # and added to the arc times before comparing them with the
                # ignore segments.
                start_frame = float(file_name.split("_")[-2]) / 100
                start_time = edge_info[1] + start_frame
                end_time = edge_info[2] + start_frame
                if name in ignore_time_seg_dict:
                    for tup in ignore_time_seg_dict[name]:
                        if start_time > tup[0] and end_time < tup[1]:
                            ignore.append(j)

            # Deal with any grapheme data if required:
            if self.has_graphemes:
                # Grapheme info belongs to the current arc (index j); the
                # set index i only matches it when every set has one arc.
                grapheme_feature_array = utils.get_grapheme_info(
                    grapheme_info=edge_info[4],
                    subword_embedding_dict=subword_embedding,
                    apostrophe_embedding=embed_apostrophe,
                    keep_pronunciation=keep_pronunciation,
                    uniform_durations=uniform_subword_durations)
                grapheme_data.append(grapheme_feature_array)

    npz_file_name = os.path.join(dst_dir, self.name + '.npz')
    if self.has_graphemes:
        # Pad every grapheme sequence to the longest one so that all arcs
        # fit into a single masked array.
        max_grapheme_seq_length = utils.longest_grapheme_sequence(
            grapheme_data)
        padded_grapheme_data = np.empty(
            (len(grapheme_data), max_grapheme_seq_length,
             utils.len_subword_features()))
        mask = np.empty_like(padded_grapheme_data, dtype=bool)
        for arc_num, grapheme_seq in enumerate(grapheme_data):
            padded_grapheme_data[arc_num, :, :], mask[arc_num, :, :] = \
                utils.pad_subword_sequence(grapheme_seq,
                                           max_grapheme_seq_length)
        masked_grapheme_data = ma.masked_array(padded_grapheme_data,
                                               mask=mask,
                                               fill_value=-999999)
        np.savez(npz_file_name,
                 topo_order=topo_order,
                 child_2_parent=child_2_parent,
                 parent_2_child=parent_2_child,
                 edge_data=np.asarray(edge_data),
                 ignore=ignore,
                 grapheme_data=masked_grapheme_data,
                 start_times=start_times)
    else:
        np.savez(npz_file_name,
                 topo_order=topo_order,
                 child_2_parent=child_2_parent,
                 parent_2_child=parent_2_child,
                 edge_data=np.asarray(edge_data),
                 ignore=ignore,
                 start_times=start_times)

    if processed_file_list_path is not None:
        utils.append_to_file(os.path.abspath(npz_file_name),
                             processed_file_list_path)
    return oov
def learn(solver_path, snapshot_path, iters_to_init, max_samples_to_use):
    """Train a network while repeatedly choosing new samples to train on.

    Temporary copies of the solver, net and training database are prepared,
    the network is initialised on the samples extracted for the first
    ``iters_to_init`` iterations, and then, for every epoch, samples are
    chosen with ``samples.choose_active``, written to the temporary database
    and trained on, resuming from the latest snapshot.

    Args:
        solver_path: Path of the solver definition; the net and the training
            database are looked up from it.
        snapshot_path: Passed to ``proto.prepare_solver``.
        iters_to_init: Number of iterations used to initialise the network.
        max_samples_to_use: Upper bound used when computing how many new
            samples may still be chosen.
    """
    net_path = proto.get_net_from_solver(solver_path)
    train_db_path = proto.get_db_from_net(net_path)
    train_db_len = db.size(train_db_path)

    # prepare path to the temporary model files
    active_solver_path = utils.create_temp_path(solver_path + config.POSTFIX)
    active_net_path = utils.create_temp_path(net_path + config.POSTFIX)
    active_db_path = utils.create_temp_path(train_db_path + config.POSTFIX)

    # prepare temporary model files
    proto.prepare_net(net_path, active_net_path, active_db_path)
    snapshot_prefix, snapshot_iter = proto.prepare_solver(
        solver_path, active_solver_path, active_net_path, snapshot_path, iters_to_init
    )
    print snapshot_prefix

    # recover the snapshot folder
    snapshot_path = '/'.join(snapshot_prefix.split('/')[:-1])
    epoch_file = os.path.join(snapshot_path, config.EPOCH_FILE)

    # deploy net
    # deploy_net = net.Net(active_net_path, output_layers=config.OUTPUT_LAYERS)
    deploy_net = net.DropoutNet(active_net_path, config.DROPOUT_ITERS, aggregate='mean', output_layers=config.OUTPUT_LAYERS, output_processor=utils.softmax)

    # `dataset` receives this very set object; it is cleared and refilled in
    # place at the start of every epoch below.
    epoch_used_samples = set()
    dataset = samples.Dataset(train_db_path, deploy_net.batch_size, epoch_used_samples)

    # initialize net
    # solverstate_path = proto.solverstate_path(snapshot_prefix, iters_to_init)
    # if not os.path.exists(solverstate_path):
    # Start from a clean temporary database.
    if os.path.exists(active_db_path):
        shutil.rmtree(active_db_path)
    # shutil.copytree(train_db_path, active_db_path)
    used_samples = db.extract_samples(train_db_path, active_db_path, iters_to_init * deploy_net.batch_size)
    init_network(active_solver_path)

    # do the real learning
    print 'train samples:', train_db_len
    for epoch in xrange(config.MAX_EPOCHS):
        print 'Epoch #{0}'.format(epoch)
        # Every epoch starts again from the initialisation samples.
        epoch_used_samples.clear()
        epoch_used_samples.update(used_samples)

        while len(epoch_used_samples) < train_db_len:
            # NOTE(review): this only leaves the inner loop; the epoch loop
            # keeps running - confirm that is intended.
            if snapshot_iter > config.MAX_ITER:
                break

            solverstate_path = proto.solverstate_path(snapshot_prefix, snapshot_iter)
            caffemodel_path = proto.caffemodel_path(snapshot_prefix, snapshot_iter)

            print 'Using snapshot iter #{0}'.format(snapshot_iter)
            deploy_net.load_model(caffemodel_path)

            # active_samples = samples.choose_active(deploy_net, dataset, config.BATCHES_PER_RUN)
            # epoch_used_samples.update(active_samples)
            # assert len(active_samples) <= int(max(active_samples)), \
            #     'Index of the highest sample is lower than the number of used samples'
            # # check if it makes sense to continue
            # iters_to_do = len(active_samples) / deploy_net.batch_size
            # if iters_to_do == 0:
            #     break

            num_samples_to_choose = min(max_samples_to_use - len(epoch_used_samples), config.NEW_SAMPLES_PER_ITER)
            # NOTE(review): `/` presumably relies on Python 2 integer
            # division here and below - confirm the operand types.
            batches_to_choose = num_samples_to_choose / deploy_net.batch_size
            chosen_samples = samples.choose_active(deploy_net, dataset, batches_to_choose)
            # Train on the newly chosen samples plus everything used so far
            # in this epoch.
            active_samples = chosen_samples + list(epoch_used_samples)
            epoch_used_samples.update(chosen_samples)
            print 'Used {} samples'.format(len(epoch_used_samples))

            # check if it makes sense to continue
            iters_to_do = len(active_samples) / deploy_net.batch_size
            if iters_to_do == 0:
                break

            db.extract_samples(train_db_path, active_db_path, active_samples)
            proto.increase_max_iters(active_solver_path, iters_to_do)
            train_network(active_solver_path, solverstate_path)
            snapshot_iter += iters_to_do

        # Record "<snapshot iteration>:<number of used samples>" per epoch.
        utils.append_to_file(epoch_file, '{}:{}'.format(snapshot_iter, len(epoch_used_samples)))
def process_one_lattice(lattice_path, dst_dir, wordvec, subword_embedding,
                        embed_apostrophe, uniform_subword_durations=False,
                        processed_file_list_path=None):
    """Process a lattice and save it in `*.npz` format.

    Nothing is done when the target `.npz` file already exists. An
    ``OSError`` raised while processing is logged and not re-raised.

    Arguments:
        lattice_path: String containing the absolute path to lattices
            `.lat.gz`
        dst_dir: Absolute path to destination directory as a string
        wordvec: The word vector dictionary obtained by calling
            `load_wordvec`
        subword_embedding: Dictionary with subword embeddings
        embed_apostrophe: Boolean indicator of whether to embed apostrophes
            separately.
        uniform_subword_durations: Not used inside this function.
        processed_file_list_path: Optional file to which the absolute path
            of the written `.npz` file is appended.
    """
    special_words = ['<s>', '</s>', '!NULL', '<hes>']
    file_name = lattice_path.split('/')[-1].split('.')[0] + '.npz'
    print('Processing {}'.format(file_name))
    try:
        LOGGER.info(file_name)
        npz_path = os.path.join(dst_dir, file_name)
        if not os.path.isfile(npz_path):
            (nodes, edges, dependency, child_2_parent, parent_2_child,
             grapheme_data) = read_lattice(lattice_path, subword_embedding,
                                           embed_apostrophe)
            topo_order = toposort_flatten(dependency)

            # for each edge, the information contains
            # [EMBEDDING_LENGTH, duration(1), AM(1), LM(1), arc_posterior(1)]
            edge_data = np.empty((len(edges), EMBEDDING_LENGTH + 4))
            start_times = []
            ignore = []
            for edge_index, edge in enumerate(edges):
                start_node, end_node = edge[0], edge[1]
                start_times.append(nodes[start_node][0])
                duration = nodes[end_node][0] - nodes[start_node][0]
                word = nodes[end_node][1]

                in_vocabulary = word in wordvec
                if in_vocabulary:
                    embedding = wordvec[word]
                else:
                    embedding = np.zeros(EMBEDDING_LENGTH)
                scores = np.array([duration, edge[2], edge[3], edge[4]])
                edge_data[edge_index] = np.concatenate((embedding, scores),
                                                       axis=0)

                if not in_vocabulary:
                    LOGGER.info('OOV word: {}\n'.format(word))
                    utils.append_to_file(word, 'oov.txt')
                if word in special_words:
                    ignore.append(edge_index)

            # save multiple variables into one .npz file
            np.savez(npz_path,
                     topo_order=topo_order,
                     child_2_parent=child_2_parent,
                     parent_2_child=parent_2_child,
                     edge_data=edge_data,
                     ignore=ignore,
                     grapheme_data=grapheme_data,
                     start_times=start_times)
            if processed_file_list_path is not None:
                utils.append_to_file(os.path.abspath(npz_path),
                                     processed_file_list_path)
    except OSError as exception:
        LOGGER.info('%s\n' % lattice_path + str(exception))
def migrate_repo(path):
    """Perform migration to repo on given path.

    Rewrites the Travis CI related configuration of the repository at
    ``path`` for GitHub Actions: downloads the PyPI publish workflow when
    ``.travis.yml`` deploys to PyPI, updates badges and links in
    README/CONTRIBUTING, generates ``tests.yml`` and ``run-tests.sh``,
    adjusts ``pytest.ini``, ``MANIFEST.in`` and ``setup.py``, and finally
    deletes ``.travis.yml`` and the ``*.bak`` files.

    Args:
        path: Path of the repository, with or without a trailing slash. Its
            last component is used as the repository name.
    """
    click.secho(f"\n>>> Migrating {path}...", fg="green")

    # Strip trailing slashes before taking the last component; otherwise a
    # path such as "repos/invenio-app/" would yield an empty repo name.
    repo = path.rstrip("/").split("/")[-1]
    repo_underscores = repo.replace("-", "_")
    # Add the trailing slash only if needed.
    if not path.endswith("/"):
        path += "/"

    # Reference: https://codimd.web.cern.ch/TOOkF5yhSAKJq3TiY0L42A?view
    travis = read_yaml(path + ".travis.yml")
    try:
        deploys_to_pypi = travis["deploy"]["provider"] == "pypi"
    except (KeyError, TypeError):
        # No usable deploy section in .travis.yml.
        logging.info("Couldn't find deploy key in .travis.yml")
        deploys_to_pypi = False
    if deploys_to_pypi:
        # Download pypi-publish.yml template. This stays outside the try
        # block so that a failed download is not reported as a missing
        # deploy key; .travis.yml is deleted further down, so silently
        # skipping the workflow would lose the deploy setup.
        download_file(
            GA_PYPI_PUBLISH_YAML_URL,
            path + ".github/workflows/pypi-publish.yml",
        )

    # .editorconfig
    # NOTE(review): unlike the other helpers called here, the repo path is
    # prefixed to the first argument and not to ".editorconfig" - confirm
    # against replace_simple's signature.
    replace_simple(path + ".travis.yml", ".github/workflows/*.yml", ".editorconfig")

    # README.rst
    replace_regex(
        r"https:\/\/img\.shields\.io\/travis\/([a-z]*\/[a-z-]*)\.svg",
        "https://github.com/\\1/workflows/CI/badge.svg",
        path + "README.rst",
    )
    replace_regex(
        r"https:\/\/travis-ci\.org\/([a-z]*\/[a-z-]*)",
        "https://github.com/\\1/actions?query=workflow%3ACI",
        path + "README.rst",
    )

    # CONTRIBUTING.rst
    replace_regex(
        r"https:\/\/travis-ci\.(org|com)\/([a-z]*\/[a-z-]*)\/pull_requests",
        "https://github.com/\\2/actions?query=event%3Apull_request",
        path + "CONTRIBUTING.rst",
    )

    # tests.yaml (path already ends with "/", so no extra slash is added)
    build_template(repo, "tests.yml", path=f"{path}.github/workflows")

    # run-tests.sh
    build_template(repo, "run-tests.sh", path=path)

    # pytest.ini
    delete_line("pep8ignore", path + "pytest.ini")
    replace_regex(
        "(addopts =).*",
        f'\\1 --isort --pydocstyle --pycodestyle --doctest-glob="*.rst" --doctest-modules --cov={repo_underscores} --cov-report=term-missing',
        path + "pytest.ini",
    )
    if not file_contains("testpaths", path + "pytest.ini"):
        append_to_file(f"testpaths = tests {repo_underscores}",
                       path + "pytest.ini")

    # Add .github/workflows *.yml to MANIFEST.in
    add_line("recursive-include .github/workflows *.yml\n",
             path + "MANIFEST.in")

    # Delete travis file
    delete_file(path + ".travis.yml")

    # Upgrade Sphinx 1 to 3 in setup.py
    replace_regex(
        r"Sphinx>=1.[0-9].[0-9]",
        "Sphinx>=3",
        path + "setup.py",
    )

    # Simplify setup.py test requirements replacing them with pytest-invenio
    replace_list(
        path + "setup.py",
        r"tests_require = (['\"\'[\s*\"(a-z-A-Z><=0-9.\[\]),]*])",
        [
            # Remove packages already installed by pytest-invenio
            "check-manifest",
            "coverage",
            "docker-services-cli",
            "pytest-celery",
            "pytest-cov",
            "pytest-flask",
            "pytest-isort",
            "pytest-pycodestyle",
            "pytest-pydocstyle",
            "pydocstyle",
            "pytest",
            "selenium",
            # pytest-pep8 is replaced by pytest-pycodestyle
            "pytest-pep8",
            # isort is replaced by pytest-isort
            "isort",
        ],
        ["pytest-invenio>=1.4.0"],
        "tests_require",
    )

    # Remove bak files
    delete_file(path + "*.bak")
def _save_info_about_model(self):
    """Record the model's result line and its per-callback F1 metrics.

    A ``unique id, config name, F1 score`` line is appended to the shared
    results file, and the metrics collected by the F1 score callback are
    appended, one per line, to ``meta.txt`` in the model's own directory.
    """
    results_path = self.config.results_file_path
    summary_line = f'{self.unique_id}, {self.config.name}, {self.f1_score()}'
    append_to_file(results_path, summary_line)

    metrics = '\n'.join(self.f1_score_callback.metrics)
    meta_path = self._unique_model_directory() + '/meta.txt'
    append_to_file(meta_path, metrics)
def _save_configuration(self):
    """Append the formatted configuration to the model's ``model.config``."""
    config_path = self._unique_model_directory() + '/model.config'
    config_text = f'{self.config}'
    append_to_file(config_path, config_text)