def run_actor(
    agent: Agent,
    rng_key: jnp.ndarray,
    get_params: Callable[[], hk.Params],
    enqueue_traj: Callable[[Transition], None],
    unroll_len: int,
    num_trajectories: int,
):
    """Runs an actor to produce num_trajectories trajectories.

    Args:
      agent: Object whose `step(params, key, state)` returns `(action, logits)`.
      rng_key: JAX PRNG key; it is split once per environment step.
      get_params: Called at the start of every unroll to fetch the parameters.
      enqueue_traj: Receives each stacked trajectory.
      unroll_len: Number of environment steps per unroll.
      num_trajectories: Number of trajectories to enqueue before returning.
    """
    env = catch.Catch()
    state = env.reset()
    traj = []

    for i in range(num_trajectories):
        params = get_params()
        # The first rollout is one step longer: later rollouts start with the
        # last timestep of the previous one already in `traj`.
        for _ in range(unroll_len + int(i == 0)):
            rng_key, step_key = jax.random.split(rng_key)
            state = preprocess_step(state)
            action, logits = agent.step(params, step_key, state)
            transition = Transition(state, action, logits)
            traj.append(transition)
            state = env.step(action)
            if state.step_type == dm_env.StepType.LAST:
                logging.log_every_n(logging.INFO,
                                    'Episode ended with reward: %s', 5,
                                    state.reward)

        # Stack and send the trajectory. `jax.tree_multimap` was deprecated and
        # removed from JAX; `tree_map` accepts several trees and is equivalent.
        stacked_traj = jax.tree_util.tree_map(lambda *ts: np.stack(ts), *traj)
        enqueue_traj(stacked_traj)
        # Reset the trajectory, keeping the last timestep.
        traj = traj[-1:]
def _add_tables(input_dir, interaction_dict):
    """Adds table protos to all interactions.

    Every distinct `interaction.table.table_id` is treated as a CSV file name
    relative to `input_dir`. Each file is parsed once into a `Table` (first row
    becomes the columns, the remaining rows become cells) and the result is
    copied into every interaction that references it.

    Args:
      input_dir: Directory containing the table CSV files.
      interaction_dict: Mapping whose values are iterables of interactions;
        the interactions are modified in place.

    Raises:
      IndexError: If a table file contains no rows at all.
    """
    table_files = {
        interaction.table.table_id
        for interactions in interaction_dict.values()
        for interaction in interactions
    }

    table_dict = {}
    for index, table_file in enumerate(sorted(table_files)):
        logging.log_every_n(logging.INFO, 'Read %4d / %4d table files', 100,
                            index, len(table_files))
        table_path = os.path.join(input_dir, table_file)
        # The csv module requires newline="" so that newlines embedded in
        # quoted fields are parsed correctly.
        with open(table_path, "r", newline="") as table_handle:
            rows = list(csv.reader(table_handle))
        headers, rows = rows[0], rows[1:]

        table = Table()
        for header in headers:
            table.columns.add().text = header
        for row in rows:
            new_row = table.rows.add()
            for cell in row:
                new_row.cells.add().text = cell
        table.table_id = table_file
        table_dict[table_file] = table

    for interactions in interaction_dict.values():
        for interaction in interactions:
            interaction.table.CopyFrom(table_dict[interaction.table.table_id])
def main(argv):
    """Runs the tagger over the input file and writes tab-separated predictions.

    Each output line is "<joined sources>\t<prediction>\t<target>".

    Args:
      argv: Command-line arguments; anything beyond the program name is an error.

    Raises:
      app.UsageError: If extra positional arguments are given.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    for required_flag in ('input_file', 'input_format', 'output_file',
                          'label_map_file', 'vocab_file', 'saved_model'):
        flags.mark_flag_as_required(required_flag)

    label_map = utils.read_label_map(FLAGS.label_map_file)
    phrase_vocabulary = (
        tagging_converter.get_phrase_vocabulary_from_label_map(label_map))
    converter = tagging_converter.TaggingConverter(phrase_vocabulary,
                                                   FLAGS.enable_swap_tag)
    builder = bert_example.BertExampleBuilder(label_map, FLAGS.vocab_file,
                                              FLAGS.max_seq_length,
                                              FLAGS.do_lower_case, converter)
    saved_model_predictor = tf.contrib.predictor.from_saved_model(
        FLAGS.saved_model)
    predictor = predict_utils.LaserTaggerPredictor(saved_model_predictor,
                                                   builder, label_map)

    num_predicted = 0
    examples = utils.yield_sources_and_targets(FLAGS.input_file,
                                               FLAGS.input_format)
    with tf.gfile.Open(FLAGS.output_file, 'w') as writer:
        for i, (sources, target) in enumerate(examples):
            logging.log_every_n(
                logging.INFO,
                f'{i} examples processed, {num_predicted} converted to tf.Example.',
                100)
            prediction = predictor.predict(sources)
            writer.write(f'{" ".join(sources)}\t{prediction}\t{target}\n')
            num_predicted += 1
    logging.info(f'{num_predicted} predictions saved to:\n{FLAGS.output_file}')
def get_answer_indexes(
    prediction,
    cell_classification_threshold,
):
    """Computes answer indexes.

    When the prediction carries both "span_indexes" and "span_logits", the
    highest-scoring span is chosen and the input ids inside it (both ends
    inclusive) are returned. Otherwise the input ids of every token whose cell
    probability exceeds `cell_classification_threshold` are returned.
    """
    input_ids = prediction["input_ids"]
    span_indexes = prediction.get("span_indexes")
    span_logits = prediction.get("span_logits")

    if span_indexes is None or span_logits is None:
        # No span head output: fall back to per-token cell probabilities.
        return [
            input_ids[token_index]
            for token_index, prob in get_cell_token_probs(prediction)
            if prob > cell_classification_threshold
        ]

    best_logit, best_span = max(zip(span_logits, span_indexes.tolist()))
    logging.log_every_n(
        logging.INFO,
        "best_span: %s, score: %s",
        500,
        best_span,
        best_logit,
    )
    first_token, last_token = best_span[0], best_span[1]
    return [input_ids[i] for i in range(first_token, last_token + 1)]
async def log_kafka_sample_async(self, topic: str, sample: dict):
    """Logs a sample to a Kafka topic asynchronously (best effort).

    The sample is posted to the Kafka REST endpoint configured in
    `self.cluster_config.MONITORING.KAFKA_REST_ADDRESS`. Nothing is sent when
    that address is the empty string. Failures are logged (rate-limited) and
    never propagated to the caller.

    Column guidelines for `sample`:
      time: epoch in seconds.
      sample_rate: pre-sampled records shall set this to the sample rate,
        e.g. 100 means one sample is logged out of 100.
      Column types shall be int, str, or vector of str.

    Args:
      topic: Kafka topic name.
      sample: JSON-serializable record to log.
    """
    rest_address = self.cluster_config.MONITORING.KAFKA_REST_ADDRESS
    if rest_address == "":
        return

    url = "http://{}/topics/{}".format(rest_address, topic)
    headers = {
        "Content-Type": "application/vnd.kafka.json.v2+json",
        "Accept": "application/vnd.kafka.v2+json",
    }
    try:
        record_data = json.dumps({"records": [{"value": sample}]})
        # Context managers guarantee that the session and the response are
        # released even when serialization or the request itself fails.
        async with aiohttp.ClientSession() as session:
            async with session.post(url, data=record_data,
                                    headers=headers) as response:
                if response.status != 200:
                    # aiohttp exposes the HTTP code as `.status`.
                    raise Exception("non-OK response status code: {}".format(
                        response.status))
    except Exception as ex:
        GLOG.log_every_n(GLOG.ERROR, "Failed to log sample to Kafka: %s", 100,
                         ex)
def gym_env_wrapper(env, rl_env_max_episode_steps, maxskip_env, rendered_env,
                    rendered_env_resize_to, sticky_actions, output_dtype,
                    num_actions):
    """Wraps a gym environment. see make_gym_env for details.

    Wrappers are applied in this order, each only when requested: time-limit
    removal, action discretization, sticky actions, max-and-skip, rendering,
    and finally a fresh time limit.
    """
    # rl_env_max_episode_steps is None or int.
    steps_unset = not rl_env_max_episode_steps
    assert steps_unset or isinstance(rl_env_max_episode_steps, int)

    # A falsy or non-negative step count replaces the existing time limit;
    # a negative count leaves the environment's own limit untouched.
    wrap_with_time_limit = steps_unset or rl_env_max_episode_steps >= 0

    if wrap_with_time_limit:
        env = remove_time_limit_wrapper(env)

    if num_actions is not None:
        logging.log_every_n(logging.INFO, "Number of discretized actions: %d",
                            1, num_actions)
        env = ActionDiscretizeWrapper(env, num_actions=num_actions)

    if sticky_actions:
        env = StickyActionEnv(env)

    if maxskip_env:
        env = MaxAndSkipEnv(env)  # pylint: disable=redefined-variable-type

    if rendered_env:
        env = RenderedEnv(env, resize_to=rendered_env_resize_to,
                          output_dtype=output_dtype)

    if wrap_with_time_limit and rl_env_max_episode_steps is not None:
        env = gym.wrappers.TimeLimit(
            env, max_episode_steps=rl_env_max_episode_steps)
    return env
def _filter_keyword_group_quality(keyword_group_set): """Check if keywords for image uid group match quality requirement.""" keyword_group_set_tuple = tuple( np.asarray(x) for x in zip(*keyword_group_set)) valid_idx = [] for index, next_keyword in enumerate(keyword_group_set): uid, _, keyword, lemma = next_keyword keyword_idx = np.union1d( np.where(keyword_group_set_tuple[2] == keyword)[0], np.where(keyword_group_set_tuple[3] == lemma)[0]) keyword_idx = np.intersect1d( keyword_idx, np.where(keyword_group_set_tuple[0] == uid)[0]) num_unique_captions = len(set(keyword_group_set_tuple[1][keyword_idx])) if num_unique_captions >= mp.SHARED_ARGS[0]: if "debug" in FLAGS and FLAGS.debug: logging.log_every_n( logging.DEBUG, "Keeping image keyword '{}' which occurs in {} (>= {}) captions" .format(keyword, num_unique_captions, mp.SHARED_ARGS[0]), 1000) valid_idx.append(index) else: if "debug" in FLAGS and FLAGS.debug: logging.log_every_n( logging.DEBUG, "Throwing image keyword '{}' which occurs in {} (< {}) captions" .format(keyword, num_unique_captions, mp.SHARED_ARGS[0]), 1000) return tuple(x[valid_idx] for x in keyword_group_set_tuple)
def main(argv):
    """Converts the input examples to tf.Examples and writes them as a TFRecord.

    Examples for which the builder returns None are skipped. The number of
    written examples is passed to `_write_example_count` afterwards.

    Args:
      argv: Command-line arguments; anything beyond the program name is an error.

    Raises:
      app.UsageError: If extra positional arguments are given.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    for required_flag in ('input_file', 'input_format', 'output_tfrecord',
                          'label_map_file', 'vocab_file'):
        flags.mark_flag_as_required(required_flag)

    label_map = utils.read_label_map(FLAGS.label_map_file)
    phrase_vocabulary = (
        tagging_converter.get_phrase_vocabulary_from_label_map(label_map))
    converter = tagging_converter.TaggingConverter(phrase_vocabulary,
                                                   FLAGS.enable_swap_tag)
    builder = bert_example.BertExampleBuilder(label_map, FLAGS.vocab_file,
                                              FLAGS.max_seq_length,
                                              FLAGS.do_lower_case, converter)

    num_converted = 0
    examples = utils.yield_sources_and_targets(FLAGS.input_file,
                                               FLAGS.input_format)
    with tf.io.TFRecordWriter(FLAGS.output_tfrecord) as writer:
        for i, (sources, target) in enumerate(examples):
            logging.log_every_n(
                logging.INFO,
                f'{i} examples processed, {num_converted} converted to tf.Example.',
                10000)
            example = builder.build_bert_example(
                sources, target,
                FLAGS.output_arbitrary_targets_for_infeasible_examples)
            if example is not None:
                writer.write(example.to_tf_example().SerializeToString())
                num_converted += 1

    logging.info(f'Done. {num_converted} examples converted to tf.Example.')
    count_fname = _write_example_count(num_converted)
    logging.info(f'Wrote:\n{FLAGS.output_tfrecord}\n{count_fname}')
def convert(in_filename, out_filename):
    """Converts a recognized genomics file `in_filename` to `out_filename`.

    Args:
      in_filename: str; filename of a genomics data file to use as input.
      out_filename: str; filename of a genomics data file to use as output, or
        None, if no output should be written.

    Raises:
      ConversionError, if the conversion could not be executed.
    """
    reader_class, writer_class = _reader_writer_classes(
        in_filename, out_filename)

    # The reader is opened only through the context manager; a second reader
    # created outside of it would never be closed.
    with reader_class(in_filename) as reader:
        with writer_class(out_filename, header=reader.header) as writer:
            start = time.time()
            i = 0
            for record in reader:
                i += 1
                writer.write(record)
                logging.log_every_n(logging.INFO, "Progress: %d records",
                                    LOG_EVERY, i)
            elapsed = time.time() - start
            logging.info("Done, processed %d records in %0.2f seconds.", i,
                         elapsed)
def feature_model_sparsity_loss(self, lambda1, lambda2, avg1, avg2):
    """Returns the embedding-drift plus sparsity loss.

    loss = lambda1 * (word drift + feature drift) + lambda2 * z_sparse

    The drift terms are the squared distances between the current embeddings
    and their cached copies. `z` comes from `self.compute_z()`.

    Args:
      lambda1: Weight of both drift terms.
      lambda2: Weight of the sparsity term.
      avg1: If truthy the squared differences are averaged, otherwise summed.
      avg2: If truthy `z_sparse` is the sum of all entries of `z`; otherwise
        it is the mean over rows of the per-row sums (reduction over dim 1, so
        `z` must have at least two dimensions).

    Returns:
      Scalar loss tensor.
    """
    reduce_fn = torch.mean if avg1 else torch.sum
    word_l2_dist = reduce_fn(
        torch.pow(self.word_emb - self.word_emb_cache, 2))
    feat_l2_dist = reduce_fn(
        torch.pow(self.feature_emb - self.feature_emb_cache, 2))

    z = self.compute_z()
    z_sum = z.sum()
    z_gt_0_sum = torch.sign(z).sum()

    if avg2:
        z_sparse = z_sum
    else:
        # Reduce over `z` itself: `z_sum` is already a scalar and has no
        # dimension 1 to sum over.
        z_sparse = z.sum(dim=1).mean()

    loss = lambda1 * word_l2_dist + lambda1 * feat_l2_dist + lambda2 * z_sparse
    logging.log_every_n(
        logging.INFO,
        'loss %s | word %s | feat %s | z %s | z_sum %s | z > 0 %s', 100,
        loss.cpu().detach().numpy(),
        word_l2_dist.cpu().detach().numpy(),
        feat_l2_dist.cpu().detach().numpy(),
        z_sparse.cpu().detach().numpy(),
        z_sum.cpu().detach().numpy(),
        z_gt_0_sum.cpu().detach().numpy())
    return loss
def _get_token_answers(
    prediction,
    cell_classification_threshold,
):
    """Computes answer indexes.

    With span outputs present, a single answer for the best-scoring span is
    returned. Otherwise consecutive tokens whose probability exceeds the
    threshold are merged into runs, and each run becomes one answer scored by
    the geometric mean of its token probabilities.
    """
    span_indexes = prediction.get("span_indexes")
    span_logits = prediction.get("span_logits")

    if span_indexes is not None and span_logits is not None:
        best_logit, best_span = max(zip(span_logits, span_indexes.tolist()))
        logging.log_every_n(
            logging.INFO,
            "best_span: %s, score: %s",
            500,
            best_span,
            best_logit,
        )
        span_answer = _to_token_answer(
            prediction,
            best_span[0],
            best_span[1] + 1,
            best_logit,
        )
        return [span_answer]

    answers = []
    run_begin = None
    run_end = None  # Exclusive end of the run currently being built.
    run_probs = []

    for token_index, prob in get_cell_token_probs(prediction):
        if not prob > cell_classification_threshold:
            continue

        if run_end is not None and run_end < token_index:
            # There is a gap between the current answer and the new index.
            answers.append(
                _to_token_answer(
                    prediction,
                    run_begin,
                    run_end,
                    _geometric_mean(run_probs),
                ))
            run_begin = None
            run_end = None
            run_probs.clear()

        if run_begin is None:
            run_begin = token_index
        run_end = token_index + 1
        run_probs.append(prob)

    if run_begin is not None:
        # Flush the run that was still open when the tokens ran out.
        answers.append(
            _to_token_answer(
                prediction,
                begin_token_index=run_begin,
                end_token_index=run_end,
                score=_geometric_mean(run_probs),
            ))
    return answers
def build_pregrants():
    """Builds last-name features for every pregrant inventor record.

    Reads `rawinventor` from the `pregrant_publications` database, using the
    credentials in `$HOME/.mylogin.cnf`.

    Returns:
      defaultdict(list) mapping each mention's `record_id` to the list of
      `last_name(...)` features of the mentions with that id.
    """
    cnx = mysql.connector.connect(option_files=os.path.join(
        os.environ['HOME'], '.mylogin.cnf'),
                                  database='pregrant_publications')
    feature_map = collections.defaultdict(list)
    try:
        cursor = cnx.cursor()
        query = "SELECT id, document_number, name_first, name_last FROM rawinventor;"
        cursor.execute(query)
        rows = enumerate(cursor, start=1)
        for idx, (uuid, document_number, name_first, name_last) in rows:
            # NULL names are normalized to empty strings.
            im = InventorMention(uuid, None, '', name_first or '',
                                 name_last or '', '', '', '',
                                 document_number=document_number)
            feature_map[im.record_id].append(last_name(im))
            logging.log_every_n(logging.INFO,
                                'Processed %s pregrant records - %s features',
                                10000, idx, len(feature_map))
    finally:
        # This function opened the connection, so it releases it as well.
        cnx.close()
    return feature_map
def convert(self, rng, interaction, random_table):
    """Creates TF example from interaction.

    Args:
      rng: Random source; `rng.random()` decides whether the random table is
        used and the object is also handed to the helper methods.
      interaction: Interaction holding the question and, optionally, a table.
      random_table: Alternative table, or None. When given it replaces the
        interaction's own table if `rng.random() < 0.5`.

    Returns:
      The example built by `self._to_example`, or None when there are no
      question tokens for a table example or the table sizes cannot be
      determined (`_get_table_sizes` raised ValueError).
    """
    question_tokens = self._get_question_tokens(interaction, rng)

    use_random_table = random_table is not None and rng.random() < 0.5
    if use_random_table:
        is_random_table, table = True, random_table
    elif interaction.HasField('table'):
        is_random_table, table = False, interaction.table
    else:
        is_random_table, table = False, None

    if table is not None:
        if not question_tokens:
            return None
        if random_table is not None:
            logging.log_every_n(
                logging.INFO,
                'Table: %s Random Table: %s is_random_table: %s', 500000,
                interaction.table.table_id, random_table.table_id,
                is_random_table)
        token_budget = self._get_token_budget(question_tokens)
        tokenized_table = self._tokenize_table(table)
        try:
            num_columns, num_rows, num_tokens = self._get_table_sizes(
                token_budget, tokenized_table, rng)
        except ValueError:
            return None
        serialized_example = self._serialize(question_tokens, tokenized_table,
                                             num_columns, num_rows, num_tokens)
        tokens = serialized_example.tokens
        segment_ids = serialized_example.segment_ids
        row_ids = serialized_example.row_ids
        column_ids = serialized_example.column_ids
    else:
        # Text-only example: re-tokenize the first question's original text
        # and truncate it to fit the sequence length.
        question_tokens = self._tokenizer.tokenize(
            interaction.questions[0].original_text)
        question_tokens = question_tokens[:self._max_seq_length - 1]
        tokens, segment_ids, column_ids, row_ids = self._serialize_text(
            question_tokens)

    assert len(tokens) <= self._max_seq_length

    masked = self._create_masked_lm_predictions(interaction, tokens,
                                                column_ids, row_ids, rng)
    tokens, masked_lm_positions, masked_lm_labels = masked
    instance = TrainingInstance(tokens=tokens,
                                segment_ids=segment_ids,
                                column_ids=column_ids,
                                row_ids=row_ids,
                                masked_lm_positions=masked_lm_positions,
                                masked_lm_labels=masked_lm_labels,
                                is_random_table=is_random_table)
    return self._to_example(table, instance)
def evaluate():
    """Eval happens on GPU or CPU, and evals each checkpoint as it appears.

    Polls FLAGS.checkpoint_dir for new checkpoints, restores each one into the
    network and runs the evaluation function on it. Returns once a checkpoint
    at or beyond FLAGS.num_train_steps has been evaluated (with an exception
    for init checkpoints, see below) or immediately after the first evaluation
    when FLAGS.run_eval_once is set.
    """
    candidate_checkpoint = None
    smurf = build_network(batch_size=1)

    weights = {
        'census': FLAGS.weight_census,
        'smooth1': FLAGS.weight_smooth1,
        'smooth2': FLAGS.weight_smooth2,
    }
    evaluate_fn, _ = smurf_data.make_eval_function(FLAGS.eval_on,
                                                   FLAGS.height,
                                                   FLAGS.width,
                                                   progress_bar=True,
                                                   plot_dir=FLAGS.plot_dir,
                                                   num_plots=50,
                                                   weights=weights)

    latest_checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    while 1:
        # Wait for a new checkpoint
        # (On the first pass candidate_checkpoint is None, so the wait is
        # skipped whenever a checkpoint already exists.)
        while candidate_checkpoint == latest_checkpoint:
            logging.log_every_n(
                logging.INFO,
                'Waiting for a new checkpoint, at %s, latest is %s', 3,
                FLAGS.checkpoint_dir, latest_checkpoint)
            time.sleep(45.)
            candidate_checkpoint = tf.train.latest_checkpoint(
                FLAGS.checkpoint_dir)
        # Re-read the newest checkpoint and remember it as the one evaluated.
        candidate_checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
        latest_checkpoint = candidate_checkpoint
        logging.info('New checkpoint found: %s', candidate_checkpoint)
        # This forces the checkpoint manager to reexamine the checkpoint directory
        # and become aware of the new checkpoint.
        smurf.update_checkpoint_dir(FLAGS.checkpoint_dir)
        smurf.restore()
        step = tf.compat.v1.train.get_global_step().numpy()

        terminate = False
        if step >= FLAGS.num_train_steps:
            # If initializing from another checkpoint directory, the first checkpoint
            # will be the init checkpoint and might have steps > num_train_steps.
            # Don't quit in this case.
            terminate = True
            if FLAGS.init_checkpoint_dir:
                with gfile.Open(
                        os.path.join(FLAGS.checkpoint_dir, 'checkpoint'),
                        'r') as f:
                    # NOTE(review): a two-line 'checkpoint' file presumably
                    # means only one checkpoint has been recorded so far --
                    # confirm against the checkpoint index format.
                    if len(f.readlines()) == 2:
                        logging.info('Continuing evaluation after evaluating '
                                     'init_checkpoint.')
                        terminate = False

        eval_results = evaluate_fn(smurf)
        smurf_plotting.print_eval(eval_results)

        if terminate or FLAGS.run_eval_once:
            return
def warning_once(msg, *args):
    """Generate warning message once.

    Implemented as a rate-limited log with a period so large that in practice
    only the first call is emitted.

    NOTE(review): absl's `log_every_n` keys its counter on the call site, which
    is the line below for every caller -- so presumably only the very first
    warning is emitted regardless of `msg`. Confirm this is intended.

    Args:
      msg: str, the message to be logged.
      *args: The args to be substituted into the msg.
    """
    effectively_never_again = 1 << 62
    logging.log_every_n(logging.WARNING, msg, effectively_never_again, *args)
def added_token_counts(data_iterator,
                       try_swapping,
                       tokenizer,
                       max_input_examples=10000,
                       max_recursion_depth=10000):
    """Computes how many times different phrases have to be added.

    Args:
      data_iterator: Iterator to yield source lists and targets. See function
        yield_sources_and_targets in utils.py for the available iterators. The
        strings in the source list will be concatenated, possibly after
        swapping their order if swapping is enabled.
      try_swapping: Whether to try if swapping sources results in less added
        text.
      tokenizer: Text tokenizer (derived from tokenization.FullTokenizer).
      max_input_examples: Maximum number of examples to be read from the
        iterator. Skipped (too long) examples do not count towards it.
      max_recursion_depth: Maximum recursion depth for LCS. If a long example
        surpasses this recursion depth, the given example is skipped and a
        warning is logged.

    Returns:
      Tuple (collections.Counter for phrases, added phrases for each example).
    """
    phrase_counter = collections.Counter()
    all_added_phrases = []
    num_examples = 0

    for sources, target in data_iterator:
        if num_examples >= max_input_examples:
            break
        logging.log_every_n(logging.INFO,
                            f'{num_examples} examples processed.', 1000)

        joined_sources = ' '.join(sources)
        source_tokens = [
            token.lower() for token in tokenizer.tokenize(joined_sources)
        ]
        target_tokens = [token.lower() for token in tokenizer.tokenize(target)]

        with _recursion_limit(max_recursion_depth):
            try:
                added_phrases = _get_added_phrases(source_tokens,
                                                   target_tokens)
                if try_swapping and len(sources) == 2:
                    reversed_sources = ' '.join(sources[::-1])
                    source_tokens_swap = [
                        token.lower()
                        for token in tokenizer.tokenize(reversed_sources)
                    ]
                    added_phrases_swap = _get_added_phrases(
                        source_tokens_swap, target_tokens)
                    # If we can align more and have to add less after
                    # swapping, we assume that the sources would be swapped
                    # during conversion.
                    swapped_len = len(''.join(added_phrases_swap))
                    if swapped_len < len(''.join(added_phrases)):
                        added_phrases = added_phrases_swap
            except RecursionError:
                logging.log_first_n(
                    logging.WARNING,
                    'Skipping a too long source. Consider increasing '
                    '`max_recursion_depth` argument of the `added_token_counts` '
                    'function in phrase_vocabulary_optimization_utils.py to keep this '
                    f'source: {" ".join(source_tokens)}', 100)
                continue

        for phrase in added_phrases:
            phrase_counter[phrase] += 1
        all_added_phrases.append(added_phrases)
        num_examples += 1

    logging.info('%d examples processed.\n', num_examples)
    return phrase_counter, all_added_phrases
def main(argv):
    """Demonstrates file logging and rate-limited logging.

    Redirects logging to FLAGS.log_file, reports the log path, the Python
    version and the verbosity, then issues 50 rate-limited log calls.
    """
    handler = logging.get_absl_handler()
    handler.start_logging_to_file(FLAGS.log_file)
    print("logging path: ", logging.get_log_file_name())

    version_banner = 'Running under Python {0[0]}.{0[1]}.{0[2]}'.format(
        sys.version_info)
    logging.info(version_banner)
    logging.info("logging level: %d" % logging.get_verbosity())

    for _ in range(50):
        logging.log_every_n(logging.INFO, "log_every_10", 10)
def encode(all_sections, model_path=None, chunk=0, chunk_size=2500, model=None):
    """Embeds the sentences of one chunk of sections.

    The keys of `all_sections` are sorted and the slice
    [chunk * chunk_size, (chunk + 1) * chunk_size) is processed.

    Args:
      all_sections: Mapping from key to section data, as understood by
        `load_sents`.
      model_path: Path of the sent2vec model; only used when `model` is None.
      chunk: Zero-based index of the chunk to process.
      chunk_size: Number of keys per chunk.
      model: Optional already-loaded model providing `get_emb_size` and
        `embed_sentence`.

    Returns:
      Tuple (chunk_vecs, chunk_meta): one array of sentence vectors per key,
      and the flat list of sentence records returned by `load_sents`.
    """
    logging.info('loading model...')
    if model is None:
        model = sent2vec.Sent2vecModel()
        try:
            model.load_model(model_path)
        except Exception:
            # Kept non-fatal as before, but the failure is no longer followed
            # by a misleading success message.
            logging.exception('failed to load model from %s', model_path)
        else:
            logging.info('model successfully loaded')

    stop_words = set(stopwords.words('english'))
    chunk_meta = []
    chunk_vecs = []

    # sorted() returns a new list; its result must be kept so that chunk
    # boundaries are taken over ordered keys.
    sorted_keys = sorted(all_sections.keys())
    chunk_keys = sorted_keys[(chunk * chunk_size):((chunk + 1) * chunk_size)]
    logging.info('Running on keys %s...', str(chunk_keys[0:5]))

    def preprocess_sentence(text):
        """Lower-cases, pads separators and drops punctuation and stop words."""
        text = text.replace('/', ' / ')
        text = text.replace('.-', ' .- ')
        text = text.replace('.', ' . ')
        text = text.replace('\'', ' \' ')
        text = text.lower()
        tokens = [
            token for token in word_tokenize(text)
            if token not in punctuation and token not in stop_words
        ]
        return ' '.join(tokens)

    for k_idx, k in enumerate(chunk_keys):
        s_doc = time.time()
        logging.info('key %s (%s of %s) ', k, k_idx, len(chunk_keys))
        sentences = load_sents(all_sections, k)
        dim = model.get_emb_size()
        vectors = np.zeros((len(sentences), dim))
        t = time.time()
        for counter, (_, _, sentence_id, s) in enumerate(sentences):
            vectors[counter] = model.embed_sentence(preprocess_sentence(s))
            logging.log_every_n(logging.INFO,
                                'Processed %s sentences | %s seconds', 10,
                                sentence_id, str(time.time() - t))
        e_t = time.time()
        logging.info('Done! Processed %s Sentences | %s seconds',
                     len(sentences), str(e_t - t))
        chunk_meta.extend(sentences)
        chunk_vecs.append(vectors)
        e_doc = time.time()
        logging.info('key %s (%s of %s)... %s seconds ', k, k_idx,
                     len(chunk_keys), e_doc - s_doc)
    return chunk_vecs, chunk_meta
def child_parent_norm_loss(self, pairs):
    """Returns the summed hinge penalty relu(parent - child + gamma).

    Norms come from `poincare_norm(self.internals)`; column 0 of `pairs`
    indexes the child and column 1 the parent. The indices and values of the
    smallest and largest norms are logged periodically.
    """
    internal_norms = poincare_norm(self.internals)

    smallest = tf.argmin(internal_norms)
    min_norm = smallest.numpy()[0]
    logging.log_every_n(logging.INFO, 'min_norm %s %s', 500, min_norm,
                        internal_norms[min_norm])

    largest = tf.argmax(internal_norms)
    max_norm = largest.numpy()[0]
    logging.log_every_n(logging.INFO, 'max_norm %s %s', 500, max_norm,
                        internal_norms[max_norm])

    child_norms = tf.gather(internal_norms, pairs[:, 0])
    parent_norms = tf.gather(internal_norms, pairs[:, 1])
    hinge = tf.nn.relu(parent_norms - child_norms + self.gamma)
    return tf.reduce_sum(hinge)
def load_location_mentions(filename, st=0, N=np.inf, skip_first_line=True):
    """Yields a LocationMention for each selected line of `filename`.

    Args:
      filename: Path of the text file to read.
      st: Lines with index below `st` are read but not yielded.
      N: Reading stops at the first line whose index exceeds `N`.
      skip_first_line: If True, the line at index 0 is skipped.

    Yields:
      `LocationMention.from_line(line)` for every line with
      st <= index <= N (excluding the skipped first line).
    """
    # The default uses `np.inf`: the `np.Inf` alias was removed in NumPy 2.0
    # and would make this definition fail at import time.
    logging.info('Loading location mentions from %s', filename)
    with open(filename, 'r') as fin:
        for idx, line in enumerate(fin):
            if idx == 0 and skip_first_line:
                continue
            logging.log_every_n(logging.INFO, 'Loaded %s lines of %s', 1000,
                                idx, filename)
            if idx > N:
                logging.info('Loaded %s lines of %s', idx, filename)
                return
            elif idx >= st:
                yield LocationMention.from_line(line)
def _wait_for_lock_to_disappear(handle, lock_file, lock_file_timeout_sec):
    """Waits for the lock file to disappear.

    The lock file was created by another process that is performing a download
    into its own temporary directory. The name of this temp directory is
    sha1(<module>).<uuid>.tmp where <uuid> comes from the lock file.

    The function returns when the lock file no longer exists, or after it has
    deleted a lock file whose holder showed no activity for a full timeout
    period.

    Args:
      handle: The location from where a module is being download.
      lock_file: Lock file created by another process downloading this module.
      lock_file_timeout_sec: The amount of time to wait (in seconds) before we
        can declare that the other downloaded has been abandoned. The download
        is declared abandoned if there is no file size change in the temporary
        directory within the last 'lock_file_timeout_sec'.
    """
    # Snapshot of the lock holder's progress taken at the last check; the
    # download counts as abandoned when a later check sees the same values.
    locked_tmp_dir_size = 0
    locked_tmp_dir_size_check_time = time.time()
    lock_file_content = None
    while tf.compat.v1.gfile.Exists(lock_file):
        try:
            logging.log_every_n(
                logging.INFO,
                "Module '%s' already being downloaded by '%s'. Waiting.", 10,
                handle, tf_utils.read_file_to_string(lock_file))
            if (time.time() - locked_tmp_dir_size_check_time >
                    lock_file_timeout_sec):
                # Check whether the holder of the current lock downloaded anything
                # in its temporary directory in the last 'lock_file_timeout_sec'.
                cur_locked_tmp_dir_size = _locked_tmp_dir_size(lock_file)
                cur_lock_file_content = tf_utils.read_file_to_string(lock_file)
                if (cur_locked_tmp_dir_size == locked_tmp_dir_size and
                        cur_lock_file_content == lock_file_content):
                    # There was no data downloaded in the past
                    # 'lock_file_timeout_sec'. Steal the lock and proceed with the
                    # local download.
                    logging.warning("Deleting lock file %s due to inactivity.",
                                    lock_file)
                    tf.compat.v1.gfile.Remove(lock_file)
                    break
                # Progress was made (or the lock changed hands): record the new
                # snapshot and restart the timeout window.
                locked_tmp_dir_size = cur_locked_tmp_dir_size
                locked_tmp_dir_size_check_time = time.time()
                lock_file_content = cur_lock_file_content
        except tf.errors.NotFoundError:
            # Lock file or temp directory were deleted during check. Continue
            # to check whether download succeeded or we need to start our own
            # download.
            pass
        finally:
            # Runs on every iteration, including the one that breaks out after
            # deleting the lock file.
            time.sleep(5)
def _added_token_counts(data_iterator, try_swapping, max_input_examples=10000): """Computes how many times different phrases have to be added. 计算需要添加多少个不同的短语 Args: data_iterator: Iterator to yield source lists and targets. See function yield_sources_and_targets in utils.py for the available iterators. The strings in the source list will be concatenated, possibly after swapping their order if swapping is enabled. try_swapping: Whether to try if swapping sources results in less added text. max_input_examples: Maximum number of examples to be read from the iterator. Returns: Tuple (collections.Counter for phrases, added phrases for each example). """ phrase_counter = collections.Counter() num_examples = 0 all_added_phrases = [] max_seq_length = 0 for sources, target in data_iterator: # sources 可能是多句话,后面用空格拼接起来 if num_examples >= max_input_examples: break # source_merge = ' '.join(sources) source_merge = sources #print("phrase_vocabulary_optimization.py source_merge",source_merge) if len(source_merge) > max_seq_length: print( curLine(), "max_seq_length=%d, len(source_merge)=%d,source_merge:%s" % (max_seq_length, len(source_merge), source_merge)) max_seq_length = len(source_merge) logging.log_every_n(logging.INFO, f'{num_examples} examples processed.', 10000) added_phrases = _get_added_phrases(source_merge, target) #print("added_phrases",added_phrases) if try_swapping and len(sources) == 2: added_phrases_swap = _get_added_phrases(' '.join(sources[::-1]), target) # If we can align more and have to add less after swapping, we assume that # the sources would be swapped during conversion. if len(''.join(added_phrases_swap)) < len(''.join(added_phrases)): added_phrases = added_phrases_swap for phrase in added_phrases: phrase_counter[phrase] += 1 all_added_phrases.append(added_phrases) num_examples += 1 logging.info(f'{num_examples} examples processed.\n') return phrase_counter, all_added_phrases, max_seq_length
def main(argv):
    """Converts the input file to training tf.Examples and saves the label map.

    Examples whose last source is longer than FLAGS.max_seq_length are ignored.
    After writing, the converted-example count is recorded for both the train
    and the dev output paths, and the builder's label map is dumped as JSON to
    FLAGS.label_map_file.

    Args:
      argv: Command-line arguments; anything beyond the program name is an error.

    Raises:
      app.UsageError: If extra positional arguments are given.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    for required_flag in ('input_file', 'input_format',
                          'output_tfrecord_train', 'output_tfrecord_dev',
                          'vocab_file'):
        flags.mark_flag_as_required(required_flag)

    builder = bert_example.BertExampleBuilder({}, FLAGS.vocab_file,
                                              FLAGS.max_seq_length,
                                              FLAGS.do_lower_case)

    num_converted = 0
    num_ignored = 0
    with tf.python_io.TFRecordWriter(
            FLAGS.output_tfrecord_train) as writer_train:
        for input_file in [FLAGS.input_file]:
            print(curLine(), "input_file:", input_file)
            examples = utils.yield_sources_and_targets(input_file,
                                                       FLAGS.input_format)
            for i, (sources, target) in enumerate(examples):
                logging.log_every_n(
                    logging.INFO,
                    f'{i} examples processed, {num_converted} converted to tf.Example.',
                    10000)
                question_length = len(sources[-1])
                if question_length > FLAGS.max_seq_length:
                    # TODO: ignore samples whose question is too long.
                    num_ignored += 1
                    print(
                        curLine(),
                        "ignore num_ignored=%d, question length=%d" %
                        (num_ignored, question_length))
                    continue
                example1, _ = builder.build_bert_example(sources, target)
                example = example1.to_tf_example().SerializeToString()
                writer_train.write(example)
                num_converted += 1

    logging.info(
        f'Done. {num_converted} examples converted to tf.Example, num_ignored {num_ignored} examples.'
    )

    for output_file in [
            FLAGS.output_tfrecord_train, FLAGS.output_tfrecord_dev
    ]:
        count_fname = _write_example_count(num_converted,
                                           output_file=output_file)
        logging.info(f'Wrote:\n{output_file}\n{count_fname}')

    with open(FLAGS.label_map_file, "w") as f:
        json.dump(builder._label_map, f, ensure_ascii=False, indent=4)
    print(curLine(),
          "save %d to %s" % (len(builder._label_map), FLAGS.label_map_file))
def write_variants_to_vcf(variant_generator, output_vcf_path, header):
    """Writes Variant protos to a VCF file.

    Args:
      variant_generator: generator. A generator that yields sorted Variant
        protos.
      output_vcf_path: str. Output file in VCF format.
      header: VcfHeader proto. The VCF header to use for writing the variants.
    """
    logging.info('Writing output to VCF file: %s', output_vcf_path)
    vcf_writer = vcf.VcfWriter(output_vcf_path,
                               header=header,
                               round_qualities=True)
    with vcf_writer as writer:
        # Counting from 1 so the progress message reports how many variants
        # have been handled including the current one.
        for num_variants, variant in enumerate(variant_generator, start=1):
            logging.log_every_n(logging.INFO, '%s variants written.',
                                _LOG_EVERY_N, num_variants)
            writer.write(variant)
def build_granted(config):
    """Groups granted assignee mentions by their first name feature.

    Columns read from `rawassignee`:
    | uuid | patent_id | assignee_id | rawlocation_id | type | name_first |
    | name_last | organization | sequence |

    Returns:
      defaultdict(list) mapping `name_features()[0]` to the AssigneeMention
      objects that share it.
    """
    cursor = pvdb.granted_table(config).cursor()
    cursor.execute(
        "SELECT uuid , patent_id , assignee_id , rawlocation_id , type , name_first , name_last , organization , sequence FROM rawassignee;"
    )

    feature_map = collections.defaultdict(list)
    for num_records, record in enumerate(cursor, start=1):
        mention = AssigneeMention.from_granted_sql_record(record)
        primary_feature = mention.name_features()[0]
        feature_map[primary_feature].append(mention)
        logging.log_every_n(logging.INFO,
                            'Processed %s granted records - %s features',
                            10000, num_records, len(feature_map))
    return feature_map
def build_pregrants(config):
    """Groups pregrant assignee mentions by their first name feature.

    Table columns of `rawassignee`:
    | id | document_number | sequence | name_first | name_last | organization |
    | type | rawlocation_id | city | state | country | filename |
    | created_date | updated_date |
    The query selects `sequence -1 as sequence`, i.e. the sequence shifted
    down by one.

    Returns:
      defaultdict(list) mapping `name_features()[0]` to the AssigneeMention
      objects that share it.
    """
    cursor = pvdb.pregranted_table(config).cursor()
    cursor.execute(
        "SELECT id, document_number, sequence -1 as sequence, name_first, name_last, organization, type, rawlocation_id, city, state, country FROM rawassignee"
    )

    feature_map = collections.defaultdict(list)
    for num_records, record in enumerate(cursor, start=1):
        mention = AssigneeMention.from_application_sql_record(record)
        primary_feature = mention.name_features()[0]
        feature_map[primary_feature].append(mention)
        logging.log_every_n(logging.INFO,
                            'Processed %s pregrant records - %s features',
                            10000, num_records, len(feature_map))
    return feature_map
def _consolidate_numeric_values( row_index_to_values, min_consolidation_fraction, debug_info): """Finds the most common numeric values in a column and returns them. Args: row_index_to_values: For each row index all the values in that cell. min_consolidation_fraction: Fraction of cells that need to have consolidated value. debug_info: Additional information only used for logging. Returns: For each row index the first value that matches the most common value. Rows that don't have a matching value are dropped. Empty list if values can't be consolidated. """ type_counts = collections.Counter() for numeric_values in row_index_to_values.values(): type_counts.update(_get_all_types(numeric_values)) if not type_counts: return {} max_count = max(type_counts.values()) if max_count < len(row_index_to_values) * min_consolidation_fraction: logging.log_every_n(logging.INFO, 'Can\'t consolidate types: %s %s %d', 100, debug_info, row_index_to_values, max_count) return {} valid_types = set() for value_type, count in type_counts.items(): if count == max_count: valid_types.add(value_type) if len(valid_types) > 1: assert constants.DATE_TYPE in valid_types max_type = constants.DATE_TYPE else: max_type = next(iter(valid_types)) new_row_index_to_value = {} for index, values in row_index_to_values.items(): # Extract the first matching value. for value in values: if _get_value_type(value) == max_type: new_row_index_to_value[index] = value break return new_row_index_to_value
def build_granted(config):
    """Builds last-name features for incrementally granted inventor records.

    Returns:
      defaultdict(list) mapping each mention's `record_id` to the list of
      `last_name(...)` features; empty when
      `pvdb.incremental_granted_table(config)` returns None.
    """
    feature_map = collections.defaultdict(list)
    cnx = pvdb.incremental_granted_table(config)
    if cnx is None:
        return feature_map

    cursor = cnx.cursor()
    cursor.execute(
        "SELECT uuid, patent_id, name_first, name_last FROM rawinventor;")

    num_records = 0
    for num_records, row in enumerate(cursor, start=1):
        uuid, patent_id, name_first, name_last = row
        # NULL names are normalized to empty strings.
        mention = InventorMention(uuid, patent_id, '', name_first or '',
                                  name_last or '', '', '', '')
        feature_map[mention.record_id].append(last_name(mention))
        logging.log_every_n(logging.INFO,
                            'Processed %s granted records - %s features',
                            10000, num_records, len(feature_map))

    # Final, unconditional summary of the totals.
    logging.log(logging.INFO, 'Processed %s granted records - %s features',
                num_records, len(feature_map))
    return feature_map
def build_granted():
    """Builds a map from granted patent id to patent title.

    Reads `patent` from the `patent_20200630` database, using the credentials
    in `$HOME/.mylogin.cnf`.

    Returns:
      dict mapping the stringified patent id to its title.
    """
    cnx = mysql.connector.connect(option_files=os.path.join(
        os.environ['HOME'], '.mylogin.cnf'),
                                  database='patent_20200630')
    feature_map = dict()
    try:
        cursor = cnx.cursor()
        query = "SELECT id,title FROM patent;"
        cursor.execute(query)
        for idx, rec in enumerate(cursor, start=1):
            record_id = '%s' % rec[0]
            feature_map[record_id] = rec[1]
            logging.log_every_n(logging.INFO,
                                'Processed %s grant records - %s features',
                                10000, idx, len(feature_map))
    finally:
        # This function opened the connection, so it releases it as well.
        cnx.close()
    return feature_map
def build_pregrants():
    """Builds a map from pregrant document number to invention title.

    Reads `application` from the `pregrant_publications` database, using the
    credentials in `$HOME/.mylogin.cnf`.

    Returns:
      dict mapping 'pg-<document_number>' to the invention title.
    """
    cnx = mysql.connector.connect(option_files=os.path.join(
        os.environ['HOME'], '.mylogin.cnf'),
                                  database='pregrant_publications')
    feature_map = dict()
    try:
        cursor = cnx.cursor()
        query = "select document_number,invention_title from application;"
        cursor.execute(query)
        for idx, rec in enumerate(cursor, start=1):
            record_id = 'pg-%s' % rec[0]
            feature_map[record_id] = rec[1]
            logging.log_every_n(logging.INFO,
                                'Processed %s pregrant records - %s features',
                                10000, idx, len(feature_map))
    finally:
        # This function opened the connection, so it releases it as well.
        cnx.close()
    return feature_map
def main(argv):
    """Converts examples to variants and writes them to FLAGS.output_vcf.

    The sample name comes from FLAGS.sample_name when set; otherwise it is
    obtained by peeking at the variant stream. Every written variant's first
    call is stamped with that sample name.
    """
    del argv

    contigs = fasta.RefFastaReader(FLAGS.ref).header.contigs
    # A negative FLAGS.max_records means "no limit".
    max_records = None if FLAGS.max_records < 0 else FLAGS.max_records
    variants_iter = examples_to_variants(FLAGS.examples,
                                         max_records=max_records)

    if FLAGS.sample_name:
        sample_name = FLAGS.sample_name
    else:
        sample_name, variants_iter = peek_sample_name(variants_iter)

    header = dv_vcf_constants.deepvariant_header(contigs=contigs,
                                                 sample_names=[sample_name])
    with vcf.VcfWriter(FLAGS.output_vcf, header=header) as writer:
        for variant in variants_iter:
            variant.calls[0].call_set_name = sample_name
            logging.log_every_n(logging.INFO, 'Converted %s',
                                FLAGS.log_every,
                                variant_utils.variant_key(variant))
            writer.write(variant)