Example 1
def run_actor(
    agent: Agent,
    rng_key: jnp.ndarray,
    get_params: Callable[[], hk.Params],
    enqueue_traj: Callable[[Transition], None],
    unroll_len: int,
    num_trajectories: int,
):
  """Runs an actor to produce num_trajectories trajectories."""
  env = catch.Catch()
  state = env.reset()
  traj = []

  for i in range(num_trajectories):
    params = get_params()
    # The first rollout is one step longer.
    for _ in range(unroll_len + int(i == 0)):
      rng_key, step_key = jax.random.split(rng_key)
      state = preprocess_step(state)
      action, logits = agent.step(params, step_key, state)
      transition = Transition(state, action, logits)
      traj.append(transition)
      state = env.step(action)
      if state.step_type == dm_env.StepType.LAST:
        logging.log_every_n(logging.INFO, 'Episode ended with reward: %s', 5,
                            state.reward)

    # Stack and send the trajectory.
    stacked_traj = jax.tree_multimap(lambda *ts: np.stack(ts), *traj)
    enqueue_traj(stacked_traj)
    # Reset the trajectory, keeping the last timestep.
    traj = traj[-1:]
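Note on the stacking line above: recent JAX releases removed jax.tree_multimap, so under that assumption the same leaf-wise stacking can be written with jax.tree_util.tree_map, which accepts multiple pytrees. A minimal, self-contained sketch:

import collections

import jax
import numpy as np

Transition = collections.namedtuple('Transition', ['state', 'action', 'logits'])
traj = [Transition(np.zeros(4), np.int32(1), np.zeros(3)) for _ in range(5)]

# Stacks each field across the list of transitions, leaf by leaf.
stacked = jax.tree_util.tree_map(lambda *ts: np.stack(ts), *traj)
print(stacked.state.shape)  # (5, 4)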
Example 2
def _add_tables(input_dir, interaction_dict):
    """Adds table protos to all interactions."""
    table_files = set()
    for interactions in interaction_dict.values():
        for interaction in interactions:
            table_files.add(interaction.table.table_id)

    table_dict = {}
    for index, table_file in enumerate(sorted(table_files)):
        logging.log_every_n(logging.INFO, 'Read %4d / %4d table files', 100,
                            index, len(table_files))
        table_path = os.path.join(input_dir, table_file)
        with open(table_path, "r") as table_handle:
            table = Table()
            rows = list(csv.reader(table_handle))
            headers, rows = rows[0], rows[1:]

            for header in headers:
                table.columns.add().text = header

            for row in rows:
                new_row = table.rows.add()
                for cell in row:
                    new_row.cells.add().text = cell

            table.table_id = table_file
            table_dict[table_file] = table

    for interactions in interaction_dict.values():
        for interaction in interactions:
            interaction.table.CopyFrom(table_dict[interaction.table.table_id])
Example 3
def main(argv):
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')
  flags.mark_flag_as_required('input_file')
  flags.mark_flag_as_required('input_format')
  flags.mark_flag_as_required('output_file')
  flags.mark_flag_as_required('label_map_file')
  flags.mark_flag_as_required('vocab_file')
  flags.mark_flag_as_required('saved_model')

  label_map = utils.read_label_map(FLAGS.label_map_file)
  converter = tagging_converter.TaggingConverter(
      tagging_converter.get_phrase_vocabulary_from_label_map(label_map),
      FLAGS.enable_swap_tag)
  builder = bert_example.BertExampleBuilder(label_map, FLAGS.vocab_file,
                                            FLAGS.max_seq_length,
                                            FLAGS.do_lower_case, converter)
  predictor = predict_utils.LaserTaggerPredictor(
      tf.contrib.predictor.from_saved_model(FLAGS.saved_model), builder,
      label_map)

  num_predicted = 0
  with tf.gfile.Open(FLAGS.output_file, 'w') as writer:
    for i, (sources, target) in enumerate(utils.yield_sources_and_targets(
        FLAGS.input_file, FLAGS.input_format)):
      logging.log_every_n(
          logging.INFO,
          f'{i} examples processed, {num_predicted} converted to tf.Example.',
          100)
      prediction = predictor.predict(sources)
      writer.write(f'{" ".join(sources)}\t{prediction}\t{target}\n')
      num_predicted += 1
  logging.info(f'{num_predicted} predictions saved to:\n{FLAGS.output_file}')
Example 4
def get_answer_indexes(
    prediction,
    cell_classification_threshold,
):
    """Computes answer indexes."""
    input_ids = prediction["input_ids"]

    span_indexes = prediction.get("span_indexes")
    span_logits = prediction.get("span_logits")
    if span_indexes is not None and span_logits is not None:
        best_logit, best_span = max(zip(span_logits, span_indexes.tolist()))
        logging.log_every_n(
            logging.INFO,
            "best_span: %s, score: %s",
            500,
            best_span,
            best_logit,
        )
        return [input_ids[i] for i in range(best_span[0], best_span[1] + 1)]

    answers = []
    for i, prob in get_cell_token_probs(prediction):
        if prob > cell_classification_threshold:
            answers.append(input_ids[i])
    return answers
Example 5
    async def log_kafka_sample_async(self, topic: str, sample: dict):
        """logs sample to Kafka topic asynchronously
        Sample for monitoring purpose:
        Supports logging samples to Kafka via REST API (Confluent)

        Column guidelines:
        time: epoch in seconds
        sample_rate: pre-sampled record shall set this to sample rate, e.g., 100 means one sample is logged out of 100
        column type shall be log int, str, or vector of str
        """
        if self.cluster_config.MONITORING.KAFKA_REST_ADDRESS == "":
            return
        url = "http://{}/topics/{}".format(
            self.cluster_config.MONITORING.KAFKA_REST_ADDRESS, topic)
        try:
            record_data = json.dumps({"records": [{"value": sample}]})
            headers = {
                "Content-Type": "application/vnd.kafka.json.v2+json",
                "Accept": "application/vnd.kafka.v2+json",
            }
            session = aiohttp.ClientSession()
            response = await session.post(url,
                                          data=record_data,
                                          headers=headers)
            if response.status != 200:
                raise Exception("non-OK response status code: {}".format(
                    response.status))
        except Exception as ex:
            GLOG.log_every_n(GLOG.ERROR, "Failed to log sample to Kafka: %s",
                             100, ex)
        finally:
            await session.close()
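A hedged usage sketch for the method above (the monitor object and the sample fields are illustrative, not from the source): callers pass a flat dict, and the method itself wraps it in Confluent's REST envelope {"records": [{"value": ...}]}.

import asyncio
import time

async def report_metrics(monitor):
    # `monitor` stands for any object exposing log_kafka_sample_async above.
    sample = {
        "time": int(time.time()),  # epoch seconds, per the docstring
        "sample_rate": 1,          # not pre-sampled
        "loss": "0.42",
    }
    await monitor.log_kafka_sample_async("training_metrics", sample)

# asyncio.run(report_metrics(monitor))  # wiring up `monitor` is left to the caller

As a design note, `async with aiohttp.ClientSession() as session:` would close the session (and release the response) more idiomatically than the manual close in the finally block.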
Example 6
def gym_env_wrapper(env, rl_env_max_episode_steps, maxskip_env, rendered_env,
                    rendered_env_resize_to, sticky_actions, output_dtype,
                    num_actions):
    """Wraps a gym environment. see make_gym_env for details."""
    # rl_env_max_episode_steps is None or int.
    assert ((not rl_env_max_episode_steps)
            or isinstance(rl_env_max_episode_steps, int))

    wrap_with_time_limit = ((not rl_env_max_episode_steps)
                            or rl_env_max_episode_steps >= 0)

    if wrap_with_time_limit:
        env = remove_time_limit_wrapper(env)

    if num_actions is not None:
        logging.log_every_n(logging.INFO, "Number of discretized actions: %d",
                            1, num_actions)
        env = ActionDiscretizeWrapper(env, num_actions=num_actions)

    if sticky_actions:
        env = StickyActionEnv(env)

    if maxskip_env:
        env = MaxAndSkipEnv(env)  # pylint: disable=redefined-variable-type

    if rendered_env:
        env = RenderedEnv(env,
                          resize_to=rendered_env_resize_to,
                          output_dtype=output_dtype)

    if wrap_with_time_limit and rl_env_max_episode_steps is not None:
        env = gym.wrappers.TimeLimit(
            env, max_episode_steps=rl_env_max_episode_steps)
    return env
Example 7
def _filter_keyword_group_quality(keyword_group_set):
    """Check if keywords for image uid group match quality requirement."""
    keyword_group_set_tuple = tuple(
        np.asarray(x) for x in zip(*keyword_group_set))

    valid_idx = []
    for index, next_keyword in enumerate(keyword_group_set):
        uid, _, keyword, lemma = next_keyword

        keyword_idx = np.union1d(
            np.where(keyword_group_set_tuple[2] == keyword)[0],
            np.where(keyword_group_set_tuple[3] == lemma)[0])
        keyword_idx = np.intersect1d(
            keyword_idx,
            np.where(keyword_group_set_tuple[0] == uid)[0])

        num_unique_captions = len(set(keyword_group_set_tuple[1][keyword_idx]))

        if num_unique_captions >= mp.SHARED_ARGS[0]:
            if "debug" in FLAGS and FLAGS.debug:
                logging.log_every_n(
                    logging.DEBUG,
                    "Keeping image keyword '{}' which occurs in {} (>= {}) captions"
                    .format(keyword, num_unique_captions,
                            mp.SHARED_ARGS[0]), 1000)
            valid_idx.append(index)
        else:
            if "debug" in FLAGS and FLAGS.debug:
                logging.log_every_n(
                    logging.DEBUG,
                    "Throwing image keyword '{}' which occurs in {} (< {}) captions"
                    .format(keyword, num_unique_captions,
                            mp.SHARED_ARGS[0]), 1000)

    return tuple(x[valid_idx] for x in keyword_group_set_tuple)
Example 8
def main(argv):
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')
  flags.mark_flag_as_required('input_file')
  flags.mark_flag_as_required('input_format')
  flags.mark_flag_as_required('output_tfrecord')
  flags.mark_flag_as_required('label_map_file')
  flags.mark_flag_as_required('vocab_file')

  label_map = utils.read_label_map(FLAGS.label_map_file)
  converter = tagging_converter.TaggingConverter(
      tagging_converter.get_phrase_vocabulary_from_label_map(label_map),
      FLAGS.enable_swap_tag)
  builder = bert_example.BertExampleBuilder(label_map, FLAGS.vocab_file,
                                            FLAGS.max_seq_length,
                                            FLAGS.do_lower_case, converter)

  num_converted = 0
  with tf.io.TFRecordWriter(FLAGS.output_tfrecord) as writer:
    for i, (sources, target) in enumerate(utils.yield_sources_and_targets(
        FLAGS.input_file, FLAGS.input_format)):
      logging.log_every_n(
          logging.INFO,
          f'{i} examples processed, {num_converted} converted to tf.Example.',
          10000)
      example = builder.build_bert_example(
          sources, target,
          FLAGS.output_arbitrary_targets_for_infeasible_examples)
      if example is None:
        continue
      writer.write(example.to_tf_example().SerializeToString())
      num_converted += 1
  logging.info(f'Done. {num_converted} examples converted to tf.Example.')
  count_fname = _write_example_count(num_converted)
  logging.info(f'Wrote:\n{FLAGS.output_tfrecord}\n{count_fname}')
Example 9
def convert(in_filename, out_filename):
    """Converts a recognized genomics file `in_filename` to `out_filename`.

  Args:
    in_filename: str; filename of a genomics data file to use as input.
    out_filename: str; filename of a genomics data file to use as output, or
      None, if no output should be written.

  Raises:
    ConversionError, if the conversion could not be executed.
  """
    reader_class, writer_class = _reader_writer_classes(
        in_filename, out_filename)

    with reader_class(in_filename) as reader:
        with writer_class(out_filename, header=reader.header) as writer:
            start = time.time()
            i = 0
            for record in reader:
                i += 1
                writer.write(record)
                logging.log_every_n(logging.INFO, "Progress: %d records",
                                    LOG_EVERY, i)
            elapsed = time.time() - start
            logging.info("Done, processed %d records in %0.2f seconds.", i,
                         elapsed)
Example 10
    def feature_model_sparsity_loss(self, lambda1, lambda2, avg1, avg2):
        if avg1:
            word_l2_dist = torch.mean(torch.pow(self.word_emb - self.word_emb_cache, 2))
            feat_l2_dist = torch.mean(torch.pow(self.feature_emb - self.feature_emb_cache, 2))
        else:
            word_l2_dist = torch.sum(torch.pow(self.word_emb - self.word_emb_cache, 2))
            feat_l2_dist = torch.sum(torch.pow(self.feature_emb - self.feature_emb_cache, 2))
        # z = F.relu(torch.matmul(self.word_emb, torch.transpose(self.feature_emb, 1, 0)) - self.feature_relu_bias)
        # b = Bernoulli(F.sigmoid(torch.matmul(self.word_emb, torch.transpose(self.feature_emb, 1, 0))))
        # z = b.sample()
        z = self.compute_z()
        z_sum = z.sum()
        # z_gt_0 = torch.zeros_like(z)
        z_gt_0 = torch.sign(z)
        z_gt_0_sum = z_gt_0.sum()
        if avg2:
            z_sparse = z_sum.sum()
        else:
            z_sparse = z.sum(dim=1).mean()  # per-row sum, averaged over rows

        loss = lambda1 * word_l2_dist + lambda1 * feat_l2_dist + lambda2 * z_sparse
        logging.log_every_n(
            logging.INFO,
            'loss %s | word %s | feat %s | z %s | z_sum %s | z > 0 %s', 100,
            loss.cpu().detach().numpy(), word_l2_dist.cpu().detach().numpy(),
            feat_l2_dist.cpu().detach().numpy(), z_sparse.cpu().detach().numpy(),
            z_sum.cpu().detach().numpy(), z_gt_0_sum.cpu().detach().numpy())
        # logging.info('z.sum() = %s, (z > 0).sum() = %s bias = %s', z.sum(), (z > 0).sum(), self.feature_relu_bias)
        return loss
Example 11
def _get_token_answers(
    prediction,
    cell_classification_threshold,
):
    """Computes answer indexes."""
    span_indexes = prediction.get("span_indexes")
    span_logits = prediction.get("span_logits")
    if span_indexes is not None and span_logits is not None:
        best_logit, best_span = max(zip(span_logits, span_indexes.tolist()))
        logging.log_every_n(
            logging.INFO,
            "best_span: %s, score: %s",
            500,
            best_span,
            best_logit,
        )
        return [
            _to_token_answer(
                prediction,
                best_span[0],
                best_span[1] + 1,
                best_logit,
            )
        ]

    answers = []

    answer_begin_index = None
    answer_end_index = None
    answer_probs = []
    for i, prob in get_cell_token_probs(prediction):
        if prob > cell_classification_threshold:

            if answer_end_index is not None:
                if answer_end_index < i:
                    # There is a gap between the current answer and the new index.
                    answers.append(
                        _to_token_answer(
                            prediction,
                            answer_begin_index,
                            answer_end_index,
                            _geometric_mean(answer_probs),
                        ))
                    answer_begin_index = None
                    answer_end_index = None
                    answer_probs.clear()

            if answer_begin_index is None:
                answer_begin_index = i
            answer_end_index = i + 1
            answer_probs.append(prob)
    if answer_begin_index is not None:
        answers.append(
            _to_token_answer(
                prediction,
                begin_token_index=answer_begin_index,
                end_token_index=answer_end_index,
                score=_geometric_mean(answer_probs),
            ))
    return answers
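The threshold loop above merges consecutive qualifying token indices into half-open [begin, end) spans; a self-contained sketch of just that grouping step (the names here are mine, not the module's):

def group_contiguous(indices):
    """Groups a sorted iterable of ints into half-open [begin, end) runs."""
    spans = []
    begin = end = None
    for i in indices:
        if end is not None and i > end:  # gap: close the current run
            spans.append((begin, end))
            begin = None
        if begin is None:
            begin = i
        end = i + 1
    if begin is not None:
        spans.append((begin, end))
    return spans

assert group_contiguous([2, 3, 4, 7, 8]) == [(2, 5), (7, 9)]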
Example 12
def build_pregrants():
    cnx = mysql.connector.connect(option_files=os.path.join(
        os.environ['HOME'], '.mylogin.cnf'),
                                  database='pregrant_publications')
    cursor = cnx.cursor()
    query = "SELECT id, document_number, name_first, name_last FROM rawinventor;"
    cursor.execute(query)
    feature_map = collections.defaultdict(list)
    idx = 0
    for uuid, document_number, name_first, name_last in cursor:
        im = InventorMention(uuid,
                             None,
                             '',
                             name_first if name_first else '',
                             name_last if name_last else '',
                             '',
                             '',
                             '',
                             document_number=document_number)
        feature_map[im.record_id].append(last_name(im))
        idx += 1
        logging.log_every_n(logging.INFO,
                            'Processed %s pregrant records - %s features',
                            10000, idx, len(feature_map))
    return feature_map
Example 13
    def convert(self, rng, interaction, random_table):
        """Creates TF example from interaction."""
        question_tokens = self._get_question_tokens(interaction, rng)

        if random_table is not None and rng.random() < 0.5:
            is_random_table = True
            table = random_table
        else:
            is_random_table = False
            if interaction.HasField('table'):
                table = interaction.table
            else:
                table = None

        if table is None:
            question_tokens = self._tokenizer.tokenize(
                interaction.questions[0].original_text)
            question_tokens = question_tokens[:self._max_seq_length - 1]
            tokens, segment_ids, column_ids, row_ids = self._serialize_text(
                question_tokens)
        else:
            if not question_tokens:
                return None
            if random_table is not None:
                logging.log_every_n(
                    logging.INFO,
                    'Table: %s Random Table: %s is_random_table: %s', 500000,
                    interaction.table.table_id, random_table.table_id,
                    is_random_table)

            token_budget = self._get_token_budget(question_tokens)
            tokenized_table = self._tokenize_table(table)
            try:
                num_columns, num_rows, num_tokens = self._get_table_sizes(
                    token_budget, tokenized_table, rng)
            except ValueError:
                return None

            serialized_example = self._serialize(question_tokens,
                                                 tokenized_table, num_columns,
                                                 num_rows, num_tokens)
            tokens = serialized_example.tokens
            segment_ids = serialized_example.segment_ids
            row_ids = serialized_example.row_ids
            column_ids = serialized_example.column_ids

        assert len(tokens) <= self._max_seq_length

        (tokens, masked_lm_positions,
         masked_lm_labels) = self._create_masked_lm_predictions(
             interaction, tokens, column_ids, row_ids, rng)
        instance = TrainingInstance(tokens=tokens,
                                    segment_ids=segment_ids,
                                    column_ids=column_ids,
                                    row_ids=row_ids,
                                    masked_lm_positions=masked_lm_positions,
                                    masked_lm_labels=masked_lm_labels,
                                    is_random_table=is_random_table)
        return self._to_example(table, instance)
Example 14
def evaluate():
    """Eval happens on GPU or CPU, and evals each checkpoint as it appears."""

    candidate_checkpoint = None
    smurf = build_network(batch_size=1)

    weights = {
        'census': FLAGS.weight_census,
        'smooth1': FLAGS.weight_smooth1,
        'smooth2': FLAGS.weight_smooth2,
    }
    evaluate_fn, _ = smurf_data.make_eval_function(FLAGS.eval_on,
                                                   FLAGS.height,
                                                   FLAGS.width,
                                                   progress_bar=True,
                                                   plot_dir=FLAGS.plot_dir,
                                                   num_plots=50,
                                                   weights=weights)

    latest_checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    while 1:
        # Wait for a new checkpoint
        while candidate_checkpoint == latest_checkpoint:
            logging.log_every_n(
                logging.INFO,
                'Waiting for a new checkpoint, at %s, latest is %s', 3,
                FLAGS.checkpoint_dir, latest_checkpoint)
            time.sleep(45.)
            candidate_checkpoint = tf.train.latest_checkpoint(
                FLAGS.checkpoint_dir)
        candidate_checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
        latest_checkpoint = candidate_checkpoint
        logging.info('New checkpoint found: %s', candidate_checkpoint)
        # This forces the checkpoint manager to reexamine the checkpoint directory
        # and become aware of the new checkpoint.
        smurf.update_checkpoint_dir(FLAGS.checkpoint_dir)
        smurf.restore()

        step = tf.compat.v1.train.get_global_step().numpy()
        terminate = False
        if step >= FLAGS.num_train_steps:
            # If initializing from another checkpoint directory, the first checkpoint
            # will be the init checkpoint and might have steps > num_train_steps.
            # Don't quit in this case.
            terminate = True
            if FLAGS.init_checkpoint_dir:
                with gfile.Open(
                        os.path.join(FLAGS.checkpoint_dir, 'checkpoint'),
                        'r') as f:
                    if len(f.readlines()) == 2:
                        logging.info('Continuing evaluation after evaluating '
                                     'init_checkpoint.')
                        terminate = False

        eval_results = evaluate_fn(smurf)
        smurf_plotting.print_eval(eval_results)

        if terminate or FLAGS.run_eval_once:
            return
Example 15
def warning_once(msg, *args):
    """Generates a warning message only once.

    Args:
        msg: str, the message to be logged.
        *args: The args to be substituted into the msg.
    """
    logging.log_every_n(logging.WARNING, msg, 1 << 62, *args)
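A minimal usage sketch (assuming absl's logging, as above): because log_every_n keeps one counter per call site and the only call site is the single line inside warning_once, just the first invocation actually logs, regardless of who calls it or with which message.

from absl import logging

def warning_once(msg, *args):
    logging.log_every_n(logging.WARNING, msg, 1 << 62, *args)

for step in range(1000):
    # Emitted a single time, even though the loop runs 1000 times.
    warning_once('Legacy preprocessing path used at step %d', step)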
Example 16
def added_token_counts(data_iterator,
                       try_swapping,
                       tokenizer,
                       max_input_examples=10000,
                       max_recursion_depth=10000):
  """Computes how many times different phrases have to be added.

  Args:
    data_iterator: Iterator to yield source lists and targets. See function
      yield_sources_and_targets in utils.py for the available iterators. The
      strings in the source list will be concatenated, possibly after swapping
      their order if swapping is enabled.
    try_swapping: Whether to try if swapping sources results in less added text.
    tokenizer: Text tokenizer (derived from tokenization.FullTokenizer).
    max_input_examples: Maximum number of examples to be read from the iterator.
    max_recursion_depth: Maximum recursion depth for LCS. If a long example
      surpasses this recursion depth, the given example is skipped and a warning
      is logged.

  Returns:
    Tuple (collections.Counter for phrases, added phrases for each example).
  """
  phrase_counter = collections.Counter()
  num_examples = 0
  all_added_phrases = []
  for sources, target in data_iterator:
    if num_examples >= max_input_examples:
      break
    logging.log_every_n(logging.INFO, f'{num_examples} examples processed.',
                        1000)
    source_tokens = [t.lower() for t in tokenizer.tokenize(' '.join(sources))]
    target_tokens = [t.lower() for t in tokenizer.tokenize(target)]
    with _recursion_limit(max_recursion_depth):
      try:
        added_phrases = _get_added_phrases(source_tokens, target_tokens)
        if try_swapping and len(sources) == 2:
          source_tokens_swap = [
              t.lower() for t in tokenizer.tokenize(' '.join(sources[::-1]))
          ]
          added_phrases_swap = _get_added_phrases(source_tokens_swap,
                                                  target_tokens)
          # If we can align more and have to add less after swapping, we assume
          # that the sources would be swapped during conversion.
          if len(''.join(added_phrases_swap)) < len(''.join(added_phrases)):
            added_phrases = added_phrases_swap
      except RecursionError:
        logging.log_first_n(
            logging.WARNING, 'Skipping a too long source. Consider increasing '
            '`max_recursion_depth` argument of the `added_token_counts` '
            'function in phrase_vocabulary_optimization_utils.py to keep this '
            f'source: {" ".join(source_tokens)}', 100)
        continue
    for phrase in added_phrases:
      phrase_counter[phrase] += 1
    all_added_phrases.append(added_phrases)
    num_examples += 1
  logging.info('%d examples processed.\n', num_examples)
  return phrase_counter, all_added_phrases
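As a rough, standalone illustration of what _get_added_phrases computes, here is an approximation built on difflib rather than the module's own LCS code (so treat it as a sketch, not the actual implementation):

import difflib

def added_phrases(source_tokens, target_tokens):
    """Runs of target tokens that are not aligned to the source."""
    matcher = difflib.SequenceMatcher(a=source_tokens, b=target_tokens, autojunk=False)
    phrases = []
    for op, _, _, b0, b1 in matcher.get_opcodes():
        if op in ('insert', 'replace'):
            phrases.append(' '.join(target_tokens[b0:b1]))
    return phrases

src = 'turing was born in 1912 . turing died in 1954 .'.split()
tgt = 'turing was born in 1912 and died in 1954 .'.split()
print(added_phrases(src, tgt))  # ['and']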
Example 17
def main(argv):
    logging.get_absl_handler().start_logging_to_file(FLAGS.log_file)

    print("logging path: ", logging.get_log_file_name())
    logging.info('Running under Python {0[0]}.{0[1]}.{0[2]}'.format(
        sys.version_info))
    logging.info("logging level: %d" % logging.get_verbosity())

    for i in range(50):
        logging.log_every_n(logging.INFO, "log_every_10", 10)
Example 18
def encode(all_sections, model_path=None, chunk=0, chunk_size=2500, model=None):
    logging.info('loading model...')
    if model is None:
        model = sent2vec.Sent2vecModel()
        try:
            model.load_model(model_path)
        except Exception as e:
            print(e)
        logging.info('model successfully loaded')

    stop_words = set(stopwords.words('english'))

    chunk_meta = []
    chunk_vecs = []

    sorted_keys = sorted(all_sections.keys())

    chunk_keys = sorted_keys[(chunk * chunk_size):((chunk + 1) * chunk_size)]

    logging.info('Running on keys %s...', str(chunk_keys[0:5]))

    def preprocess_sentence(text):
        text = text.replace('/', ' / ')
        text = text.replace('.-', ' .- ')
        text = text.replace('.', ' . ')
        text = text.replace('\'', ' \' ')
        text = text.lower()

        tokens = [token for token in word_tokenize(text) if token not in punctuation and token not in stop_words]

        return ' '.join(tokens)

    for k_idx, k in enumerate(chunk_keys):
        s_doc = time.time()
        logging.info('key %s (%s of %s) ', k, k_idx, len(chunk_keys))
        sentences = load_sents(all_sections, k)

        dim = model.get_emb_size()
        vectors = np.zeros((len(sentences), dim))
        gt = time.time
        t = gt()
        counter = 0
        for doc_id, sec_id, sentence_id, s in sentences:
            vectors[counter] = model.embed_sentence(preprocess_sentence(s))
            logging.log_every_n(logging.INFO, 'Processed %s sentences | %s seconds', 10, sentence_id, str(gt() - t))
            counter += 1
        e_t = gt()
        logging.info('Done! Processed %s Sentences | %s seconds', len(sentences), str(e_t - t))
        chunk_meta.extend(sentences)
        chunk_vecs.append(vectors)
        e_doc = time.time()
        logging.info('key %s (%s of %s)... %s seconds ', k, k_idx, len(chunk_keys), e_doc - s_doc)
    return chunk_vecs, chunk_meta
Example 19
    def child_parent_norm_loss(self, pairs):
        internal_norms = poincare_norm(self.internals)
        children = tf.gather(internal_norms, pairs[:, 0])
        parents = tf.gather(internal_norms, pairs[:, 1])
        logits1 = tf.nn.relu(parents - children + self.gamma)
        min_norm = tf.argmin(internal_norms).numpy()[0]
        logging.log_every_n(logging.INFO, 'min_norm %s %s', 500, min_norm,
                            internal_norms[min_norm])
        max_norm = tf.argmax(internal_norms).numpy()[0]
        logging.log_every_n(logging.INFO, 'max_norm %s %s', 500, max_norm,
                            internal_norms[max_norm])
        return tf.reduce_sum(logits1)
Example 20
def load_location_mentions(filename, st=0, N=np.Inf, skip_first_line=True):
    logging.info('Loading location mentions from %s', filename)
    with open(filename, 'r') as fin:
        for idx, line in enumerate(fin):
            if idx == 0 and skip_first_line:
                continue
            logging.log_every_n(logging.INFO, 'Loaded %s lines of %s', 1000, idx, filename)
            if idx > N:
                logging.info('Loaded %s lines of %s', idx, filename)
                return
            elif idx >= st:
                yield LocationMention.from_line(line)
Example 21
def _wait_for_lock_to_disappear(handle, lock_file, lock_file_timeout_sec):
    """Waits for the lock file to disappear.

  The lock file was created by another process that is performing a download
  into its own temporary directory. The name of this temp directory is
  sha1(<module>).<uuid>.tmp where <uuid> comes from the lock file.

  Args:
    handle: The location from which a module is being downloaded.
    lock_file: Lock file created by another process downloading this module.
    lock_file_timeout_sec: The amount of time to wait (in seconds) before we
                           can declare that the other download has been
                           abandoned. The download is declared abandoned if
                           there is no file size change in the temporary
                           directory within the last 'lock_file_timeout_sec'.
  """
    locked_tmp_dir_size = 0
    locked_tmp_dir_size_check_time = time.time()
    lock_file_content = None
    while tf.compat.v1.gfile.Exists(lock_file):
        try:
            logging.log_every_n(
                logging.INFO,
                "Module '%s' already being downloaded by '%s'. Waiting.", 10,
                handle, tf_utils.read_file_to_string(lock_file))
            if (time.time() - locked_tmp_dir_size_check_time >
                    lock_file_timeout_sec):
                # Check whether the holder of the current lock downloaded anything
                # in its temporary directory in the last 'lock_file_timeout_sec'.
                cur_locked_tmp_dir_size = _locked_tmp_dir_size(lock_file)
                cur_lock_file_content = tf_utils.read_file_to_string(lock_file)
                if (cur_locked_tmp_dir_size == locked_tmp_dir_size
                        and cur_lock_file_content == lock_file_content):
                    # There was no data downloaded in the past
                    # 'lock_file_timeout_sec'. Steal the lock and proceed with the
                    # local download.
                    logging.warning("Deleting lock file %s due to inactivity.",
                                    lock_file)
                    tf.compat.v1.gfile.Remove(lock_file)
                    break
                locked_tmp_dir_size = cur_locked_tmp_dir_size
                locked_tmp_dir_size_check_time = time.time()
                lock_file_content = cur_lock_file_content
        except tf.errors.NotFoundError:
            # Lock file or temp directory were deleted during check. Continue
            # to check whether download succeeded or we need to start our own
            # download.
            pass
        finally:
            time.sleep(5)
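For context, a small sketch of the temp-directory naming convention described in the docstring, sha1(<module>).<uuid>.tmp (illustrative only; tensorflow_hub's real helper functions may differ):

import hashlib
import uuid

def tmp_dir_name(handle, lock_uuid):
    # sha1(<module>).<uuid>.tmp, with the uuid taken from the lock file.
    return '%s.%s.tmp' % (hashlib.sha1(handle.encode('utf-8')).hexdigest(), lock_uuid)

print(tmp_dir_name('https://tfhub.dev/google/nnlm-en-dim128/2', uuid.uuid4().hex))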
Example 22
def _added_token_counts(data_iterator, try_swapping, max_input_examples=10000):
    """Computes how many times different phrases have to be added.
    That is, it counts how many distinct phrases need to be added.

    Args:
      data_iterator: Iterator to yield source lists and targets. See function
        yield_sources_and_targets in utils.py for the available iterators. The
        strings in the source list will be concatenated, possibly after swapping
        their order if swapping is enabled.
      try_swapping: Whether to try if swapping sources results in less added text.
      max_input_examples: Maximum number of examples to be read from the iterator.

    Returns:
      Tuple (collections.Counter for phrases, added phrases for each example).
    """
    phrase_counter = collections.Counter()
    num_examples = 0
    all_added_phrases = []
    max_seq_length = 0
    for sources, target in data_iterator:
        # sources may contain several sentences, which are later joined with spaces.
        if num_examples >= max_input_examples:
            break
        # source_merge = ' '.join(sources)
        source_merge = sources

        #print("phrase_vocabulary_optimization.py source_merge",source_merge)
        if len(source_merge) > max_seq_length:
            print(
                curLine(),
                "max_seq_length=%d, len(source_merge)=%d,source_merge:%s" %
                (max_seq_length, len(source_merge), source_merge))
            max_seq_length = len(source_merge)
        logging.log_every_n(logging.INFO,
                            f'{num_examples} examples processed.', 10000)
        added_phrases = _get_added_phrases(source_merge, target)
        #print("added_phrases",added_phrases)
        if try_swapping and len(sources) == 2:
            added_phrases_swap = _get_added_phrases(' '.join(sources[::-1]),
                                                    target)
            # If we can align more and have to add less after swapping, we assume that
            # the sources would be swapped during conversion.
            if len(''.join(added_phrases_swap)) < len(''.join(added_phrases)):
                added_phrases = added_phrases_swap
        for phrase in added_phrases:
            phrase_counter[phrase] += 1
        all_added_phrases.append(added_phrases)
        num_examples += 1
    logging.info(f'{num_examples} examples processed.\n')
    return phrase_counter, all_added_phrases, max_seq_length
Example 23
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    flags.mark_flag_as_required('input_file')
    flags.mark_flag_as_required('input_format')
    flags.mark_flag_as_required('output_tfrecord_train')
    flags.mark_flag_as_required('output_tfrecord_dev')
    flags.mark_flag_as_required('vocab_file')
    builder = bert_example.BertExampleBuilder({}, FLAGS.vocab_file,
                                              FLAGS.max_seq_length,
                                              FLAGS.do_lower_case)

    num_converted = 0
    num_ignored = 0
    with tf.python_io.TFRecordWriter(
            FLAGS.output_tfrecord_train) as writer_train:
        for input_file in [FLAGS.input_file]:
            print(curLine(), "input_file:", input_file)
            for i, (sources, target) in enumerate(
                    utils.yield_sources_and_targets(input_file,
                                                    FLAGS.input_format)):
                logging.log_every_n(
                    logging.INFO,
                    f'{i} examples processed, {num_converted} converted to tf.Example.',
                    10000)
                if len(sources[-1]) > FLAGS.max_seq_length:  # TODO: skip examples whose question is too long
                    num_ignored += 1
                    print(
                        curLine(),
                        "ignore num_ignored=%d, question length=%d" %
                        (num_ignored, len(sources[-1])))
                    continue
                example1, _ = builder.build_bert_example(sources, target)
                example = example1.to_tf_example().SerializeToString()
                writer_train.write(example)
                num_converted += 1
    logging.info(
        f'Done. {num_converted} examples converted to tf.Example, num_ignored {num_ignored} examples.'
    )
    for output_file in [
            FLAGS.output_tfrecord_train, FLAGS.output_tfrecord_dev
    ]:
        count_fname = _write_example_count(num_converted,
                                           output_file=output_file)
        logging.info(f'Wrote:\n{output_file}\n{count_fname}')
    with open(FLAGS.label_map_file, "w") as f:
        json.dump(builder._label_map, f, ensure_ascii=False, indent=4)
    print(curLine(),
          "save %d to %s" % (len(builder._label_map), FLAGS.label_map_file))
Example 24
def write_variants_to_vcf(variant_generator, output_vcf_path, header):
    """Writes Variant protos to a VCF file.

  Args:
    variant_generator: generator. A generator that yields sorted Variant protos.
    output_vcf_path: str. Output file in VCF format.
    header: VcfHeader proto. The VCF header to use for writing the variants.
  """
    logging.info('Writing output to VCF file: %s', output_vcf_path)
    with vcf.VcfWriter(output_vcf_path, header=header,
                       round_qualities=True) as writer:
        for idx, variant in enumerate(variant_generator):
            logging.log_every_n(logging.INFO, '%s variants written.',
                                _LOG_EVERY_N, idx + 1)
            writer.write(variant)
Example 25
def build_granted(config):
    # | uuid | patent_id | assignee_id | rawlocation_id | type | name_first | name_last | organization | sequence |
    cnx = pvdb.granted_table(config)
    cursor = cnx.cursor()
    query = "SELECT uuid , patent_id , assignee_id , rawlocation_id , type , name_first , name_last , organization , sequence FROM rawassignee;"
    cursor.execute(query)
    feature_map = collections.defaultdict(list)
    idx = 0
    for rec in cursor:
        am = AssigneeMention.from_granted_sql_record(rec)
        feature_map[am.name_features()[0]].append(am)
        idx += 1
        logging.log_every_n(logging.INFO,
                            'Processed %s granted records - %s features',
                            10000, idx, len(feature_map))
    return feature_map
Example 26
def build_pregrants(config):
    # | id | document_number | sequence | name_first | name_last | organization | type | rawlocation_id | city | state | country | filename | created_date | updated_date |
    cnx = pvdb.pregranted_table(config)
    cursor = cnx.cursor()
    query = "SELECT id, document_number, sequence -1 as sequence, name_first, name_last, organization, type, rawlocation_id, city, state, country FROM rawassignee"
    cursor.execute(query)
    feature_map = collections.defaultdict(list)
    idx = 0
    for rec in cursor:
        am = AssigneeMention.from_application_sql_record(rec)
        feature_map[am.name_features()[0]].append(am)
        idx += 1
        logging.log_every_n(logging.INFO,
                            'Processed %s pregrant records - %s features',
                            10000, idx, len(feature_map))
    return feature_map
Example 27
def _consolidate_numeric_values(
    row_index_to_values,
    min_consolidation_fraction,
    debug_info):
  """Finds the most common numeric values in a column and returns them.

  Args:
   row_index_to_values: For each row index all the values in that cell.
   min_consolidation_fraction: Fraction of cells that need to have consolidated
     value.
   debug_info: Additional information only used for logging.

  Returns:
   For each row index the first value that matches the most common value.
   Rows that don't have a matching value are dropped. Empty list if values can't
   be consolidated.
  """
  type_counts = collections.Counter()
  for numeric_values in row_index_to_values.values():
    type_counts.update(_get_all_types(numeric_values))
  if not type_counts:
    return {}
  max_count = max(type_counts.values())
  if max_count < len(row_index_to_values) * min_consolidation_fraction:
    logging.log_every_n(logging.INFO, 'Can\'t consolidate types: %s %s %d', 100,
                        debug_info, row_index_to_values, max_count)
    return {}

  valid_types = set()
  for value_type, count in type_counts.items():
    if count == max_count:
      valid_types.add(value_type)
  if len(valid_types) > 1:
    assert constants.DATE_TYPE in valid_types
    max_type = constants.DATE_TYPE
  else:
    max_type = next(iter(valid_types))

  new_row_index_to_value = {}
  for index, values in row_index_to_values.items():
    # Extract the first matching value.
    for value in values:
      if _get_value_type(value) == max_type:
        new_row_index_to_value[index] = value
        break

  return new_row_index_to_value
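A self-contained toy version of the same consolidation idea (my own simplification; _get_all_types, _get_value_type, and the date-versus-numeric tie-break are abstracted into a single get_type callback):

import collections

def consolidate(values_by_row, min_fraction, get_type):
    """Keeps, per row, the first value whose type is the dominant one."""
    type_counts = collections.Counter()
    for values in values_by_row.values():
        type_counts.update({get_type(v) for v in values})
    if not type_counts:
        return {}
    max_count = max(type_counts.values())
    if max_count < len(values_by_row) * min_fraction:
        return {}  # no type is common enough to consolidate on
    dominant = max(type_counts, key=type_counts.get)
    return {
        row: next(v for v in values if get_type(v) == dominant)
        for row, values in values_by_row.items()
        if any(get_type(v) == dominant for v in values)
    }

rows = {0: ['3', 'march'], 1: ['7'], 2: ['5 apples', '5']}
print(consolidate(rows, 0.5, lambda v: 'number' if v.isdigit() else 'text'))
# {0: '3', 1: '7', 2: '5'}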
Example 28
def build_granted(config):
    feature_map = collections.defaultdict(list)
    cnx = pvdb.incremental_granted_table(config)
    if cnx is None:
        return feature_map
    cursor = cnx.cursor()
    query = "SELECT uuid, patent_id, name_first, name_last FROM rawinventor;"
    cursor.execute(query)
    idx = 0
    for uuid, patent_id, name_first, name_last in cursor:
        im = InventorMention(uuid, patent_id, '', name_first if name_first else '', name_last if name_last else '', '',
                             '', '')
        feature_map[im.record_id].append(last_name(im))
        idx += 1
        logging.log_every_n(logging.INFO, 'Processed %s granted records - %s features', 10000, idx, len(feature_map))
    logging.log(logging.INFO, 'Processed %s granted records - %s features', idx, len(feature_map))
    return feature_map
Example 29
def build_granted():
    cnx = mysql.connector.connect(option_files=os.path.join(
        os.environ['HOME'], '.mylogin.cnf'),
                                  database='patent_20200630')
    cursor = cnx.cursor()
    query = "SELECT id,title FROM patent;"
    cursor.execute(query)
    feature_map = dict()
    idx = 0
    for rec in cursor:
        record_id = '%s' % rec[0]
        feature_map[record_id] = rec[1]
        idx += 1
        logging.log_every_n(logging.INFO,
                            'Processed %s grant records - %s features', 10000,
                            idx, len(feature_map))
    return feature_map
Example 30
def build_pregrants():
    cnx = mysql.connector.connect(option_files=os.path.join(
        os.environ['HOME'], '.mylogin.cnf'),
                                  database='pregrant_publications')
    cursor = cnx.cursor()
    query = "select document_number,invention_title from application;"
    cursor.execute(query)
    feature_map = dict()
    idx = 0
    for rec in cursor:
        record_id = 'pg-%s' % rec[0]
        feature_map[record_id] = rec[1]
        idx += 1
        logging.log_every_n(logging.INFO,
                            'Processed %s pregrant records - %s features',
                            10000, idx, len(feature_map))
    return feature_map
Example 31
def main(argv):
  del argv

  contigs = fasta.RefFastaReader(FLAGS.ref).header.contigs
  max_records = FLAGS.max_records if FLAGS.max_records >= 0 else None
  variants_iter = examples_to_variants(FLAGS.examples, max_records=max_records)

  if not FLAGS.sample_name:
    sample_name, variants_iter = peek_sample_name(variants_iter)
  else:
    sample_name = FLAGS.sample_name
  header = dv_vcf_constants.deepvariant_header(
      contigs=contigs, sample_names=[sample_name])
  with vcf.VcfWriter(FLAGS.output_vcf, header=header) as writer:
    for variant in variants_iter:
      variant.calls[0].call_set_name = sample_name
      logging.log_every_n(logging.INFO, 'Converted %s', FLAGS.log_every,
                          variant_utils.variant_key(variant))
      writer.write(variant)