Example #1
def fasttext_ids(word, vocab, buckets, minn=3, maxn=6, start='<', end='>'):
    """Return the fastText-style ids of a word: its vocab id (if present)
    plus one hashed bucket id per character n-gram, offset past the vocab."""
    ngrams = get_ngrams(word, minn, maxn, start, end)
    ngram_ids = [vocab.size() + hash(x) % buckets for x in ngrams]
    if vocab.has(word):
        ids = [vocab.id(word)] + ngram_ids
    else:
        ids = ngram_ids
    return ids
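
fasttext_ids assumes a get_ngrams helper that is not shown here. A minimal sketch of what such a helper might look like, extracting character n-grams from the word wrapped in boundary markers (the real helper in the source repo may differ):

def get_ngrams(word, minn=3, maxn=6, start='<', end='>'):
    # Wrap the word in boundary markers, fastText-style: 'cat' -> '<cat>'.
    word = start + word + end
    ngrams = []
    # Collect every substring of length minn..maxn.
    for n in range(minn, maxn + 1):
        for i in range(len(word) - n + 1):
            ngrams.append(word[i:i + n])
    return ngrams

With this sketch, get_ngrams('cat') yields ['<ca', 'cat', 'at>', '<cat', 'cat>', '<cat>'].
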
Example #2
def id(self, word):
  """Returns the integer word id of a word string.

  Out-of-vocab words map to the unk id, or, if hash buckets are configured,
  are hashed into one of self._buckets extra ids appended after the vocab."""
  if word in self.vocab:
    return self.vocab[word]
  if not self._buckets:
    return self._unk_id
  return self.size() + gezi.hash(word) % self._buckets
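
The effect is that in-vocab words keep ids in [0, size()) while out-of-vocab words are hashed into [size(), size() + buckets). A small illustration with made-up numbers; the built-in hash() stands in for gezi.hash, which unlike hash() is stable across runs:

# Illustration only: with a 50000-word vocab and 100000 hash buckets,
# any unknown word maps into the extra id range [50000, 150000).
vocab_size, buckets = 50000, 100000
unk_word_id = vocab_size + hash('some-unseen-word') % buckets
assert vocab_size <= unk_word_id < vocab_size + buckets
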
Example #3
def get_ngrams_hash(input,
                    buckets,
                    minn=3,
                    maxn=6,
                    start='<',
                    end='>',
                    reserve=0):
    ngrams = get_ngrams(input, minn, maxn, start, end)
    # Map each n-gram to a bucket id; `reserve` keeps the lowest ids free
    # (e.g. for padding).
    ngrams = [reserve + hash(x) % buckets for x in ngrams]
    return ngrams
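
A hypothetical usage, assuming the get_ngrams sketch above; the bucket count and reserve offset are illustrative only, and the built-in hash() here is stable across runs only if PYTHONHASHSEED is fixed (the repo's other snippets use gezi.hash for that reason):

# One bucket id per extracted n-gram, offset by reserve=1 so id 0 stays free.
bucket_ids = get_ngrams_hash('where', buckets=2000000, minn=3, maxn=4, reserve=1)
print(len(bucket_ids), bucket_ids[:3])
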
Example #4
def loggest_match(cns,
                  vocab,
                  encode_unk=False,
                  unk_vocab_size=None,
                  vocab_size=None):
    """Greedily match the longest prefix of cns (a list of characters)
    against vocab; returns (word_id, remaining_chars)."""
    len_ = len(cns)
    for i in range(len_):
        w = ''.join(cns[:len_ - i])
        # for compatibility with the c++ vocabulary
        if vocab.has(w):
            return vocab.id(w), cns[len_ - i:]
        elif unk_vocab_size:
            return gezi.hash(w) % unk_vocab_size + vocab_size, cns[len_ - i:]
    if encode_unk:
        return vocab.unk_id(), cns[1:]
    else:
        return -1, cns[1:]
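
A toy illustration of the greedy longest-prefix behavior, using a hypothetical stub in place of the real Vocabulary class (the class and words are made up for this example):

class ToyVocab:
    # Hypothetical stand-in for the real Vocabulary class, illustration only.
    def __init__(self, words):
        self._ids = {w: i for i, w in enumerate(words)}
    def has(self, w):
        return w in self._ids
    def id(self, w):
        return self._ids[w]
    def unk_id(self):
        return len(self._ids)
    def size(self):
        return len(self._ids)

toy = ToyVocab(['北京', '北京大学', '大学'])
# Prefers the longest known prefix: '北京大学' (id 1) over '北京' (id 0).
print(loggest_match(list('北京大学生'), toy))   # -> (1, ['生'])
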
Example #5
def main(_):
  num_conflicts = 0
  visited = {}
  visited_ngram = {}
  ngram_vocab_path = FLAGS.ngram_vocab or os.path.join(FLAGS.dir, 'ngram_vocab.txt')
  ngram_vocab = Vocabulary(ngram_vocab_path)
  print('ngram_vocab size', ngram_vocab.size())
  print('num ngram buckets', FLAGS.ngram_buckets)
  if FLAGS.emb.endswith('.npy'):
    ngram_emb = np.load(FLAGS.emb)
    assert len(ngram_emb) > 100000
  else:
    ngram_emb = []
    for line in open(FLAGS.emb):
      ngram_emb.append([float(x) for x in line.strip().split()])
  print('len ngram emb', len(ngram_emb))
  emb_mat = []
  vec_size = FLAGS.emb_dim
  # index 0 is reserved for padding (all zeros)
  emb_mat.append(np.array([0.] * vec_size))
  # the ngram vocab txt does not include unk, <s>, </s>, so append
  # randomly initialized vectors for those 3 special tokens
  for i in range(3):
    emb_mat.append([np.random.uniform(-0.08, 0.08) for _ in range(vec_size)])

  for i in range(4, ngram_vocab.size()):
    ngram = ngram_vocab.key(i)
    ngram_hash = gezi.hash(ngram)
    ngram_id = ngram_hash % FLAGS.ngram_buckets
    if ngram_id not in visited:
      visited[ngram_id] = 1
      visited_ngram[ngram_id] = [ngram]
    else:
      visited[ngram_id] += 1
      visited_ngram[ngram_id].append(ngram)
      num_conflicts += 1
      #print('Conflict', visited_ngram[ngram_id], 'Num conflicts', num_conflicts)
    emb_mat.append(ngram_emb[ngram_id])
  print('Num conflicts', num_conflicts)

  print('len(emb_mat)', len(emb_mat))
  ngram_output = FLAGS.ngram_output or 'ngram.npy'
  out_mat = os.path.join(FLAGS.dir, ngram_output)
  print('out mat', out_mat)
  np.save(out_mat, np.array(emb_mat))
def words2ids(words,
              feed_single=True,
              allow_all_zero=False,
              pad=True,
              append_start=False,
              append_end=False,
              max_words=None,
              norm_digit=True,
              norm_all_digit=False,
              multi_grid=None,
              encode_unk=None,
              feed_single_en=False,
              digit_to_chars=False,
              unk_vocab_size=None):
    """
  default params is suitable for bow
  for sequence method may need seg_method prhase and feed_single=True,
  @TODO feed_single is for simplicity, the best strategy may be try to use one level lower words
  like new-word -> phrase -> basic -> single cn

  #@TODO feed_single move to Segmentor.py to add support for seg with vocab 
  norm_all_digit is not used mostly, since you can control this behavior when gen vocab 
  """
    multi_grid = multi_grid or MULTI_GRID
    encode_unk = encode_unk or ENCODE_UNK

    new_words = []
    if not feed_single:
        word_ids = [
            get_id(word, unk_vocab_size) for word in words
            if vocab.has(word) or encode_unk
        ]
    else:
        word_ids = []
        for word in words:
            if digit_to_chars and any(char.isdigit() for char in word):
                for w in word:
                    if not vocab.has(w) and unk_vocab_size:
                        word_ids.append(
                            gezi.hash(w) % unk_vocab_size + vocab.size())
                        new_words.append(w)
                    else:
                        if vocab.has(w) or encode_unk:
                            word_ids.append(vocab.id(w))
                            new_words.append(w)
                continue
            elif norm_all_digit and word.isdigit():
                word_ids.append(vocab.id(NUM_MARK))
                new_words.append(word)
                continue
            if vocab.has(word):
                word_ids.append(vocab.id(word))
                new_words.append(word)
            elif not norm_all_digit and norm_digit and word.isdigit():
                word_ids.append(vocab.id(NUM_MARK))
                new_words.append(word)
            else:
                # TODO: a trie could speed up longest-match segmentation
                if (not multi_grid) or feed_single_en:
                    if not feed_single_en:
                        chars = gezi.get_single_cns(word)
                    else:
                        chars = word
                    if chars:
                        for w in chars:
                            if not vocab.has(w) and unk_vocab_size:
                                word_ids.append(
                                    gezi.hash(w) % unk_vocab_size +
                                    vocab.size())
                                new_words.append(w)
                            else:
                                if vocab.has(w) or encode_unk:
                                    word_ids.append(vocab.id(w))
                                    new_words.append(w)
                    else:
                        if unk_vocab_size:
                            word_ids.append(
                                gezi.hash(word) % unk_vocab_size +
                                vocab.size())
                            new_words.append(word)
                        else:
                            if encode_unk:
                                word_ids.append(vocab.unk_id())
                                new_words.append(word)
                else:
                    #test it!  print text2ids.ids2text(text2ids.text2ids('匍匐前进'))
                    word_ids += gezi.loggest_match_seg(
                        word,
                        vocab,
                        encode_unk=encode_unk,
                        unk_vocab_size=unk_vocab_size,
                        vocab_size=vocab.size())
                    # NOTICE new_words lost here!

    if append_start:
        word_ids = [vocab.start_id()] + word_ids

    if append_end:
        word_ids = word_ids + [vocab.end_id()]

    if not allow_all_zero and not word_ids:
        word_ids.append(vocab.end_id())

    if pad:
        word_ids = gezi.pad(word_ids, max_words or TEXT_MAX_WORDS, 0)

    return word_ids, new_words
def get_id(word, unk_vocab_size=None):
    if unk_vocab_size and not vocab.has(word):
        return gezi.hash(word) % unk_vocab_size + vocab.size()
    return vocab.id(word)
Example #8
def main(data_dir):
  input_dir = '../input'
  
  input_files = [
      f for f in glob.glob('%s/%s/*.png' % (input_dir, 'train_images'))
      if int(gezi.hash(os.path.basename(f)[:-4])) % NUM_FOLDS != 0
  ]
  print('train:', len(input_files))
  output_file = os.path.join(data_dir, 'train.tfrecords')
  convert_to_tfrecord(input_files, output_file)

  input_files = [
      f for f in glob.glob('%s/%s/*.png' % (input_dir, 'train_images'))
      if int(gezi.hash(os.path.basename(f)[:-4])) % NUM_FOLDS == 0
  ]
  print('valid:', len(input_files))
  output_file = os.path.join(data_dir, 'valid.tfrecords')
  convert_to_tfrecord(input_files, output_file) 

  input_files = glob.glob('%s/%s/*.png' % (input_dir, 'test_images'))
  output_file = os.path.join(data_dir, 'test.tfrecords')
  convert_to_tfrecord(input_files, output_file)
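
The split above relies on gezi.hash being a stable hash of the image name, so each file always falls into the same fold. A minimal sketch of the same idea, with hashlib standing in for gezi.hash and an assumed NUM_FOLDS of 10 (neither is taken from the source):

import hashlib

def fold_of(name, num_folds=10):
    # Deterministic fold assignment; hashlib.md5 stands in for gezi.hash here.
    return int(hashlib.md5(name.encode('utf-8')).hexdigest(), 16) % num_folds

# Mirrors the script: fold != 0 -> train, fold == 0 -> valid.
print(fold_of('0a1b2c3d'))
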