def fasttext_ids(word, vocab, buckets, minn=3, maxn=6, start='<', end='>'):
  """Returns fastText-style ids: the word id (if in vocab) plus hashed char-ngram ids."""
  ngrams = get_ngrams(word, minn, maxn, start, end)
  # Ngram ids live in [vocab.size(), vocab.size() + buckets), so they never
  # collide with regular word ids.
  ngram_ids = [vocab.size() + hash(x) % buckets for x in ngrams]
  if vocab.has(word):
    ids = [vocab.id(word)] + ngram_ids
  else:
    ids = ngram_ids
  return ids
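
# get_ngrams is assumed to come from the surrounding module. A minimal sketch
# of the usual fastText-style extraction it is expected to implement: pad the
# word with boundary markers, then take every character ngram of length
# minn..maxn (hypothetical helper name, shown only for illustration):
def get_ngrams_sketch(word, minn=3, maxn=6, start='<', end='>'):
  word = start + word + end
  return [word[i:i + n]
          for n in range(minn, maxn + 1)
          for i in range(len(word) - n + 1)]

# e.g. get_ngrams_sketch('ab', minn=3, maxn=3) -> ['<ab', 'ab>']
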
def id(self, word):
  """Returns the integer word id of a word string."""
  if word in self.vocab:
    return self.vocab[word]
  if not self._buckets:
    return self._unk_id
  # OOV: hash into one of _buckets extra ids placed after the normal vocab
  # range, so every string still gets a well-defined id.
  return self.size() + gezi.hash(word) % self._buckets
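
# A property-check sketch of the id() contract above, for a hypothetical
# vocab instance v: in-vocab words keep their stored id; OOV words either
# collapse to the single unk id (no buckets) or hash into the range
# [v.size(), v.size() + v._buckets). Relies on gezi.hash being deterministic.
def check_id_range(v, word):
  i = v.id(word)
  if word in v.vocab:
    assert i == v.vocab[word]
  elif not v._buckets:
    assert i == v._unk_id
  else:
    assert v.size() <= i < v.size() + v._buckets
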
def get_ngrams_hash(input, buckets, minn=3, maxn=6, start='<', end='>', reserve=0):
  """Returns hashed ngram ids in [reserve, reserve + buckets)."""
  ngrams = get_ngrams(input, minn, maxn, start, end)
  ngrams = [reserve + hash(x) % buckets for x in ngrams]
  return ngrams
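
# Usage sketch for get_ngrams_hash: `reserve` shifts every hashed id up so
# ids 0..reserve-1 stay free for special slots such as padding. Hypothetical
# call, assuming get_ngrams behaves as sketched earlier:
#
#   ids = get_ngrams_hash('apple', buckets=200000, reserve=1)
#   assert all(1 <= i < 200001 for i in ids)  # id 0 is left for padding
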
def loggest_match(cns, vocab, encode_unk=False, unk_vocab_size=None, vocab_size=None):
  """Greedily matches the longest prefix of cns that is in vocab.

  Returns (id, remaining_chars). NOTE: when unk_vocab_size is set, the whole
  remaining span is hashed on the first miss instead of backing off.
  """
  len_ = len(cns)
  for i in range(len_):
    w = ''.join(cns[:len_ - i])
    # for compat with the c++ vocabulary
    if vocab.has(w):
      return vocab.id(w), cns[len_ - i:]
    elif unk_vocab_size:
      return gezi.hash(w) % unk_vocab_size + vocab_size, cns[len_ - i:]
  # No prefix matched: emit unk (or -1) and skip one character.
  if encode_unk:
    return vocab.unk_id(), cns[1:]
  else:
    return -1, cns[1:]
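
# How loggest_match (original spelling kept, since callers reference it as
# gezi.loggest_match_seg) is typically driven: repeatedly consume the longest
# prefix the vocab knows, dropping one char when nothing matches. A sketch of
# such a driver, under the assumption that gezi.loggest_match_seg works
# roughly this way:
def loggest_match_seg_sketch(cns, vocab, encode_unk=False):
  ids = []
  rest = list(cns)
  while rest:
    wid, rest = loggest_match(rest, vocab, encode_unk=encode_unk)
    if wid >= 0:  # -1 means the leading char was skipped unencoded
      ids.append(wid)
  return ids
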
def main(_):
  num_conflicts = 0
  visited = {}
  visited_ngram = {}

  ngram_vocab_path = FLAGS.ngram_vocab or os.path.join(FLAGS.dir, 'ngram_vocab.txt')
  ngram_vocab = Vocabulary(ngram_vocab_path)
  print('ngram_vocab size', ngram_vocab.size())
  print('num ngram buckets', FLAGS.ngram_buckets)

  if FLAGS.emb.endswith('.npy'):
    ngram_emb = np.load(FLAGS.emb)
    assert len(ngram_emb) > 100000
  else:
    ngram_emb = []
    for line in open(FLAGS.emb):
      ngram_emb.append([float(x) for x in line.strip().split()])
  print('len ngram emb', len(ngram_emb))

  emb_mat = []
  vec_size = FLAGS.emb_dim
  # Row 0 is the all-zero padding vector.
  emb_mat.append(np.array([0.] * vec_size))
  # The first 4 vocab entries are pad plus the 3 specials (unk, <s>, </s>);
  # since the ngram vocab txt does not include the specials, append random
  # vectors for them here.
  for i in range(3):
    emb_mat.append([np.random.uniform(-0.08, 0.08) for _ in range(vec_size)])

  for i in range(4, ngram_vocab.size()):
    ngram = ngram_vocab.key(i)
    ngram_hash = gezi.hash(ngram)
    ngram_id = ngram_hash % FLAGS.ngram_buckets
    if ngram_id not in visited:
      visited[ngram_id] = 1
      visited_ngram[ngram_id] = [ngram]
    else:
      visited[ngram_id] += 1
      visited_ngram[ngram_id].append(ngram)
      num_conflicts += 1
      #print('Conflict', visited_ngram[ngram_id], 'Num conflicts', num_conflicts)
    emb_mat.append(ngram_emb[ngram_id])
  print('Num conflicts', num_conflicts)
  print('len(emb_mat)', len(emb_mat))

  ngram_output = FLAGS.ngram_output or 'ngram.npy'
  out_mat = os.path.join(FLAGS.dir, ngram_output)
  print('out mat', out_mat)
  np.save(out_mat, np.array(emb_mat))
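
# A rough sanity check for the printed conflict count: hashing n distinct
# ngrams into b buckets revisits a bucket about n - b * (1 - (1 - 1/b)**n)
# times in expectation (balls-into-bins estimate; an aid for eyeballing the
# output, not part of the original script):
def expected_conflicts(n, b):
  return n - b * (1 - (1 - 1.0 / b) ** n)
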
def words2ids(words, feed_single=True, allow_all_zero=False, pad=True,
              append_start=False, append_end=False, max_words=None,
              norm_digit=True, norm_all_digit=False, multi_grid=None,
              encode_unk=None, feed_single_en=False, digit_to_chars=False,
              unk_vocab_size=None):
  """Converts a list of word strings to a list of word ids.

  The default params are suitable for bag-of-words; sequence methods may need
  seg_method=phrase and feed_single=True.
  @TODO feed_single is for simplicity; the best strategy may be to back off
  one level at a time: new-word -> phrase -> basic -> single cn.
  @TODO move feed_single to Segmentor.py to add support for segmenting with a
  vocab.
  norm_all_digit is mostly unused, since you can control that behavior when
  generating the vocab.
  """
  multi_grid = multi_grid or MULTI_GRID
  encode_unk = encode_unk or ENCODE_UNK

  new_words = []
  if not feed_single:
    word_ids = [get_id(word, unk_vocab_size)
                for word in words if vocab.has(word) or encode_unk]
  else:
    word_ids = []
    for word in words:
      if digit_to_chars and any(char.isdigit() for char in word):
        # Spell out words containing digits character by character.
        for w in word:
          if not vocab.has(w) and unk_vocab_size:
            word_ids.append(gezi.hash(w) % unk_vocab_size + vocab.size())
            new_words.append(w)
          elif vocab.has(w) or encode_unk:
            word_ids.append(vocab.id(w))
            new_words.append(w)
        continue
      elif norm_all_digit and word.isdigit():
        word_ids.append(vocab.id(NUM_MARK))
        new_words.append(word)
        continue
      if vocab.has(word):
        word_ids.append(vocab.id(word))
        new_words.append(word)
      elif not norm_all_digit and norm_digit and word.isdigit():
        word_ids.append(vocab.id(NUM_MARK))
        new_words.append(word)
      else:
        # TODO: might use a trie to speed up longest-match segmentation
        if (not multi_grid) or feed_single_en:
          chars = word if feed_single_en else gezi.get_single_cns(word)
          if chars:
            for w in chars:
              if not vocab.has(w) and unk_vocab_size:
                word_ids.append(gezi.hash(w) % unk_vocab_size + vocab.size())
                new_words.append(w)
              elif vocab.has(w) or encode_unk:
                word_ids.append(vocab.id(w))
                new_words.append(w)
          elif unk_vocab_size:
            word_ids.append(gezi.hash(word) % unk_vocab_size + vocab.size())
            new_words.append(word)
          elif encode_unk:
            word_ids.append(vocab.unk_id())
            new_words.append(word)
        else:
          # test it! print(text2ids.ids2text(text2ids.text2ids('匍匐前进')))
          word_ids += gezi.loggest_match_seg(
              word, vocab, encode_unk=encode_unk,
              unk_vocab_size=unk_vocab_size, vocab_size=vocab.size())
          # NOTICE: new_words is lost here!

  if append_start:
    word_ids = [vocab.start_id()] + word_ids
  if append_end:
    word_ids = word_ids + [vocab.end_id()]
  if not allow_all_zero and not word_ids:
    word_ids.append(vocab.end_id())
  if pad:
    word_ids = gezi.pad(word_ids, max_words or TEXT_MAX_WORDS, 0)

  return word_ids, new_words
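
# Hypothetical call, assuming the module-level vocab is loaded and
# TEXT_MAX_WORDS is the default pad length:
#
#   ids, kept = words2ids(['我们', '报道', '2024'], feed_single=True, pad=True)
#   # digits collapse to NUM_MARK when norm_digit=True; unknown multi-char
#   # words fall back to single characters, or to longest-match segmentation
#   # when multi_grid is set (in which case new_words is not tracked).
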
def get_id(word, unk_vocab_size=None):
  if unk_vocab_size and not vocab.has(word):
    return gezi.hash(word) % unk_vocab_size + vocab.size()
  return vocab.id(word)
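
# Usage sketch for get_id (module-level vocab assumed):
#
#   get_id('known')                         # regular vocab id
#   get_id('unseen')                        # delegates to vocab.id(), i.e.
#                                           #   unk id or vocab-level hashing
#   get_id('unseen', unk_vocab_size=50000)  # hashed into
#                                           #   [vocab.size(), vocab.size() + 50000)
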
def main(data_dir):
  input_dir = '../input'

  # Hash each image file name (without the '.png' suffix) into NUM_FOLDS
  # folds; every fold except 0 goes to train.
  input_files = [f for f in glob.glob('%s/%s/*.png' % (input_dir, 'train_images'))
                 if int(gezi.hash(os.path.basename(f)[:-4])) % NUM_FOLDS != 0]
  print('train:', len(input_files))
  output_file = os.path.join(data_dir, 'train.tfrecords')
  convert_to_tfrecord(input_files, output_file)

  # Fold 0 becomes the validation split.
  input_files = [f for f in glob.glob('%s/%s/*.png' % (input_dir, 'train_images'))
                 if int(gezi.hash(os.path.basename(f)[:-4])) % NUM_FOLDS == 0]
  print('valid:', len(input_files))
  output_file = os.path.join(data_dir, 'valid.tfrecords')
  convert_to_tfrecord(input_files, output_file)

  input_files = glob.glob('%s/%s/*.png' % (input_dir, 'test_images'))
  output_file = os.path.join(data_dir, 'test.tfrecords')
  convert_to_tfrecord(input_files, output_file)
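
# The train/valid split above is stable across runs only because gezi.hash is
# deterministic (Python's built-in hash() is seeded per process). A
# hypothetical helper making the fold rule explicit:
def fold_of(path, num_folds=NUM_FOLDS):
  name = os.path.basename(path)[:-4]  # strip the '.png' extension
  return int(gezi.hash(name)) % num_folds

# train_images with fold_of(f) != 0 go to train; fold 0 goes to valid.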