set_tf_log_level(args.tf_ll)

feature_desc = {
    'word': {
        'vectorizer': baseline.Token1DVectorizer(mxlen=100, transform_fn=baseline.lowercase),
        'embed': {
            'file': args.embeddings,
            'type': 'default',
            'unif': 0.25
        }
    }
}

vectorizers = {k: v['vectorizer'] for k, v in feature_desc.items()}

reader = baseline.TSVSeqLabelReader(vectorizers,
                                    clean_fn=baseline.TSVSeqLabelReader.do_clean)

train_file = args.train
valid_file = args.valid
test_file = args.test

# This builds a set of counters
vocabs, labels = reader.build_vocab([train_file, valid_file, test_file])

# This builds a set of embeddings objects; these are typically not DL-specific,
# but if they happen to be addons, they can be
embeddings = dict()
for k, v in feature_desc.items():
    embed_config = v['embed']
    embeddings_for_k = baseline.embeddings.load_embeddings(
        'word',
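        # The keyword arguments below are an assumed completion of this call,
        # reconstructed from the 'embed' config above; check the signature of
        # baseline.embeddings.load_embeddings if your version differs.
        embed_file=embed_config['file'],
        known_vocab=vocabs[k],
        embed_type=embed_config['type'],
        unif=embed_config['unif'])
    embeddings[k] = embeddings_for_k['embeddings']
    # keep the vocab the loader actually used, which may differ from the counters
    vocabs[k] = embeddings_for_k['vocab']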
    batch_y = []
    dsz = embeddings.get_dsz()
    ts = reader.load(file, vocabs={'word': embeddings.vocab}, batchsz=batchsz)
    pg = bl.create_progress_bar(len(ts))
    for batch in pg(ts):
        x = batch['word']
        B, T = x.shape
        flat_x = x.reshape(B * T)
        dense = embeddings.weights[flat_x]
        dense = dense.reshape(B, T, dsz)
        batch_x.append(dense)
        batch_y.append(batch['y'])
    return np.stack(batch_x), np.stack(batch_y)

reader = bl.TSVSeqLabelReader(VECTORIZERS, clean_fn=bl.TSVSeqLabelReader.do_clean)

train_file = os.path.join(BP, TRAIN)
valid_file = os.path.join(BP, VALID)
test_file = os.path.join(BP, TEST)

# This builds a set of counters
vocabs, labels = reader.build_vocab([train_file, valid_file, test_file])

print('Writing {}'.format(LABELS))
bl.write_json(labels, LABELS)

# This builds a set of embeddings objects; these are typically not DL-specific,
# but if they happen to be addons, they can be
embeddings = bl.PretrainedEmbeddingsModel(W2V_GN_300,
                                          known_vocab=vocabs['word'],
                                          embed_type='default',
                                          unif=0.)
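The reshape-and-gather trick in the loop above is worth seeing in isolation: indexing a 2D weight matrix with a flattened array of word ids pulls out one embedding row per token, and a final reshape restores the batch and time dimensions. Below is a self-contained numpy sketch of the same lookup; the table W and the ids are toy values for illustration, not the pretrained weights loaded above.

import numpy as np

# toy embedding table: vocab of 5 "words", dsz = 3 (illustrative values only)
W = np.arange(15, dtype=np.float32).reshape(5, 3)
ids = np.array([[1, 4],
                [0, 2]])                   # (B=2, T=2) matrix of word ids
B, T = ids.shape
flat = ids.reshape(B * T)                  # flatten so W is indexed in one shot
dense = W[flat].reshape(B, T, W.shape[1])  # (B, T, dsz) dense word vectors
assert (dense[0, 1] == W[4]).all()         # position (0, 1) holds the row for id 4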