parser.add_argument('--transformer', help='Are we using a Transformer (default is LSTM) LM', type=str2bool, default=False)
parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
args = parser.parse_known_args()[0]

embed_type = 'learned-positional' if args.transformer else 'default'
feature_desc = {
    'word': {
        'vectorizer': baseline.Token1DVectorizer(mxlen=-1, transform_fn=baseline.lowercase),
        'embed': {
            'embed_file': args.embeddings,
            'embed_type': embed_type,
            'unif': 0.05
        }
    }
}


class DictionaryDatasetWrapper(Dataset):

    def __init__(self, x, y):
        self.tensor_dataset = TensorDataset(x, y)

    def __getitem__(self, index):
        # Pull a single (features, label) pair out of the underlying TensorDataset
        # and move both tensors to the target device
        x, y = self.tensor_dataset[index]
        return {'word': x.to(args.device)}, y.to(args.device)
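# Usage sketch (not part of the original snippet): iterating over the wrapper with a
# standard DataLoader. This assumes the full class also defines __len__(self) returning
# len(self.tensor_dataset), and that x_train / y_train are hypothetical, already-padded
# LongTensors built from the reader output.
from torch.utils.data import DataLoader

train_set = DictionaryDatasetWrapper(x_train, y_train)
train_loader = DataLoader(train_set, batch_size=20, shuffle=True)
for features, label in train_loader:
    # features is a dict like {'word': <B x T tensor on args.device>};
    # label is a <B,> tensor of class ids
    ...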
pool_field = 'cmotsz' if args.model_type == 'default' else 'rnnsz'

model_params = {
    'model_type': args.model_type,
    'filtsz': args.filts,
    pool_field: args.poolsz
}

if args.stacksz is not None:
    model_params['hsz'] = args.stacksz

feature_desc = {
    'word': {
        'vectorizer': bl.Token1DVectorizer(mxlen=args.mxlen, transform_fn=bl.lowercase),
        'embed': {'file': args.embeddings, 'type': 'default', 'unif': 0.25}
    }
}

# Create a reader that uses our vectorizers to parse a TSV file
# with rows like:
# <label>\t<sentence>\n
vectorizers = {k: v['vectorizer'] for k, v in feature_desc.items()}
reader = bl.TSVSeqLabelReader(vectorizers, clean_fn=bl.TSVSeqLabelReader.do_clean)

train_file = args.train
valid_file = args.valid
test_file = args.test
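# Rough usage sketch (not from the original script): mirrors the reader.load() call in
# the preprocessing example below. `embeddings` stands in for a loaded pretrained-embeddings
# object that exposes a `vocab` dict; batchsz=50 is an arbitrary choice.
train_set = reader.load(train_file, vocabs={'word': embeddings.vocab}, batchsz=50)
valid_set = reader.load(valid_file, vocabs={'word': embeddings.vocab}, batchsz=50)
test_set = reader.load(test_file, vocabs={'word': embeddings.vocab}, batchsz=1)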
parser.add_argument('--mxlen', help='Maximum post length (number of words) during training', type=int, default=100)
parser.add_argument('--train', help='Training file', default='../data/stsa.binary.phrases.train')
parser.add_argument('--valid', help='Validation file', default='../data/stsa.binary.dev')
parser.add_argument('--test', help='Testing file', default='../data/stsa.binary.test')
parser.add_argument('--embeddings', help='Pretrained embeddings file', default='/data/embeddings/GoogleNews-vectors-negative300.bin')
parser.add_argument('--ll', help='Log level', type=str, default='info')
parser.add_argument('--lr', help='Learning rate', type=float, default=0.001)
parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
args = parser.parse_known_args()[0]

feature_desc = {
    'word': {
        'vectorizer': baseline.Token1DVectorizer(mxlen=args.mxlen, transform_fn=baseline.lowercase),
        'embed': {'file': args.embeddings, 'type': 'default', 'unif': 0.25}
    }
}

# Create a reader that uses our vectorizers to parse a TSV file
# with rows like:
# <label>\t<sentence>\n


class DictionaryDatasetWrapper(Dataset):

    def __init__(self, x, x_lengths, y):
        self.tensor_dataset = TensorDataset(x, x_lengths, y)

    def __getitem__(self, index):
        # Pull one example out of the underlying TensorDataset and move its
        # tensors to the target device
        x, x_length, y = self.tensor_dataset[index]
        return {'word': x.to(args.device), "lengths": x_length.to(args.device)}, y.to(args.device)
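# Construction sketch (assumed, not in the original file): padded word ids, lengths and
# labels come back from the reader as arrays (x_np, lengths_np, y_np are hypothetical
# names); wrap them as tensors so a DataLoader built as in the earlier sketch yields
# ({'word': ..., 'lengths': ...}, y) pairs.
import torch

train_set = DictionaryDatasetWrapper(torch.from_numpy(x_np),
                                     torch.from_numpy(lengths_np),
                                     torch.from_numpy(y_np))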
"""Take a file of TSVs where the format is `label<\t>content` and convert it to an
NPZ file of vectors from pretrained embeddings
"""
import baseline as bl
import argparse
import os
import numpy as np

BP = '../data'
TRAIN = os.path.join(BP, 'stsa.binary.phrases.train')
VALID = os.path.join(BP, 'stsa.binary.dev')
TEST = os.path.join(BP, 'stsa.binary.test')
LABELS = os.path.join(BP, 'stsa.binary.labels')
W2V_GN_300 = '/data/embeddings/GoogleNews-vectors-negative300.bin'
VECTORIZERS = {'word': bl.Token1DVectorizer(mxlen=40)}
# The code below uses `reader`, whose definition is not shown in this excerpt; it is
# assumed to be constructed the same way as in the classifier example above
reader = bl.TSVSeqLabelReader(VECTORIZERS, clean_fn=bl.TSVSeqLabelReader.do_clean)


def output_file(input_file):
    return input_file + '.npz'


def convert_input(file, embeddings, batchsz=50):
    batch_x = []
    batch_y = []
    dsz = embeddings.get_dsz()
    ts = reader.load(file, vocabs={'word': embeddings.vocab}, batchsz=batchsz)
    pg = bl.create_progress_bar(len(ts))
    for batch in pg(ts):
        x = batch['word']
        B, T = x.shape
        flat_x = x.reshape(B * T)