def read_lines(self, fnames):
    '''Read single lines from data'''
    for fname in fnames:
        with fname.open('r') as f:
            for line in f:
                yield self.vocab.lookup([
                    w for w in utils.read_words(line, chars=cfg.char_model)
                ])
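# This snippet (and the load_by_parsing one further below) relies on a
# utils.read_words(line, chars=...) helper whose definition is not shown here.
# A minimal sketch of what such a tokenizer might look like; the exact
# behaviour is an assumption, not the project's code:
def read_words(line, chars=False):
    """Split a line into tokens: characters when chars=True, else whitespace words."""
    line = line.strip()
    if chars:
        # character-level model: every remaining character is its own token
        return list(line)
    return line.split()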
def text_file():
    words = read_words(config.words_file)
    words = list(set(words))  # de-duplicate before processing
    pbar = tqdm(words)
    for word in pbar:
        res = package_card(word)
        # log words that could not be packaged so they can be retried later
        if not res and config.failed_words_file:
            with open(config.failed_words_file, mode="a", encoding="utf-8") as f:
                f.write(word + "\n")
    col.close()
def train():
    result_subdir = create_result_subdir(result_dir)

    # Load real/fake word lists and drop words that yield empty char sequences
    real_words = read_words(real_words_path)
    fake_words = read_words(fake_words_path)
    real_words = [
        word for word in [convert_to_char_seq(word) for word in real_words]
        if word != []
    ]
    fake_words = [
        word for word in [convert_to_char_seq(word) for word in fake_words]
        if word != []
    ]

    words = real_words + fake_words
    words = pad_words(words)
    words = np.array(words)[:, :, np.newaxis]
    print(words.shape)
    labels = np.concatenate(
        [np.ones(len(real_words)), np.zeros(len(fake_words))])

    words_train, words_val, labels_train, labels_val = train_test_split(
        words, labels, test_size=0.2, random_state=42)

    model = simple_model()
    opt = Adam(0.01)
    model.compile(loss=binary_crossentropy,
                  optimizer=opt,
                  metrics=[binary_accuracy])
    model.summary()

    checkpoint = ModelCheckpoint(
        os.path.join(result_subdir, 'model.{epoch:03d}-{val_loss:.2f}.h5'),
        monitor='val_loss')
    model.fit(words_train, labels_train,
              batch_size=32,
              epochs=10,
              verbose=1,
              validation_data=(words_val, labels_val),
              callbacks=[checkpoint])  # pass the checkpoint so epoch models are actually saved
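# simple_model() is referenced above but not defined in this snippet. A minimal
# Keras sketch that matches the (max_len, 1) padded character-sequence input and
# the binary real/fake labels; the layer choices and the tf.keras import path
# are assumptions, not the project's actual model:
from tensorflow.keras.layers import Conv1D, Dense, GlobalMaxPooling1D, Input
from tensorflow.keras.models import Model


def simple_model(max_len=None):
    inputs = Input(shape=(max_len, 1))           # padded char-code sequence
    x = Conv1D(64, 3, activation='relu')(inputs)
    x = GlobalMaxPooling1D()(x)
    outputs = Dense(1, activation='sigmoid')(x)  # real vs. fake word
    return Model(inputs, outputs)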
def load_by_parsing(self, save=False, verbose=True):
    '''Read the vocab from the dataset'''
    if verbose:
        print('Loading vocabulary by parsing...')
    fnames = Path(cfg.data_path).glob('*.txt')
    for fname in fnames:
        if verbose:
            print(fname)
        with fname.open('r') as f:
            for line in f:
                for word in utils.read_words(line, chars=cfg.char_model):
                    if word not in self.vocab_lookup:
                        self.vocab_lookup[word] = len(self.vocab)
                        self.vocab.append(word)
    if verbose:
        print('Vocabulary loaded, size:', len(self.vocab))
conf_file = "" for o, a in opts: if o == "-c": conf_file = a if conf_file == "": usage() sys.exit(2) props = utils.read_properties(conf_file) if not "WORDLIST" in props: sys.stderr.write("[ERROR] WORDLIST file not defined\n") sys.exit(2) words = utils.read_words(props["WORDLIST"]) results_dir = os.getcwd() if "RESULTSDIR" in props: results_dir = props["RESULTSDIR"] if "CLASSIFIER" not in props: sys.stderr.write("[ERROR] Incorrect CLASSIFIER\n") sys.exit(2) class_name = props["CLASSIFIER"] if not class_name in cl_type: sys.stderr.write("[ERROR] Classifier type not defined\n") sys.stderr.write("\tAvailable classifiers " + cl_type) sys.exit(2)
def train(gpu, args):
    rank = args.nr * args.gpus + gpu
    dist.init_process_group(backend='nccl',
                            init_method='env://',
                            world_size=args.world_size,
                            rank=rank)
    torch.manual_seed(0)

    # Build the vocabulary from the 1-billion-word benchmark corpus
    words = read_words(
        '/users/PAS1588/liuluyu0378/lab1/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled',
        seq_len, kernel[0])
    word_counter = collections.Counter(words).most_common(vocab_size - 1)
    vocab = [w for w, _ in word_counter]
    w2i = dict((w, i) for i, w in enumerate(vocab, 1))
    w2i['<unk>'] = 0  # index 0 is reserved for out-of-vocabulary words
    print('vocab_size', vocab_size)
    print('w2i size', len(w2i))

    # Map words to ids and split into train/test batches
    data = [w2i[w] if w in w2i else 0 for w in words]
    data = create_batches(data, batch_size, seq_len)
    split_idx = int(len(data) * 0.8)
    training_data = data[:split_idx]
    test_data = data[split_idx:]
    print('train samples:', len(training_data))
    print('test samples:', len(test_data))

    model = GatedCNN(seq_len, vocab_size, embd_size, n_layers, kernel,
                     out_chs, res_block_count, vocab_size)
    torch.cuda.set_device(gpu)
    model.cuda(gpu)

    # define loss function (criterion) and optimizer
    # (these two are overridden by NLLLoss/Adadelta below; kept as in the
    # original script)
    criterion = nn.CrossEntropyLoss().cuda(gpu)
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)

    # Wrap the model for multi-process training; DistributedDataParallel (not
    # DataParallel) is the wrapper that matches init_process_group above
    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    print("model transferred")
    optimizer = torch.optim.Adadelta(model.parameters())
    loss_fn = nn.NLLLoss()

    # Data loading code: each rank sees its own shard of the training batches
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        training_data, num_replicas=args.world_size, rank=rank)
    train_loader = torch.utils.data.DataLoader(dataset=training_data,
                                               batch_size=batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler)

    start = datetime.now()
    total_step = len(train_loader)
    print("loaded")
    for epoch in range(args.epochs):
        a = time.time()
        print('----epoch', epoch)
        train_sampler.set_epoch(epoch)  # reshuffle this rank's shard each epoch
        for batch_ct, (X, Y) in enumerate(train_loader):
            X = to_var(torch.LongTensor(X))  # (bs, seq_len)
            Y = to_var(torch.LongTensor(Y))  # (bs,)
            pred = model(X)  # (bs, ans_size)
            loss = loss_fn(pred, Y)
            if batch_ct % 100 == 0:
                print('loss: {:.4f}'.format(loss.item()))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        b = time.time()
        print('current performance at epoch', epoch, "time:", b - a)
    if gpu == 0:
        print("Training complete in: " + str(datetime.now() - start))
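# train(gpu, args) above expects one process per GPU, with MASTER_ADDR and
# MASTER_PORT set before dist.init_process_group(init_method='env://'). A
# minimal launcher sketch for that pattern; the argument names follow the usage
# above, while the address, port, and defaults are assumptions:
import argparse
import os

import torch.multiprocessing as mp


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--nodes', type=int, default=1)
    parser.add_argument('--gpus', type=int, default=1, help='GPUs per node')
    parser.add_argument('--nr', type=int, default=0, help='rank of this node')
    parser.add_argument('--epochs', type=int, default=10)
    args = parser.parse_args()

    args.world_size = args.gpus * args.nodes
    os.environ['MASTER_ADDR'] = '127.0.0.1'  # assumption: single-machine run
    os.environ['MASTER_PORT'] = '5446'
    # spawn one training process per local GPU; each gets its gpu index first
    mp.spawn(train, nprocs=args.gpus, args=(args,))


if __name__ == '__main__':
    main()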
# device = torch.device('cuda' if args.cuda else 'cpu')
# mp.set_start_method('spawn')
# distributed_mode = True
# gpu_devices = ','.join([str(id) for id in world_size])
# os.environ["CUDA_VISIBLE_DEVICES"] = gpu_devices
# os.environ['MASTER_ADDR'] = '127.0.0.1'
# os.environ['MASTER_PORT'] = '5446'
# dist.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank)
#   world_size (int, optional) - Number of processes participating in the job
#   init_method (str, optional) - URL specifying how to initialize the process group.
#       Default is "env://" if no init_method or store is specified. Mutually exclusive with store.
# setup()

words = read_words(
    '/users/PAS1588/liuluyu0378/lab1/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled',
    seq_len, kernel[0])
word_counter = collections.Counter(words).most_common(vocab_size - 1)
vocab = [w for w, _ in word_counter]
w2i = dict((w, i) for i, w in enumerate(vocab, 1))
w2i['<unk>'] = 0
print('vocab_size', vocab_size)
print('w2i size', len(w2i))

data = [w2i[w] if w in w2i else 0 for w in words]
data = create_batches(data, batch_size, seq_len)
split_idx = int(len(data) * 0.8)
training_data = data[:split_idx]
test_data = data[split_idx:]
print('train samples:', len(training_data))
print('test samples:', len(test_data))
import collections

import torch
import torch.nn as nn

from utils import read_words, create_batches, to_var
from gated_cnn import GatedCNN

# Hyperparameters for the gated CNN language model
vocab_size = 2000
seq_len = 21
embd_size = 200
n_layers = 10
kernel = (5, embd_size)
out_chs = 64
res_block_count = 5
batch_size = 64

words = read_words('./data', seq_len, kernel[0])
word_counter = collections.Counter(words).most_common(vocab_size - 1)
vocab = [w for w, _ in word_counter]
w2i = dict((w, i) for i, w in enumerate(vocab, 1))
w2i['<unk>'] = 0  # index 0 is reserved for out-of-vocabulary words
print('vocab_size', vocab_size)
print('w2i size', len(w2i))

data = [w2i[w] if w in w2i else 0 for w in words]
data = create_batches(data, batch_size, seq_len)
split_idx = int(len(data) * 0.8)
training_data = data[:split_idx]
test_data = data[split_idx:]
print('train samples:', len(training_data))
print('test samples:', len(test_data))
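# create_batches() comes from utils and is not shown. From the way batches are
# consumed in the training loop (X of shape (bs, seq_len) and one next-word
# target Y per sample), a plausible sliding-window batcher is sketched below;
# the exact windowing used by the real helper is an assumption:
def create_batches(data, batch_size, seq_len):
    """Group word ids into (context, next-word) sample batches."""
    samples = [(data[i:i + seq_len], data[i + seq_len])
               for i in range(0, len(data) - seq_len, seq_len)]
    batches = []
    for b in range(0, len(samples) - batch_size + 1, batch_size):
        chunk = samples[b:b + batch_size]
        X = [x for x, _ in chunk]  # (batch_size, seq_len) contexts
        Y = [y for _, y in chunk]  # (batch_size,) next-word targets
        batches.append((X, Y))
    return batches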