Example #1
def read_lines(self, fnames):
    '''Read single lines from data'''
    for fname in fnames:
        with fname.open('r') as f:
            for line in f:
                yield self.vocab.lookup([
                    w for w in utils.read_words(line, chars=cfg.char_model)
                ])
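For context, a minimal self-contained sketch of the pattern this method relies on is shown below; the bodies of `read_words` and `Vocab.lookup` are assumptions inferred from how the snippet calls them, not the actual `utils` or vocab implementations.

from pathlib import Path

def read_words(line, chars=False):
    # Assumed behavior: character tokens when chars=True, whitespace words otherwise
    return list(line.strip()) if chars else line.split()

class Vocab:
    def __init__(self):
        self.token_to_id = {'<unk>': 0}

    def lookup(self, tokens):
        # Map unknown tokens to the <unk> id (0)
        return [self.token_to_id.get(t, 0) for t in tokens]

vocab = Vocab()
for fname in Path('.').glob('*.txt'):
    with fname.open('r') as f:
        for line in f:
            ids = vocab.lookup(read_words(line))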
Example #2
def text_file():
    # Deduplicate the word list before processing
    words = read_words(config.words_file)
    words = list(set(words))
    pbar = tqdm(words)

    for word in pbar:
        res = package_card(word)
        # Log words that could not be packaged, if a failure file is configured
        if not res and config.failed_words_file:
            with open(config.failed_words_file, mode="a", encoding="utf-8") as f:
                f.write(word + "\n")
    # `col` is assumed to be a collection object opened elsewhere in the module
    col.close()
Example #3
def train():
    result_subdir = create_result_subdir(result_dir)
    real_words = read_words(real_words_path)
    fake_words = read_words(fake_words_path)
    real_words = [
        seq for seq in (convert_to_char_seq(word) for word in real_words)
        if seq
    ]
    fake_words = [
        seq for seq in (convert_to_char_seq(word) for word in fake_words)
        if seq
    ]
    words = real_words + fake_words
    words = pad_words(words)
    words = np.array(words)[:, :, np.newaxis]
    print(words.shape)
    labels = np.concatenate(
        [np.ones(len(real_words)),
         np.zeros(len(fake_words))])
    words_train, words_val, labels_train, labels_val = train_test_split(
        words, labels, test_size=0.2, random_state=42)

    model = simple_model()
    opt = Adam(0.01)
    model.compile(loss=binary_crossentropy,
                  optimizer=opt,
                  metrics=[binary_accuracy])
    model.summary()

    checkpoint = ModelCheckpoint(os.path.join(
        result_subdir, 'model.{epoch:03d}-{val_loss:.2f}.h5'),
                                 monitor='val_loss')

    model.fit(words_train,
              labels_train,
              batch_size=32,
              epochs=10,
              verbose=1,
              validation_data=(words_val, labels_val),
              callbacks=[checkpoint])
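`simple_model()` is not shown in the snippet; a minimal sketch consistent with the shapes used above (padded character sequences reshaped to `(length, 1)` via `np.newaxis`, binary labels, `binary_crossentropy`) could look like the following. The layer choices and the `max_len` constant are assumptions, not the original architecture.

from tensorflow.keras import layers, models

max_len = 32  # assumed padded sequence length

def simple_model():
    # 1D convolution over the character axis, pooled down to one sigmoid score
    return models.Sequential([
        layers.Conv1D(32, kernel_size=3, activation='relu',
                      input_shape=(max_len, 1)),
        layers.GlobalMaxPooling1D(),
        layers.Dense(1, activation='sigmoid'),
    ])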
Example #4
def load_by_parsing(self, save=False, verbose=True):
    '''Read the vocab from the dataset'''
    if verbose:
        print('Loading vocabulary by parsing...')
    fnames = Path(cfg.data_path).glob('*.txt')
    for fname in fnames:
        if verbose:
            print(fname)
        with fname.open('r') as f:
            for line in f:
                for word in utils.read_words(line, chars=cfg.char_model):
                    if word not in self.vocab_lookup:
                        self.vocab_lookup[word] = len(self.vocab)
                        self.vocab.append(word)
    if verbose:
        print('Vocabulary loaded, size:', len(self.vocab))
Example #5
    conf_file = ""
    for o, a in opts:
        if o == "-c":
            conf_file = a

    if conf_file == "":
        usage()
        sys.exit(2)
    props = utils.read_properties(conf_file)

    if not "WORDLIST" in props:
        sys.stderr.write("[ERROR] WORDLIST file not defined\n")
        sys.exit(2)
    words = utils.read_words(props["WORDLIST"])

    results_dir = os.getcwd()
    if "RESULTSDIR" in props:
        results_dir = props["RESULTSDIR"]

    if "CLASSIFIER" not in props:
        sys.stderr.write("[ERROR] Incorrect CLASSIFIER\n")
        sys.exit(2)    
    class_name = props["CLASSIFIER"]

    if class_name not in cl_type:
        sys.stderr.write("[ERROR] Classifier type not defined\n")
        sys.stderr.write("\tAvailable classifiers: " + str(cl_type) + "\n")
        sys.exit(2)
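`utils.read_properties` is project-specific; a hypothetical stand-in consistent with how `props` is used above (a dict mapping keys such as WORDLIST, RESULTSDIR and CLASSIFIER to values read from a simple key/value configuration file) might be:

def read_properties(path):
    # Parse "KEY = value" lines, skipping blanks and comments
    props = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            key, _, value = line.partition("=")
            props[key.strip()] = value.strip()
    return props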
Example #6
def train(gpu, args):
    rank = args.nr * args.gpus + gpu
    dist.init_process_group(backend='nccl',
                            init_method='env://',
                            world_size=args.world_size,
                            rank=rank)
    torch.manual_seed(0)

    words = read_words(
        '/users/PAS1588/liuluyu0378/lab1/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled',
        seq_len, kernel[0])
    word_counter = collections.Counter(words).most_common(vocab_size - 1)
    vocab = [w for w, _ in word_counter]
    w2i = dict((w, i) for i, w in enumerate(vocab, 1))
    w2i['<unk>'] = 0
    print('vocab_size', vocab_size)
    print('w2i size', len(w2i))

    data = [w2i[w] if w in w2i else 0 for w in words]
    data = create_batches(data, batch_size, seq_len)
    split_idx = int(len(data) * 0.8)
    training_data = data[:split_idx]
    test_data = data[split_idx:]
    print('train samples:', len(training_data))
    print('test samples:', len(test_data))

    model = GatedCNN(seq_len, vocab_size, embd_size, n_layers, kernel, out_chs,
                     res_block_count, vocab_size)

    torch.cuda.set_device(gpu)
    model.cuda(gpu)
    # Wrap the model for distributed training (one process per GPU)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    print("model transferred")

    # Define the loss function and optimizer
    optimizer = torch.optim.Adadelta(model.parameters())
    loss_fn = nn.NLLLoss()
    # Data loading code

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        training_data, num_replicas=args.world_size, rank=rank)
    train_loader = torch.utils.data.DataLoader(dataset=training_data,
                                               batch_size=batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler)

    start = datetime.now()
    total_step = len(train_loader)

    print("loaded")
    for epoch in range(args.epochs):
        a = time.time()
        print('----epoch', epoch)
        # Re-seed the distributed sampler so each epoch gets a different shuffle
        train_sampler.set_epoch(epoch)
        for batch_ct, (X, Y) in enumerate(train_loader):
            X = to_var(torch.LongTensor(X))  # (bs, seq_len)
            Y = to_var(torch.LongTensor(Y))  # (bs,)
            # print(X.size(), Y.size())
            # print(X)
            # print(batch_ct, X.size(), Y.size())
            pred = model(X)  # (bs, ans_size)
            # _, pred_ids = torch.max(pred, 1)
            loss = loss_fn(pred, Y)
            if batch_ct % 100 == 0:
                print('loss: {:.4f}'.format(loss.item()))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        b = time.time()
        print('epoch', epoch, 'took', b - a, 'seconds')

    if gpu == 0:
        print("Training complete in: " + str(datetime.now() - start))
Example #7
    # device = torch.device('cuda' if args.cuda else 'cpu')
    # mp.set_start_method('spawn')
    # distributed_mode = True

    # gpu_devices = ','.join([str(id) for id in world_size])
    # os.environ["CUDA_VISIBLE_DEVICES"] = gpu_devices
    # os.environ['MASTER_ADDR'] = '127.0.0.1'
    # os.environ['MASTER_PORT'] = '5446'
    # dist.init_process_group(backend='nccl',init_method='env://', world_size=world_size, rank=rank)

    # world_size (int, optional) – Number of processes participating in the job
    # init_method (str, optional) – URL specifying how to initialize the process group. Default is “env://” if no init_method or store is specified. Mutually exclusive with store.
    # setup()

    words = read_words(
        '/users/PAS1588/liuluyu0378/lab1/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled',
        seq_len, kernel[0])
    word_counter = collections.Counter(words).most_common(vocab_size - 1)
    vocab = [w for w, _ in word_counter]
    w2i = dict((w, i) for i, w in enumerate(vocab, 1))
    w2i['<unk>'] = 0
    print('vocab_size', vocab_size)
    print('w2i size', len(w2i))

    data = [w2i[w] if w in w2i else 0 for w in words]
    data = create_batches(data, batch_size, seq_len)
    split_idx = int(len(data) * 0.8)
    training_data = data[:split_idx]
    test_data = data[split_idx:]
    print('train samples:', len(training_data))
    print('test samples:', len(test_data))
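`create_batches` is likewise project-specific; a hypothetical stand-in consistent with how its result is consumed (a list of `(X, Y)` batches where `X` holds `seq_len` context ids and `Y` the id that follows each context) could be:

import numpy as np

def create_batches(ids, batch_size, seq_len):
    # Build next-word-prediction pairs, then group them into fixed-size batches
    xs = [ids[i:i + seq_len] for i in range(len(ids) - seq_len)]
    ys = [ids[i + seq_len] for i in range(len(ids) - seq_len)]
    return [(np.array(xs[i:i + batch_size]), np.array(ys[i:i + batch_size]))
            for i in range(0, len(xs) - batch_size + 1, batch_size)]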
Example #8
import collections

import torch
import torch.nn as nn

from utils import read_words, create_batches, to_var
from gated_cnn import GatedCNN

vocab_size = 2000
seq_len = 21
embd_size = 200
n_layers = 10
kernel = (5, embd_size)
out_chs = 64
res_block_count = 5
batch_size = 64

words = read_words('./data', seq_len, kernel[0])
word_counter = collections.Counter(words).most_common(vocab_size - 1)
vocab = [w for w, _ in word_counter]
w2i = dict((w, i) for i, w in enumerate(vocab, 1))
w2i['<unk>'] = 0
print('vocab_size', vocab_size)
print('w2i size', len(w2i))

data = [w2i[w] if w in w2i else 0 for w in words]
data = create_batches(data, batch_size, seq_len)
split_idx = int(len(data) * 0.8)
training_data = data[:split_idx]
test_data = data[split_idx:]
print('train samples:', len(training_data))
print('test samples:', len(test_data))
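Note that `read_words('./data', seq_len, kernel[0])` here takes a directory path rather than a single line as in Example #1; a hypothetical stand-in matching this signature (the padding scheme is a guess based on the kernel-width argument, and `seq_len` is accepted only for compatibility) might be:

import os

def read_words(data_dir, seq_len, kernel_width):
    # Collect whitespace tokens from every file under data_dir; pad each line
    # so the first real word has a full left context for the convolution.
    # seq_len is unused in this sketch.
    words = []
    for name in sorted(os.listdir(data_dir)):
        with open(os.path.join(data_dir, name), "r", encoding="utf-8") as f:
            for line in f:
                words.extend(["<pad>"] * (kernel_width - 1) + line.split())
    return words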