Example #1
def main():
  print('Invoked as:', ' '.join(sys.argv), file=sys.stderr)
  parser = argparse.ArgumentParser()
  parser.add_argument('model')
  parser.add_argument('train_corpus')
  parser.add_argument('test_corpus')
  parser.add_argument('--layers', type=int, default=1)
  parser.add_argument('--hidden_dim', type=int, default=128)
  parser.add_argument('--minibatch_size', type=int, default=1)
  parser.add_argument('--autobatch', action='store_true')
  parser.add_argument('--tied', action='store_true')
  parser.add_argument('--residual', action='store_true')
  parser.add_argument('--sent_level', action='store_true')
  args = parser.parse_args()

  vocab = Vocabulary()
  train_corpus = read_corpus(args.train_corpus, vocab)
  vocab.frozen = True
  test_corpus = read_corpus(args.test_corpus, vocab) 

  print('Vocabulary size:', len(vocab), file=sys.stderr)

  pc = dy.ParameterCollection()
  model = TopDownDepLM(pc, vocab, args.layers, args.hidden_dim, args.hidden_dim, args.tied, args.residual)
  pc.populate_from_textfile(args.model)
  print('Total parameters:', pc.parameter_count(), file=sys.stderr)

  run_test_set(model, test_corpus, args)
Example #2
def load_dataset(path):
    charset = Charset()

    vocab = Vocabulary()
    vocab.load(f"{path}/vocab.txt")

    measure_type = get_measure_type(path)

    tag_set = Index()
    if measure_type == "relations":
        tag_set.load(f"{path}/tag2id.txt")
    elif measure_type == "entities":
        tag_set.load(f"{path}/entity_labels.txt")

    helper = Helper(vocab, tag_set, charset, measure_type=measure_type)

    # relation_labels = Index()
    # relation_labels.load(f"{path}/relation_labels.txt")

    train_data = load(f"{path}/train.pk")[:1000]
    test_data = load(f"{path}/test.pk")

    word_embeddings = np.load(f"{path}/word2vec.vectors.npy")

    return helper, word_embeddings, train_data, test_data, tag_set
Example #3
def main(_):
    hps = CLSTMDNN.get_default_hparams().parse(FLAGS.hpconfig)
    hps.num_gpus = FLAGS.num_gpus

    word_vocab = Vocabulary.from_file(os.path.join(FLAGS.vocabdir, "1b_word_vocab.txt"))
    char_vocab = Vocabulary.from_file(os.path.join(FLAGS.vocabdir, "1b_char_vocab.txt"))

    if FLAGS.mode == "train":
        hps.batch_size = 256
        data_dir = FLAGS.datadir + "/training-monolingual.tokenized.shuffled/*"
        eval_dataset = DatasetCharWord(word_vocab, char_vocab, data_dir, max_word_length=hps.word_length,
                                       deterministic=True)

        dataset = DatasetCharWord(word_vocab, char_vocab, FLAGS.datadir + "/training-monolingual.tokenized.shuffled/*",
                                  max_word_length=hps.word_length)

        run_train(dataset, eval_dataset, hps, FLAGS.logdir + "/train", ps_device="/gpu:0")
    elif FLAGS.mode.startswith("eval_"):
        hps.batch_size = 32
        if FLAGS.mode.startswith("eval_train"):
            data_dir = FLAGS.datadir + "/training-monolingual.tokenized.shuffled/*"
        else:
            data_dir = FLAGS.datadir + "/heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050"
        dataset = DatasetCharWord(word_vocab, char_vocab, data_dir, deterministic=True)
        run_eval(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps)
Example #4
def main():
    print('Invoked as:', ' '.join(sys.argv), file=sys.stderr)
    parser = argparse.ArgumentParser()
    parser.add_argument('train_corpus')
    parser.add_argument('dev_corpus')
    parser.add_argument('--layers', type=int, default=1)
    parser.add_argument('--hidden_dim', type=int, default=128)
    parser.add_argument('--minibatch_size', type=int, default=1)
    parser.add_argument('--autobatch', action='store_true')
    parser.add_argument('--tied', action='store_true')
    parser.add_argument('--residual', action='store_true')
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--output', type=str, default='')
    harness.add_optimizer_args(parser)
    args = parser.parse_args()

    if args.output == '':
        args.output = '/tmp/model%d' % random.randint(0, 0xFFFF)
    print('Output file:', args.output, file=sys.stderr)

    vocab = Vocabulary()
    train_corpus = read_corpus(args.train_corpus, vocab)
    vocab.frozen = True
    dev_corpus = read_corpus(args.dev_corpus, vocab)

    print('Vocabulary size:', len(vocab), file=sys.stderr)

    pc = dy.ParameterCollection()
    optimizer = harness.make_optimizer(args, pc)
    model = TopDownDepLM(pc, vocab, args.layers, args.hidden_dim,
                         args.hidden_dim, args.tied, args.residual)
    print('Total parameters:', pc.parameter_count(), file=sys.stderr)

    harness.train(model, train_corpus, dev_corpus, optimizer, args)
Example #5
    def __init__(self, scene_images_dir, object_images_dir, dataset, vocab):
        self.last_index = 0
        self.vocab = Vocabulary()
        self.vocab.load(vocab)
        self.lemmatizer = WordNetLemmatizer()
        self.dataset = File(dataset, 'r')
        self.questions = self.dataset['questions']
        self.answers = self.dataset['answers']
        self.image_indices = self.dataset['image_indices']
        self.images = self.dataset['images']
        # self.dataset_size = 100000
        self.dataset_size = self.questions.shape[0]
        self.object_images = dict()
        self.object_classes = []
        self.scene_images = dict()
        self.scene_classes = []
        for object_class in listdir(object_images_dir):
            self.object_classes.append(self.lemmatizer.lemmatize(object_class))
        for scene_class in listdir(scene_images_dir):
            self.scene_classes.append(self.lemmatizer.lemmatize(scene_class))
        for object_class in listdir(object_images_dir):
            object_class_dir = join(object_images_dir, object_class)
            self.object_images[object_class] = [
                join(object_class_dir, f) for f in listdir(object_class_dir)
            ]
        for scene_class in listdir(scene_images_dir):
            scene_class_dir = join(scene_images_dir, scene_class)
            self.scene_images[scene_class] = [
                join(scene_class_dir, f) for f in listdir(scene_class_dir)
            ]
Example #6
def init_word_embedding(embedding_paths):
    print('Init word embedding from: ', ', '.join(embedding_paths))

    lines = []
    for embedding_path in embedding_paths:
        embedding_path = os.path.join(DIR_PATH, embedding_path)

        with open(embedding_path, encoding='utf-8') as file:
            lines += file.read().strip().split('\n')

    tokens_of_lines = [l.strip().split(' ') for l in lines]
    words = [l[0] for l in tokens_of_lines]
    weight = [[float(str_emb) for str_emb in l[1:]] for l in tokens_of_lines]

    voc = Vocabulary(words)
    print('Vocabulary size:', voc.size())

    # also init the embedding for special tokens
    while len(weight) < voc.size():
        embedding_len = len(weight[0])
        weight.append([0] * embedding_len)

    weight = torch.FloatTensor(weight)

    return voc, weight
Example #7
def main():
    args = parse_arguments()
    n_vocab = params.n_vocab
    n_layer = params.n_layer
    n_hidden = params.n_hidden
    n_embed = params.n_embed
    n_batch = args.n_batch
    temperature = params.temperature
    train_path = params.train_path
    assert torch.cuda.is_available()

    print("loading_data...")
    # When training, load the already-built vocabulary if one exists
    if os.path.exists("vocab.json"):
        vocab = Vocabulary()
        with open('vocab.json', 'r') as fp:
            vocab.stoi = json.load(fp)

        for key, value in vocab.stoi.items():
            vocab.itos.append(key)
    else:
        vocab = build_vocab(train_path, n_vocab)
        # save vocab
        with open('vocab.json', 'w') as fp:
            json.dump(vocab.stoi, fp)

    train_X, train_y, train_K = load_data(train_path, vocab)
    train_loader = get_data_loader(train_X, train_y, train_K, n_batch)
    print("successfully loaded")

    encoder = Encoder(n_vocab, n_embed, n_hidden, n_layer, vocab).cuda()
    Kencoder = KnowledgeEncoder(n_vocab, n_embed, n_hidden, n_layer,
                                vocab).cuda()
    manager = Manager(n_hidden, n_vocab, temperature).cuda()
    decoder = Decoder(n_vocab, n_embed, n_hidden, n_layer, vocab).cuda()

    if args.restore:
        encoder = init_model(encoder, restore=params.encoder_restore)
        Kencoder = init_model(Kencoder, restore=params.Kencoder_restore)
        manager = init_model(manager, restore=params.manager_restore)
        decoder = init_model(decoder, restore=params.decoder_restore)

    # TODO: all embeddings are currently independent; weights could be shared by direct assignment, as in the Transformer reference implementation:
    #if emb_src_trg_weight_sharing:
    #   self.encoder.src_word_emb.weight = self.decoder.trg_word_emb.weight

    model = [encoder, Kencoder, manager, decoder]
    parameters = list(encoder.parameters()) + list(Kencoder.parameters()) + \
                 list(manager.parameters()) + list(decoder.parameters())
    optimizer = optim.Adam(parameters, lr=args.lr)

    # pre_train knowledge manager
    print("start pre-training")
    pre_train(model, optimizer, train_loader, args)
    print("start training")
    train(model, optimizer, train_loader, args)

    # save final model
    save_models(model, params.all_restore)
Example #8
def create_vocab(words):

    vocab = Vocabulary()

    for i, w in enumerate(words):
        vocab.add_word(w)

    return vocab
Example #9
    def __init__(self):
        self.vocab = Vocabulary()
        self.ans = {}
        for line in open("../data/train_answer.csv"):
            line = line.strip().split(',')
            self.ans[line[0]] = int(line[1])

        print("*** Finish building vocabulary")
Example #10
    def __init__(self):
        self.vocab = Vocabulary()
        self.ans = {}
        for line in open(
                "/home/share/liyongqi/ChID/raw_data/train_answer.csv"):
            line = line.strip().split(',')
            self.ans[line[0]] = int(line[1])

        print("*** Finish building vocabulary")
Example #11
def load_vocab(vocab_path):
    """Load Vocabulary object from a pickle file.
    Args:
        vocab_path: The location of the vocab pickle file.
    Returns:
        A Vocabulary object.
    """
    vocab = Vocabulary()
    vocab.load(vocab_path)
    return vocab
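The Vocabulary class is defined elsewhere in each of these projects, and its API differs from example to example. Purely as a hypothetical sketch of the pickle-backed interface that load_vocab() above relies on (a no-argument constructor, add_word, __len__ and an instance-level load), something like the following would do:

import pickle

class Vocabulary:
    # Hypothetical minimal vocabulary; the real classes in these examples expose more methods.
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        # Assign the next free index to unseen words; ignore duplicates.
        if word not in self.word2idx:
            self.word2idx[word] = len(self.idx2word)
            self.idx2word.append(word)

    def __len__(self):
        return len(self.idx2word)

    def load(self, path):
        # Restore state from a pickled Vocabulary instance on disk.
        with open(path, 'rb') as f:
            self.__dict__.update(pickle.load(f).__dict__)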
Example #12
    def __init__(self):
        if not os.path.exists('cache'):
            os.makedirs('cache')

        if os.path.exists("cache/vocab.pkl"):
            self.vocab = pickle.load(open("cache/vocab.pkl", "rb"))
        else:
            self.vocab = Vocabulary()
            pickle.dump(self.vocab, open("cache/vocab.pkl", "wb"), protocol=2)

        print("*** Finish building vocabulary")
def build_vocab(word_lst, size=10000):
    vocab = Vocabulary()
    vocab.add_word("<pad>")
    vocab.add_word("<start>")
    vocab.add_word("<end>")
    vocab.add_word("<unk>")

    for word in word_lst[:size-4]:
        vocab.add_word(word)
        
    return vocab
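For illustration, a hypothetical call: the slice word_lst[:size-4] reserves four slots for the special tokens added first, so the returned vocabulary holds at most size entries.

# Hypothetical usage; 'words' would normally be a frequency-sorted word list.
words = ['the', 'cat', 'sat', 'on', 'mat']
vocab = build_vocab(words, size=10000)  # <pad>, <start>, <end>, <unk> plus the corpus words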
Example #14
    def __init__(self):
        self.vocab = Vocabulary()
        self.ans = {}
        for line in open("../data/train_data.txt"):
            #line = line.strip().split(',')
            #self.ans[line[0]] = int(line[1])

            line_json = json.loads(line)
            self.ans[line_json['content']] = int(line_json['realCount']) 

        print("*** Finish building vocabulary")
Example #15
def build_vocab(f_corpus: str, f_vocab: str, min_frequency: int, max_len: int):
    """
    Build count-based vocabulary class
    
    Args:
        f_corpus      (str): Corpus file used to extract vocabulary
        f_vocab       (str): Text file to store extracted vocabulary
        min_frequency (int): Word's minimum frequency
        max_len       (int): Maximum sentence length used to zero padding
    
    Returns:
        (list, Vocabulary): the corpus lines and a vocabulary instance built from the 'vocab' file
    """

    vocab = defaultdict(int)

    corpus = open(f_corpus, 'r', encoding='utf-8')
    lines = corpus.readlines()

    for line in lines:
        line = re.sub('[^A-Za-z0-9 ]+', '', line)
        for word in line.lower().split(' '):
            vocab[word] += 1

    vocab = sorted(vocab.items(), key=(lambda x: x[1]), reverse=True)[1:]

    with open(f_vocab, 'w', encoding='utf-8') as f:
        for word in vocab:
            if word[1] >= min_frequency:
                print(f'{word[0]}\t{word[1]}', file=f)

    return lines, Vocabulary(f_vocab, max_len)
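A minimal usage sketch, assuming hypothetical file names and that the Vocabulary constructor reads the tab-separated word/count file written above:

# Hypothetical paths: extract a vocabulary from corpus.txt and persist it to vocab.txt.
lines, vocab = build_vocab('corpus.txt', 'vocab.txt', min_frequency=2, max_len=64)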
Example #16
def get_vocab_imdb(data):
    '''
    return: text.vocab.Vocabulary, each word appears at least 5 times.
    '''
    tokenized = tokenize_imdb(data)
    counter = collections.Counter([tk for st in tokenized for tk in st])
    return Vocabulary(counter, min_freq=5)
Example #17
def main(unused_argv):
    # extract the vocabulary from training sentendes
    vocabulary = Vocabulary()
    vocabulary.load_file(FLAGS.train_file_path)

    # load training data
    train_loader = DataLoader(FLAGS.train_file_path,
                              FLAGS.data_location,
                              vocabulary,
                              do_shuffle=True)
    batches_train = train_loader.batch_iterator(FLAGS.num_epochs,
                                                FLAGS.batch_size)

    # loop over training batches
    for data_train in batches_train:
        pass
Example #18
def update_model(model):

    d = model.dimensionality
    if not type(model.vocabulary) == Vocabulary:
        print "Updating vocabulary and We"
        model.vocabulary = Vocabulary(model.vocabulary)
        # Append to We
        unkn_word_embedding = np.random.uniform(-1, 1, size=(1, d))
        model.We = np.append(model.We, unkn_word_embedding, axis=0)

    if not type(model.dependency_dict) == Vocabulary:
        print "Updating dependency_dict and Wr"
        model.dependency_dict = Vocabulary(model.dependency_dict)
        unkn_relation_embedding = np.random.uniform(-1, 1, size=(1, d, d))
        model.Wr = np.append(model.Wr, unkn_relation_embedding, axis=0)

    return model
Example #19
def create_vocab(qas, threshold=4):
    counter = Counter()
    for qa in qas:
        # word_tokenize expects str, so keep the fields as text (the original .encode('utf-8') breaks under Python 3)
        question = qa['question']
        answer = qa['answer']
        qtokens = nltk.tokenize.word_tokenize(question.lower())
        atokens = nltk.tokenize.word_tokenize(answer.lower())
        counter.update(qtokens)
        counter.update(atokens)

    # If a word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Adds the words to the vocabulary.
    vocab = Vocabulary()
    for word in words:
        vocab.add_word(word)
    return vocab
Example #20
def main():
    args = parse_arguments()
    n_vocab = params.n_vocab
    n_layer = params.n_layer
    n_hidden = params.n_hidden
    n_embed = params.n_embed
    n_batch = args.n_batch
    temperature = params.temperature

    test_path = params.test_path
    vocab_path = params.vocab_path
    assert torch.cuda.is_available()


    print("loading the vocab...")
    vocab = Vocabulary()
    with open(vocab_path, 'r', encoding='utf-8') as fp:
        vocab.stoi = json.load(fp)
    for key, value in vocab.stoi.items():
        vocab.itos.append(key)

    # load data and change to id
    print("loading_data...")
    test_X, test_y, test_K = load_data(test_path, vocab)

    test_loader = get_data_loader(test_X, test_y, test_K, n_batch, False)
    print("successfully loaded test data")

    encoder = Encoder(n_vocab, n_embed, n_hidden, n_layer).cuda()
    Kencoder = KnowledgeEncoder(n_vocab, n_embed, n_hidden, n_layer).cuda()
    manager = Manager(n_hidden, n_vocab, temperature).cuda()
    decoder = Decoder(n_vocab, n_embed, n_hidden, n_layer).cuda()


    encoder = init_model(encoder, restore=params.encoder_restore)
    Kencoder = init_model(Kencoder, restore=params.Kencoder_restore)
    manager = init_model(manager, restore=params.manager_restore)
    decoder = init_model(decoder, restore=params.decoder_restore)
    print("models successfully loaded!\n")

    model = [encoder, Kencoder, manager, decoder]

    #evaluate_loss(model, 0, test_loader)
    evaluate_sample(model, vocab, test_X, test_y, test_K, test_loader)
Example #21
def _main():
    parser = argparse.ArgumentParser(description="Start a ALPR demo server.")
    parser.add_argument("--dims", help="set the sample dimentions (default: 208)", type=int, default=208)
    parser.add_argument("--threshold", help="set the positive threshold (default: 0.9)", type=float, default=0.9)
    parser.add_argument("--plt_w", help="set the max width of output plate images (default: 144)", type=int, default=144)
    parser.add_argument("--plt_h", help="set the max height of output plate images (default: 48)", type=int, default=48)
    parser.add_argument("--seq_len", help="set the max length of output sequences (default: 8)", type=int, default=8)
    parser.add_argument("--beam_size", help="set the size of beam (default: 5)", type=int, default=5)
    parser.add_argument("--addr", help="set address of ALPR server (default: 0.0.0.0)", type=str, default="0.0.0.0")
    parser.add_argument("--port", help="set port of ALPR server (default: 80)", type=int, default=80)
    parser.add_argument("--device_id", help="select device that the model using (default: 0)", type=int, default=0)
    parser.add_argument("--gpu", help="using gpu acceleration", action="store_true")
    args = parser.parse_args()

    if args.gpu:
        context = mx.gpu(args.device_id)
    else:
        context = mx.cpu(args.device_id)

    print("This is ALPR demo server", flush=True)
    wpod = WpodNet()
    wpod.load_parameters("model/wpod_net.params", ctx=context)
    vocab = Vocabulary()
    vocab.load("model/vocabulary.json")
    ocr = OcrNet((args.plt_h, args.plt_w), vocab.size(), args.seq_len)
    ocr.load_parameters("model/ocr_net.params", ctx=context)
    yolo = model_zoo.get_model('yolo3_darknet53_voc', pretrained=True, ctx=context)
    handler = config_handler(
        context = context,
        dims = args.dims,
        threshold = args.threshold,
        plt_hw = (args.plt_h, args.plt_w),
        seq_len = args.seq_len,
        beam_size = args.beam_size,
        wpod = wpod,
        vocab = vocab,
        ocr = ocr,
        yolo = yolo
    )

    httpd = http.server.HTTPServer((args.addr, args.port), handler)
    httpd.serve_forever()
Example #22
def main():
    args = parse_arguments()
    n_vocab = params.n_vocab
    n_layer = params.n_layer
    n_hidden = params.n_hidden
    n_embed = params.n_embed
    n_batch = args.n_batch
    temperature = params.temperature
    test_path = params.test_path
    assert torch.cuda.is_available()

    print("loading_data...")

    if os.path.exists("vocab.json"):
        vocab = Vocabulary()
        with open('vocab.json', 'r') as fp:
            vocab.stoi = json.load(fp)

        for key, value in vocab.stoi.items():
            vocab.itos.append(key)
    else:
        train_path = params.train_path
        vocab = build_vocab(train_path, n_vocab)

    test_X, test_y, test_K = load_data(test_path, vocab)
    test_loader = get_data_loader(test_X, test_y, test_K, n_batch)
    print("successfully loaded")

    encoder = Encoder(n_vocab, n_embed, n_hidden, n_layer).cuda()
    Kencoder = KnowledgeEncoder(n_vocab, n_embed, n_hidden, n_layer).cuda()
    manager = Manager(n_hidden, n_vocab, temperature).cuda()
    decoder = Decoder(n_vocab, n_embed, n_hidden, n_layer).cuda()

    encoder = init_model(encoder, restore=params.encoder_restore)
    Kencoder = init_model(Kencoder, restore=params.Kencoder_restore)
    manager = init_model(manager, restore=params.manager_restore)
    decoder = init_model(decoder, restore=params.decoder_restore)

    model = [encoder, Kencoder, manager, decoder]
    print("start evaluating")
    evaluate(model, test_loader)
Example #23
class DataLoaderTest(unittest.TestCase):

    def setUp(self):
        self.voc = Vocabulary()
        self.voc.load_file("./test_sentences.txt")
        self.loader = DataLoader("./test_sentences.txt",
                                 self.voc,
                                 do_shuffle=False)

    def test_loadFileIntoMemory(self):
        # loading data is done in constructor
        assert self.loader.data_num is not None
        assert self.loader.data_num.shape == (6, 30)

    def test_iterate_over_all_epochs_and_batches(self):
        batches = self.loader.batch_iterator(3, 3)
        count = 0

        for i in batches:
            count += 1
            assert i.shape == (3, 30)
        assert count == 6

    def test_each_sentence_has_bos_eos(self):
        assert np.sum(np.equal(
            self.loader.data, Vocabulary.END_SEQ)) == self.loader.data.shape[0]
        assert np.sum(
            np.equal(self.loader.data,
                     Vocabulary.INIT_SEQ)) == self.loader.data.shape[0]

    def test_load_partial_sentence_no_eos(self):
        self.loader = DataLoader("./test_sentences.txt",
                                 self.voc,
                                 do_shuffle=False,
                                 is_partial=True)
        assert np.sum(np.equal(self.loader.data, Vocabulary.END_SEQ)) == 0
        assert np.sum(
            np.equal(self.loader.data,
                     Vocabulary.INIT_SEQ)) == self.loader.data.shape[0]

if __name__ == '__main__':
    unittest.main()
Example #24
def create_mpqa():
    mpqa = read_file('raw_datasets/mpqa.all')

    # build matrices
    X, y = [], []
    for line in mpqa:
        words = line.split(' ')
        label = [0,0]
        label[int(line[0])] = 1
        sent = clean_str(line[1:])
        
        X.append(sent)
        y.append(label)

    # build vocab
    mpqa_vocab = Vocabulary(X)
    print('vocab', len(mpqa_vocab.vocab))

    # encode sents
    max_len = compute_avg_len(X) 
    for i in range(len(X)):
        X[i] = encode_sent(X[i].split(' '), mpqa_vocab.encoding, max_len)
    
    # build embeddings
    embeddings = []
    for name, (emb_vocab, emb_vectors) in embeddings_map.items():
        embedding, found = create_embeddings(
            mpqa_vocab, emb_vocab, emb_vectors, 300
        )
        embeddings.append(embedding)
        print('{} - {}'.format(name, found))
    w2v_embeddings, glove_embeddings, nb_embeddings = embeddings

    # shuffle
    X, y = np.array(X), np.array(y)
    indices = np.random.permutation(len(X))
    X, y = X[indices], y[indices]

    split_idx = int(len(X) * 0.9)
    X_train, X_valid = X[:split_idx], X[split_idx:]
    y_train, y_valid = y[:split_idx], y[split_idx:]

    print('train', X_train.shape, y_train.shape)
    print('valid', X_valid.shape, y_valid.shape)

    # save objects
    save_object('datasets/mpqa_train', (X_train, y_train))
    save_object('datasets/mpqa_valid', (X_valid, y_valid))
    save_object('datasets/mpqa_vocab', mpqa_vocab)
    save_object('datasets/mpqa_w2v_embs', w2v_embeddings)
    save_object('datasets/mpqa_glove_embs', glove_embeddings)
    save_object('datasets/mpqa_nb_embs', nb_embeddings)
Example #25
def create_mr():
    pos = read_file('raw_datasets/rt-polarity.pos')
    neg = read_file('raw_datasets/rt-polarity.neg')

    # build matrices
    X, y = [], []
    for sent in pos:
        X.append(clean_str(sent))
        y.append([0,1])
    for sent in neg:
        X.append(clean_str(sent))
        y.append([1,0])
    
    # build vocab
    mr_vocab = Vocabulary(X)
    print('vocab', len(mr_vocab.vocab))

    # encode sents
    max_seq_len = compute_avg_len(X)
    for i in range(len(X)):
        X[i] = encode_sent(X[i].split(' '), mr_vocab.encoding, max_seq_len)

    # build embeddings
    embeddings = []
    for name, (emb_vocab, emb_vectors) in embeddings_map.items():
        embedding, found = create_embeddings(
            mr_vocab, emb_vocab, emb_vectors, 300
        )
        embeddings.append(embedding)
        print('{} - {}'.format(name, found))
    w2v_embeddings, glove_embeddings, nb_embeddings = embeddings

    # shuffle
    X, y = np.array(X), np.array(y)
    indices = np.random.permutation(len(X))
    X, y = X[indices], y[indices]

    split_idx = int(len(X) * 0.9)
    X_train, X_valid = X[:split_idx], X[split_idx:]
    y_train, y_valid = y[:split_idx], y[split_idx:]

    print('train', X_train.shape, y_train.shape)
    print('valid', X_valid.shape, y_valid.shape)

    # save objects
    save_object('datasets/mr_train', (X_train, y_train))
    save_object('datasets/mr_valid', (X_valid, y_valid))
    save_object('datasets/mr_vocab', mr_vocab)
    save_object('datasets/mr_w2v_embs', w2v_embeddings)
    save_object('datasets/mr_glove_embs', glove_embeddings)
    save_object('datasets/mr_nb_embs', nb_embeddings)
Example #26
def test(images, dims, threshold, plt_hw, seq_len, no_yolo, beam, beam_size,
         context):
    print("Loading model...")
    if not no_yolo:
        yolo = model_zoo.get_model('yolo3_darknet53_voc',
                                   pretrained=True,
                                   ctx=context)
    wpod = WpodNet()
    wpod.load_parameters("model/wpod_net.params", ctx=context)
    vocab = Vocabulary()
    vocab.load("model/vocabulary.json")
    ocr = OcrNet(plt_hw, vocab.size(), seq_len)
    ocr.load_parameters("model/ocr_net.params", ctx=context)
    for path in images:
        print(path)
        raw = load_image(path)
        if no_yolo:
            detect_plate(wpod, vocab, ocr, raw, dims, threshold, plt_hw, beam,
                         beam_size, context)
        else:
            ts = time.time()
            x, _ = data.transforms.presets.yolo.transform_test(raw, short=512)
            classes, scores, bboxes = yolo(x.as_in_context(context))
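            # Rescale the box coordinates from the resized detector input (x) back to the original image (raw) resolution.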
            bboxes[0, :, 0::2] = bboxes[0, :, 0::2] / x.shape[3] * raw.shape[1]
            bboxes[0, :, 1::2] = bboxes[0, :, 1::2] / x.shape[2] * raw.shape[0]
            vehicles = [
                fixed_crop(raw, bboxes[0, i]) for i in range(classes.shape[1])
                if (yolo.classes[int(classes[0, i].asscalar())] == 'car'
                    or yolo.classes[int(classes[0, i].asscalar())] == 'bus')
                and scores[0, i].asscalar() > 0.5
            ]
            print("yolo profiling: %f" % (time.time() - ts))
            for i, raw in enumerate(vehicles):
                print("vehicle[%d]:" % i)
                detect_plate(wpod, vocab, ocr, raw, dims, threshold, plt_hw,
                             beam, beam_size, context)
Example #27
def process_corpus(lines: List[str], vocab: Vocabulary, f_output: str):
    """
    Encode each line of the corpus with the vocabulary and write the result to f_output
    
    Args:
        lines       (list): List of string which contains line of corpus
        vocab (Vocabulary): Vocabulary class instantiated using 'vocab' file
        f_output     (str): Text file to store the processed corpus
    
    Returns:
        None
    """

    with open(f_output, 'w', encoding='utf-8') as f:
        for line in lines:
            line = re.sub('[^A-Za-z0-9 ]+', '', line)
            print(vocab.encode(line), file=f)
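For illustration only (file names are hypothetical), process_corpus pairs naturally with build_vocab from Example #15:

# Hypothetical end-to-end usage combining Example #15 and Example #27.
lines, vocab = build_vocab('corpus.txt', 'vocab.txt', min_frequency=2, max_len=64)
process_corpus(lines, vocab, 'corpus.encoded.txt')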
Example #28
def main():

    train_dataframe, valid_dataframe = make_train_valid_dfs()
    train_loader = make_loaders(
        dataframe=train_dataframe,
        vocabulary=Vocabulary(freq_threshold=config.FREQ_THRESHOLD),
        transforms=get_transforms(mode="train"),
        mode="train",
    )
    vocab = train_loader.dataset.vocab
    valid_loader = make_loaders(
        dataframe=valid_dataframe,
        vocabulary=vocab,
        transforms=get_transforms(mode="valid"),
        mode="valid",
    )
    encoder, decoder, encoder_optimizer, decoder_optimizer = build_model(
        vocab_size=vocab.vocab_size)
    criterion = nn.CrossEntropyLoss(ignore_index=vocab.stoi["<PAD>"])
    encoder_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        encoder_optimizer,
        factor=config.FACTOR,
        patience=config.PATIENCE,
        verbose=True)
    decoder_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        decoder_optimizer,
        factor=config.FACTOR,
        patience=config.PATIENCE,
        verbose=True)

    for epoch in range(config.EPOCHS):
        train_loss = train_one_epoch(
            train_loader,
            encoder,
            decoder,
            criterion,
            encoder_optimizer,
            decoder_optimizer,
            config.DEVICE,
        )

        # encoder_scheduler.step(valid_loss.avg)
        # decoder_scheduler.step(valid_loss.avg)

        predict(valid_loader, encoder, decoder, config.DEVICE)
Example #29
def main():
    print('Invoked as:', ' '.join(sys.argv), file=sys.stderr)
    parser = argparse.ArgumentParser()
    parser.add_argument('corpus')
    parser.add_argument('dev_corpus')
    parser.add_argument('--layers', type=int, default=1)
    parser.add_argument('--emb_dim', type=int, default=128)
    parser.add_argument('--hidden_dim', type=int, default=128)
    parser.add_argument('--minibatch_size', type=int, default=1)
    parser.add_argument('--tied', action='store_true')
    parser.add_argument('--autobatch', action='store_true')
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--output', type=str, default='')
    harness.add_optimizer_args(parser)
    args = parser.parse_args()

    if args.output == '':
        args.output = '/tmp/model%d' % random.randint(0, 0xFFFF)
    print('Output file:', args.output, file=sys.stderr)

    vocab = Vocabulary()
    train_corpus = read_corpus(args.corpus, vocab)
    dev_corpus = read_corpus(args.dev_corpus, vocab)
    print('Vocab size:', len(vocab), file=sys.stderr)

    with open(args.output + '.vocab', 'w') as f:
        for word in vocab.i2w:
            print(word, file=f)

    pc = dy.ParameterCollection()
    optimizer = harness.make_optimizer(args, pc)
    model = RNNLM(pc, args.layers, args.emb_dim, args.hidden_dim, len(vocab),
                  args.tied)
    print('Total parameters:', pc.parameter_count(), file=sys.stderr)

    harness.train(model, train_corpus, dev_corpus, optimizer, args)
Example #30
def main():

    train_dataframe, valid_dataframe = make_train_valid_dfs()
    train_loader = make_loaders(
        dataframe=train_dataframe,
        vocabulary=Vocabulary(freq_threshold=config.FREQ_THRESHOLD),
        transforms=get_transforms(mode="train"),
        mode="train",
    )
    vocab = train_loader.dataset.vocab
    valid_loader = make_loaders(
        dataframe=valid_dataframe,
        vocabulary=vocab,
        transforms=get_transforms(mode="valid"),
        mode="valid",
    )

    # model = CaptioningTransformer(vocab_size=vocab.vocab_size, d_model=config.D_MODEL).to(config.DEVICE)
    model = TransformerCaptioning(vocab_size=vocab.vocab_size).to(
        config.DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.LR)
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                              factor=0.8,
                                                              patience=3)
    criterion = nn.CrossEntropyLoss(ignore_index=vocab.stoi["<PAD>"])
    train_eval(
        config.EPOCHS,
        model,
        train_loader,
        valid_loader,
        criterion,
        optimizer,
        config.DEVICE,
        config,
        lr_scheduler,
    )
Example #31
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('model')
    parser.add_argument('vocab')
    parser.add_argument('corpus')
    parser.add_argument('--layers', type=int, default=1)
    parser.add_argument('--emb_dim', type=int, default=128)
    parser.add_argument('--hidden_dim', type=int, default=128)
    parser.add_argument('--minibatch_size', type=int, default=1)
    parser.add_argument('--tied', action='store_true')
    parser.add_argument('--autobatch', action='store_true')
    parser.add_argument('--sent_level', action='store_true')
    args = parser.parse_args()

    vocab = Vocabulary()
    with open(args.vocab) as f:
        for line in f:
            word = line.strip()
            vocab.convert(word)
    print('Loaded a vocabulary of size %d' % (len(vocab)))
    eos = vocab.convert('</s>')

    pc = dy.ParameterCollection()
    rnnlm = RNNLM(pc, args.layers, args.emb_dim, args.hidden_dim, len(vocab),
                  args.tied)
    pc.populate_from_textfile(args.model)
    #rnnlm, = dy.load(args.model, pc)
    print('Total parameters:', pc.parameter_count())
    """for i in range(100):
    rnnlm.new_graph()
    sampled_sent = rnnlm.sample(eos, 100)
    sampled_sent = [vocab.to_word(word_id) for word_id in sampled_sent]
    print(' '.join(sampled_sent))
    sys.stdout.flush()
  sys.exit(0)"""

    rnnlm.set_dropout(0.0)
    vocab.frozen = True
    corpus = read_corpus(args.corpus, vocab)
    run_test_set(rnnlm, corpus, args)