Example #1
def train(args):
    with open(args.param_file, 'w') as f:
        param = vars(args)
        del param['handler']
        json.dump(param, f, indent=4)

    feature_vocab = Vocabulary.load('feature.dict')
    category_vocab = Vocabulary.load('category.dict')

    data = torch.load('train.pt')

    pad = feature_vocab.get_index('<pad>')

    model = net.Classifier(feature_vocab,
                           category_vocab,
                           embedding_size=args.embedding_size,
                           embedding_path=args.embedding_path,
                           freeze_embedding=args.freeze_embedding,
                           hidden_size=args.hidden_size,
                           num_layers=args.num_layers,
                           weight_dropout=args.weight_dropout)

    if args.gpu >= 0:
        model.cuda(args.gpu)
    print(model)

    optimizer = torch.optim.AdamW(model.parameters())
    print(optimizer)

    model.train()
    optimizer.zero_grad()

    for epoch in range(args.max_epochs):
        loss_epoch = 0.
        step = 0
        for batch in torch.utils.data.DataLoader(
                data,
                batch_size=args.batch_size,
                shuffle=True,
                collate_fn=collate_fn(pad),
        ):
            optimizer.zero_grad()

            if args.gpu >= 0:
                batch = move_to_cuda(batch, args.gpu)

            loss = net.loss_fn(model, batch)

            loss.backward()
            loss_epoch += loss.item()
            del loss

            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)

            optimizer.step()

            step += 1
        print(f'epoch:{epoch+1}: loss:{loss_epoch:.5f}')
    torch.save(model.state_dict(), args.model)
    evaluate(args)
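The loop above depends on two helpers that are not shown in this snippet, collate_fn(pad) and move_to_cuda(batch, gpu). Below is a minimal sketch of what they might look like, assuming each dataset item is a dict with a 'feature' list of token ids and a 'label' key (the 'label' key does appear in Example #7; the 'feature' key is an assumption):

import torch

def collate_fn(pad):
    # Hypothetical sketch: build a collate function that right-pads every
    # sequence in the batch to the longest length using the <pad> index.
    def collate(batch):
        features = [torch.tensor(ex['feature'], dtype=torch.long) for ex in batch]
        labels = torch.tensor([ex['label'] for ex in batch], dtype=torch.long)
        padded = torch.nn.utils.rnn.pad_sequence(
            features, batch_first=True, padding_value=pad)
        return {'feature': padded, 'label': labels}
    return collate

def move_to_cuda(batch, gpu):
    # Move every tensor in the batch dict onto the given GPU device.
    return {k: v.cuda(gpu) if torch.is_tensor(v) else v
            for k, v in batch.items()}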
Example #2
class SelfVocab():

    def __init__(self, dataset):

        self.dataset = dataset

        self.vocab = Vocabulary()
        self.vocab.fromSentances(dataset.X)
        
        self.getXY()
        self.getEmbeddingsMatrix()


    def getXY(self):

        seqs_with_idx = self.vocab.docs_to_indices()
        self.X = array(seqs_with_idx, dtype=object)

        self.X = pad_sequences(self.X, maxlen=100)

        Y = [0 if y==0 else 1 for y in self.dataset.Y]
        self.Y = Y #array(Y)

    def getEmbeddingsMatrix(self):

        self.Wvec = self.vocab.getWord2VecMatrix()
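getXY pads every index sequence to a fixed length of 100. Assuming pad_sequences here is the Keras utility (keras.preprocessing.sequence.pad_sequences), its defaults pre-pad and pre-truncate with zeros, as this small check shows:

from keras.preprocessing.sequence import pad_sequences

seqs = [[3, 7, 2], [5, 1]]
print(pad_sequences(seqs, maxlen=4))
# [[0 3 7 2]
#  [0 0 5 1]]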
Example #3
def main():
    args = get_arguments()
    SETTING = Dict(
        yaml.safe_load(
            open(os.path.join('arguments', args.arg + '.yaml'),
                 encoding='utf8')))
    print(args)
    args.device = list(map(str, args.device))
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(args.device)

    # image transformer
    transform = transforms.Compose([
        transforms.Resize((SETTING.imsize, SETTING.imsize)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
        ])

    if args.dataset == 'coco':
        val_dset = CocoDset(root=SETTING.root_path, img_dir='val2017', ann_dir='annotations/captions_val2017.json', transform=transform)
    val_loader = DataLoader(val_dset, batch_size=SETTING.batch_size, shuffle=False, num_workers=SETTING.n_cpu, collate_fn=collater)

    vocab = Vocabulary(max_len=SETTING.max_len)
    vocab.load_vocab(args.vocab_path)

    imenc = ImageEncoder(SETTING.out_size, SETTING.cnn_type)
    capenc = CaptionEncoder(len(vocab), SETTING.emb_size, SETTING.out_size, SETTING.rnn_type)

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    imenc = imenc.to(device)
    capenc = capenc.to(device)

    assert args.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(args.checkpoint), flush=True)
    ckpt = torch.load(args.checkpoint, map_location=device)
    imenc.load_state_dict(ckpt["encoder_state"])
    capenc.load_state_dict(ckpt["decoder_state"])

    begin = time.time()
    dset = EmbedDset(val_loader, imenc, capenc, vocab, args)
    print("database created | {} ".format(sec2str(time.time()-begin)), flush=True)

    savedir = os.path.join("out", args.config_name)
    if not os.path.exists(savedir):
        os.makedirs(savedir, 0o777)

    image = dset.embedded["image"]
    caption = dset.embedded["caption"]
    n_i = image.shape[0]
    n_c = caption.shape[0]
    all_embeddings = np.concatenate([image, caption], axis=0)

    emb_file = os.path.join(savedir, "embedding_{}.npy".format(n_i))
    save_file = os.path.join(savedir, "{}.npy".format(SETTING.method))
    vis_file = os.path.join(savedir, "{}.png".format(SETTING.method))
    np.save(emb_file, all_embeddings)
    print("saved embeddings to {}".format(emb_file), flush=True)
    dimension_reduction(emb_file, save_file, method=SETTING.method)
    plot_embeddings(save_file, n_i, vis_file, method=SETTING.method)
Example #4
def main():

    args = get_arguments()
    SETTING = Dict(
        yaml.safe_load(
            open(os.path.join('arguments', args.arg + '.yaml'),
                 encoding='utf8')))
    print(args)
    args.device = list(map(str, args.device))
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(args.device)

    transform = transforms.Compose([
        transforms.Resize((SETTING.imsize, SETTING.imsize)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    if args.dataset == 'coco':
        val_dset = CocoDset(root=SETTING.root_path,
                            img_dir='val2017',
                            ann_dir='annotations/captions_val2017.json',
                            transform=transform)
    val_loader = DataLoader(val_dset,
                            batch_size=SETTING.batch_size,
                            shuffle=False,
                            num_workers=SETTING.n_cpu,
                            collate_fn=collater)

    vocab = Vocabulary(max_len=SETTING.max_len)
    vocab.load_vocab(args.vocab_path)

    imenc = ImageEncoder(SETTING.out_size, SETTING.cnn_type)
    capenc = CaptionEncoder(len(vocab), SETTING.emb_size, SETTING.out_size,
                            SETTING.rnn_type)

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    imenc = imenc.to(device)
    capenc = capenc.to(device)

    assert SETTING.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(
        SETTING.checkpoint),
          flush=True)
    ckpt = torch.load(SETTING.checkpoint)
    imenc.load_state_dict(ckpt["encoder_state"])
    capenc.load_state_dict(ckpt["decoder_state"])

    begin = time.time()
    dset = EmbedDset(val_loader, imenc, capenc, vocab, args)
    print("database created | {} ".format(sec2str(time.time() - begin)),
          flush=True)

    retrieve_i2c(dset, val_dset, args.image_path, imenc, transform)
    retrieve_c2i(dset, val_dset, args.output_dir, args.caption, capenc, vocab)
Example #5
def prepare_data(args):
    '''
    Prepare everything needed for training: datasets, vocabulary, label
    encoder, and the train/test data loaders.
    :param args: parsed command-line arguments
    :return: (traindata_loader, testdata_loader, trainset, testset, vocab,
              label_encoder, batch_maker)
    '''
    trainset = REDataset(args.trainset_path, double_data=args.is_double_training_data)
    testset = REDataset(args.testset_path)

    # make vocab
    vocab = Vocabulary(word_num=args.vocab_word_num)
    corpus = []
    for example in trainset:
        corpus += example[0]

    if args.vocab_include_testset:
        for example in testset:
            corpus += example[0]

    vocab.add_from_corpus(corpus)

    # make label encoder
    all_labels = []
    for example in trainset:
        all_labels.append(example[1])
    label_encoder = LabelEncoder(all_labels)

    batch_maker = BatchMaker(vocab, label_encoder, max_length=args.max_length)
    traindata_loader = DataLoader(trainset, batch_size=args.batch_size, shuffle=args.shuffle,
                                  num_workers=args.num_workers, collate_fn=batch_maker.batch_packer)
    testdata_loader = DataLoader(testset, batch_size=args.test_batch_size, shuffle=args.test_shuffle,
                                 num_workers=args.num_workers, collate_fn=batch_maker.batch_packer)

    logger.info('trainset length: %d' % len(trainset))
    logger.info('testset length: %d' % len(testset))
    logger.info('vocabulary length: %d' % len(vocab))
    logger.info('labels num: %d' % len(label_encoder))

    return (traindata_loader, testdata_loader, trainset, testset, vocab, label_encoder, batch_maker)

# dataset = REDataset(TRAINSET_PATH)
# corpus = []
# for example in dataset:
#     corpus += example[0]
#
# vocab = Vocabulary(word_num=3000)
# vocab.add_from_corpus(corpus)
# sent = dataset[0][0]
# print(sent)
# print(vocab.encode(sent))
# print(vocab.decode(vocab.encode(sent)))
# print(dataset[0][-1])
Example #6
    def __init__(self, data_dir, mode, vocab_size):

        self.df = pd.read_csv(os.path.join(data_dir, mode + '.csv'))

        self.sentences = self.df['text'].values
        self.labels = self.df['label'].values

        # Initialize dataset Vocabulary object and build our vocabulary
        self.sentences_vocab = Vocabulary(vocab_size)
        self.labels_vocab = Vocabulary(vocab_size)

        self.sentences_vocab.build_vocabulary(self.sentences)
        self.labels_vocab.build_vocabulary(self.labels, add_unk=False)
Example #7
def evaluate(args):
    feature_vocab = Vocabulary.load('feature.dict')
    category_vocab = Vocabulary.load('category.dict')

    with open(args.param_file, 'r') as f:
        params = json.load(f)

    model = net.Classifier(feature_vocab, category_vocab, **params)

    model.load_state_dict(torch.load(args.model))
    if args.gpu >= 0:
        model = model.cuda(args.gpu)

    test_data = torch.load('test.pt')
    predictions = []
    targets = []
    model.eval()
    pad = feature_vocab.get_index('<pad>')
    match = 0
    with torch.no_grad():
        for batch in torch.utils.data.DataLoader(
                test_data,
                batch_size=args.batch_size,
                shuffle=False,
                collate_fn=collate_fn(pad),
        ):

            if args.gpu >= 0:
                batch = move_to_cuda(batch, args.gpu)

            pred = torch.argmax(model(batch), dim=-1)
            target = batch['label']

            match += (pred == target).sum().item()

            predictions.extend(pred.tolist())
            targets.extend(target.tolist())

    acc = match / len(targets)
    # sklearn's precision_recall_fscore_support expects (y_true, y_pred)
    prec, rec, fscore, _ = precision_recall_fscore_support(
        targets, predictions)
    print('Acc', acc)
    print('===')
    print('Category', 'Precision', 'Recall', 'Fscore', sep='\t')
    for idx in range(len(category_vocab)):
        print(f'{category_vocab.get_item(idx)}\t'
              f'{prec[idx]:.2f}\t{rec[idx]:.2f}\t{fscore[idx]:.2f}')
    prec, rec, fscore, _ = precision_recall_fscore_support(targets,
                                                           predictions,
                                                           average='micro')
    print(f'Total\t{prec:.2f}\t{rec:.2f}\t{fscore:.2f}')
Example #8
def build_vocab(datafile, threshold):
    counter = Counter()

    with open(datafile, 'r') as f:
        data = json.load(f)

    for caption in tqdm(list(map(lambda x: x['caption'], data))):
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)

    tokens = [token for token, count in counter.items() if count >= threshold]
    vocab = Vocabulary()
    vocab.add_tokens(tokens)
    return vocab
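The frequency threshold above is the usual way to drop rare tokens before building a vocabulary. The same idea, using only the standard library and independent of this project's Vocabulary class:

from collections import Counter

captions = ["a dog runs", "a dog sleeps", "a cat sleeps"]
counter = Counter(tok for c in captions for tok in c.lower().split())
threshold = 2
kept = [tok for tok, n in counter.items() if n >= threshold]
print(kept)  # ['a', 'dog', 'sleeps']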
Example #9
def main():

    args = _parse_args()

    assert not(os.path.exists(args.model)), f"specified file already exists: {args.model}"

    with io.open(args.corpus, mode="r") as corpus:
        v = Vocabulary(table_size=int(2E7))
        v.create(corpus, [(args.n_vocab, args.n_min_freq, args.n_min_freq)])

    print(f"finished. saving models: {args.model}")
    v.save(args.model)

    # sanity check
    print("done. now execute sanity check...")
    print(f"n_vocab: {len(v)}, total_freq:{sum(v.counts)}")

    s = "Knox County Health Department is following national Centers for Disease Control and Prevention Protocol to contain infection."
    print(f"sentence: {s}")
    s_tokenized = "/".join(v.tokenize(s, remove_oov=False))
    print(f"tokenized: {s_tokenized}")
    print(f"random sampling...")
    n_sample = 100
    x = v.random_ids(n_sample)
    w, f = np.unique(list(map(v.id2word, x)), return_counts=True)
    for idx in np.argsort(f)[::-1]:
        print(f"{w[idx]} -> {f[idx]}")

    print("finished. good-bye.")
Example #10
def main():
    args = parse_args()

    transform = transforms.Compose([
        transforms.Resize(args.imsize_pre),
        transforms.CenterCrop(args.imsize),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    if args.dataset == "coco":
        val_dset = CocoDataset(
            root=args.root_path,
            split="val",
            transform=transform,
        )
    val_loader = DataLoader(
        val_dset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.n_cpu,
        collate_fn=collater,
    )

    vocab = Vocabulary(max_len=args.max_len)
    vocab.load_vocab(args.vocab_path)

    model = SPVSE(
        len(vocab),
        args.emb_size,
        args.out_size,
        args.max_len,
        args.cnn_type,
        args.rnn_type,
        pad_idx=vocab.padidx,
        bos_idx=vocab.bosidx,
    )

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    model = model.to(device)

    assert args.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(
        args.checkpoint),
          flush=True)
    ckpt = torch.load(args.checkpoint, map_location=device)
    model.load_state_dict(ckpt["model_state"])
    _ = validate(1000, val_loader, model, vocab, args)
Example #11
def main():
    args = parse_args()

    if args.dynet_seed:
        random.seed(args.dynet_seed)
        np.random.seed(args.dynet_seed)

    src_vocab = Vocabulary('<unk>', eos_symbol='</s>')
    tgt_vocab = Vocabulary('<unk>', sos_symbol='<s>', eos_symbol='</s>')
    train = list(
        read_bitext(src_vocab, tgt_vocab, args.train_src, args.train_tgt))
    src_vocab.freeze()
    tgt_vocab.freeze()
    dev = list(read_bitext(src_vocab, tgt_vocab, args.dev_src, args.dev_tgt))
    # init model
    model = Seq2SeqAtt(src_vocab, tgt_vocab, args.src_embed_dim,
                       args.tgt_embed_dim, args.enc_nlayers,
                       args.enc_hidden_dim, args.dec_nlayers,
                       args.dec_hidden_dim, args.attention_dim,
                       args.label_smoothing)
    if args.saved_model:
        model.load_model(args.saved_model)
    if args.only_decode:
        print("Reading test data...")
        test = list(
            read_bitext(src_vocab, tgt_vocab, args.test_src, args.test_tgt))
        model.translate(test, args.beam_size, args.max_output_len,
                        args.length_norm, args.output_file, args.relative,
                        args.absolute, args.local, args.candidate)
        print("Done")
    else:
        training_procedure = BasicTrainingProcedure(
            model, dy.SimpleSGDTrainer(model.pc))
        training_procedure.train(args.epochs, train, dev, args.batch_size,
                                 args.batch_size, args.max_output_len)
Example #12
    def __init__(self, embed_size: int, src_vocab: Vocabulary,
                 dst_vocab: Vocabulary):
        super(ModelEmbeddings, self).__init__()
        self.embed_size = embed_size

        # default values: padding indices taken from each vocabulary's pad token
        src_padding_idx = src_vocab.word2idx[Vocabulary.pad_token()]
        dst_padding_idx = dst_vocab.word2idx[Vocabulary.pad_token()]
        self.src_embedding = nn.Embedding(len(src_vocab),
                                          embed_size,
                                          padding_idx=src_padding_idx)
        self.dst_embedding = nn.Embedding(len(dst_vocab),
                                          embed_size,
                                          padding_idx=dst_padding_idx)
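Both embedding tables are built with padding_idx set to the index of the vocabulary's pad token. In PyTorch, nn.Embedding initializes that row to zeros and keeps its gradient at zero during training, as this quick illustration shows:

import torch
import torch.nn as nn

emb = nn.Embedding(num_embeddings=10, embedding_dim=4, padding_idx=0)
print(emb(torch.tensor([0, 3])))  # the first row (index 0) is all zeros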
Example #13
def main():
    args = parse_args()

    transform = transforms.Compose([
        transforms.Resize((args.imsize, args.imsize)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    if args.dataset == 'coco':
        val_dset = CocoDataset(root=args.root_path,
                               imgdir='val2017',
                               jsonfile='annotations/captions_val2017.json',
                               transform=transform,
                               mode='all')
    val_loader = DataLoader(val_dset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.n_cpu,
                            collate_fn=collater_eval)

    vocab = Vocabulary(max_len=args.max_len)
    vocab.load_vocab(args.vocab_path)

    imenc = ImageEncoder(args.out_size, args.cnn_type)
    capenc = CaptionEncoder(len(vocab), args.emb_size, args.out_size,
                            args.rnn_type)

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    imenc = imenc.to(device)
    capenc = capenc.to(device)

    assert args.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(
        args.checkpoint),
          flush=True)
    ckpt = torch.load(args.checkpoint)
    imenc.load_state_dict(ckpt["encoder_state"])
    capenc.load_state_dict(ckpt["decoder_state"])

    begin = time.time()
    dset = EmbedDataset(val_loader, imenc, capenc, vocab, args)
    print("database created | {} ".format(sec2str(time.time() - begin)),
          flush=True)

    retrieve_i2c(dset, val_dset, imenc, vocab, args)
    retrieve_c2i(dset, val_dset, capenc, vocab, args)
Example #14
def train():
    with open('train_config.json') as train_config_file:
        train_config = json.load(train_config_file)
    train_data_path = train_config['train_data_path']
    test_data_path = train_config['test_data_path']
    vocab_path = train_config['vocab_path']

    train_input_data, train_input_label = load_corpus(
        file_path=train_data_path, make_vocab=True, vocab_path=vocab_path)
    val_input_data, val_input_label = load_corpus(file_path=test_data_path,
                                                  make_vocab=False)

    vocab = Vocabulary(vocab_path)

    model = Spacing(vocab_len=len(vocab))

    print(model)

    trainer = Trainer(model=model,
                      vocab=vocab,
                      train_data=train_input_data,
                      train_label=train_input_label,
                      val_data=val_input_data,
                      val_label=val_input_label,
                      config=train_config)
    trainer.train(total_epoch=10, validation_epoch=1)
Example #15
    def toShakespeare(self):
        """Given a line of text, return that text in the indicated style.
        
        Args:
          modern_text: (string) The input.
          
        Returns:
          string: The translated text, if generated.
        """

        args = load_arguments()
        vocab = Vocabulary(self.vocab_path, args.embedding, args.dim_emb)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        with tf.Session(config=config) as sess:
            model = Model(args, vocab)
            model.saver.restore(sess, args.model)

            if args.beam > 1:
                decoder = beam_search.Decoder(sess, args, vocab, model)
            else:
                decoder = greedy_decoding.Decoder(sess, args, vocab, model)

            # rewrite with whichever decoder was selected above
            batch = get_batch([self.modern_text], [1], vocab.word2id)
            ori, tsf = decoder.rewrite(batch)

            out = ' '.join(w for w in tsf[0])

        return out
Example #16
def run_evaluation(corpus_dir, save_dir, datafile, config_file):
    config = Config.from_json_file(config_file)
    vocab = Vocabulary("words")

    # set checkpoint to load from; set to None if starting from scratch
    load_filename = os.path.join(
        save_dir, config.model_name, config.corpus_name,
        '{}-{}_{}'.format(config.encoder_n_layers, config.decoder_n_layers,
                          config.hidden_size), 'last_checkpoint.tar')

    # if loading on the same machine the model trained on
    checkpoint = torch.load(load_filename)
    # if loading a model trained on gpu to cpu
    # checkpoint = torch.load(load_filename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint["en"]
    decoder_sd = checkpoint["de"]
    encoder_optimizer_sd = checkpoint["en_opt"]
    decoder_optimizer_sd = checkpoint["de_opt"]
    embedding_sd = checkpoint["embedding"]
    vocab.__dict__ = checkpoint["voc_dict"]

    print("Building encoder and decoder ...")
    # initialize word embeddings
    embedding = nn.Embedding(vocab.num_words, config.hidden_size)
    embedding.load_state_dict(embedding_sd)

    # initialize encoder and decoder models
    encoder = EncoderRNN(config.hidden_size, embedding,
                         config.encoder_n_layers, config.dropout)
    decoder = LuongAttnDecoderRNN(config.attn_model, embedding,
                                  config.hidden_size, vocab.num_words,
                                  config.decoder_n_layers, config.dropout)

    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)

    # Set dropout layers to eval mode

    encoder.eval()
    decoder.eval()

    # Initialize search module
    searcher = GreedySearchDecoder(encoder, decoder)

    # Begin chatting
    evaluate_input(encoder, decoder, searcher, vocab)
Example #17
    def read_vocabs(self, datafile, corpus_name):
        lines = open(datafile, encoding="utf-8").read().strip().split('\n')

        pairs = [[self.normalize_string(s) for s in line.split('\t')]
                 for line in lines]
        vocab = Vocabulary(corpus_name)

        return vocab, pairs
Example #18
    def __init__(self, data_path, vocab=Vocabulary(), predict=False):
        """
        Creates an object that gets data from a file.
        """
        super(Data, self).__init__(data_path, vocab)

        if not predict:
            self._train_test_split()
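Note that this constructor (and the one in Example #23) uses vocab=Vocabulary() as a default argument. Python evaluates default values once, at function definition time, so every instance constructed without an explicit vocab shares the same Vocabulary object. The same pitfall with a plain list:

def append_one(items=[]):
    # the default list is created once and reused across calls
    items.append(1)
    return items

print(append_one())  # [1]
print(append_one())  # [1, 1] -- the same list object as the first call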
Example #19
def load_word_data(questions_df, image_captions, exclude_word_list):
    vocab = Vocabulary()
    answers = Vocabulary(first_word="RELEVANT")
    question_seq_length = 1
    caption_seq_length = 1

    print "Generating vocabulary and answer indices..."
    new_questions = []
    for _, row in questions_df.iterrows():
        question_words = row['question'].split(' ')

        if len(question_words) > question_seq_length:
            question_seq_length = len(question_words)

        all_words = question_words

        image_file = row['image_file']
        if image_file in image_captions:
            caption = image_captions[image_file]
            caption_words = caption.split(' ')
            if len(caption_words) > caption_seq_length:
                caption_seq_length = len(caption_words)
            all_words += caption_words

        for word in all_words:
            if len(word) > 0 and word not in exclude_word_list:
                vocab.add_word(word)
        # if row['relevant'] == 0:
        answers.add_word(row['answer'])

    print('\tVocab count: [%d]' % len(vocab))
    print('\tAnswers count: [%d]' % len(answers))
    print('\tQuestion sequence length: [%d]' % question_seq_length)
    print('\tCaption sequence length: [%d]' % caption_seq_length)

    print("Loading word vectors...")
    word_to_vector = load_word_vectors(word_vectors_file, vocab)

    print('Creating embedding matrix...')
    embedding_matrix = np.zeros((len(vocab), embedding_dim))

    words_not_found = []
    for word, i in vocab.word_index.items():
        if word not in word_to_vector:
            words_not_found.append(word)
            continue
        embedding_vector = word_to_vector[word]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    if len(words_not_found) > 0:
        print "Words not found:", "\n\t", words_not_found
        for word in words_not_found:
            del vocab.index_word[vocab.word_index[word]]

    return vocab, answers, embedding_matrix, word_to_vector, question_seq_length, caption_seq_length
Example #20
    def from_serializable(cls, contents):
        token_vocab = TokenVocabulary.from_serializable(
            contents["token_vocab"])
        tag_vocab = Vocabulary.from_serializable(contents["tag_vocab"])

        return cls(token_vocab=token_vocab,
                   tag_vocab=tag_vocab,
                   max_seq_len=contents["max_seq_len"])
Example #21
def create_vocab(qas, threshold=4):
    counter = Counter()
    for qa in qas:
        question = qa['question']
        answer = qa['answer']
        qtokens = nltk.tokenize.word_tokenize(question.lower())
        atokens = nltk.tokenize.word_tokenize(answer.lower())
        counter.update(qtokens)
        counter.update(atokens)

    # If a word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Adds the words to the vocabulary.
    vocab = Vocabulary()
    for word in words:
        vocab.add_word(word)
    return vocab
Example #22
    def __init__(self, dataset):

        self.dataset = dataset

        self.vocab = Vocabulary()
        self.vocab.fromSentances(dataset.X)
        
        self.getXY()
        self.getEmbeddingsMatrix()
Example #23
    def __init__(self, data_path, vocab=Vocabulary()):
        self.vocab = vocab

        data = get_requests_from_file(data_path)
        print("Downloaded {} samples".format(len(data)))

        map_result = list(map(self._process_request, data))  # materialize so it can be iterated twice
        self.data = [x[0] for x in map_result]
        self.lengths = [x[1] for x in map_result]
        assert len(self.data) == len(self.lengths)
Example #24
def main():

    args = _parse_args()

    assert not(os.path.exists(args.model)), f"specified file already exists: {args.model}"

    pprint(args.__dict__)

    vocab_params = {
        "power":0.75
    }
    vocab = Vocabulary.load(args.vocab, **vocab_params)
    n_vocab = len(vocab)
    print(f"vocabulary size: {n_vocab}")

    kwargs = {} if args.kwargs is None else json.loads(args.kwargs)
    pprint(kwargs)

    init_params = {
        'mu0': 0.1,
        'sigma_mean0': 1.0,
        'sigma_std0': 0.01
    }
    model_params = {
        "mu_max":1.0,
        "sigma_min":0.1,
        "sigma_max":10.0,
        "eta":0.01,
        "Closs":4.0
    }

    print("start training...")
    model = GaussianEmbedding(n_vocab, args.n_dim, covariance_type=args.cov_type, energy_type="KL", init_params=init_params, **model_params)
    with io.open(args.corpus, mode="r") as corpus:
        it = iter_pairs(corpus, vocab, batch_size=20, nsamples=20, window=5)
        model.train(it, n_workers=args.n_thread)

    print(f"finished. saving models: {args.model}")
    model.save(args.model)

    # sanity check
    print("done. now execute sanity check...")

    def ln_det_sigma(word):
        vec_sigma = model.sigma[vocab.word2id(word)]
        return np.sum(np.log(vec_sigma))
    
    w = "food"
    print(f"word: {w}")
    lst_result = model.nearest_neighbors(w, vocab=vocab, sort_order="sigma", num=100)
    df_result = pd.DataFrame(lst_result)
    df_result["sigma_ln_det"] = df_result["word"].map(ln_det_sigma)
    print(df_result.sort_values(by="sigma_ln_det", ascending=False).head(n=10))

    print("finished. good-bye.")
Example #25
def convert_to_str(
    tensor: np.ndarray,
    vocab: Vocabulary,
) -> List[List[str]]:
    output = []
    for batch in range(len(tensor)):
        curr = []
        for idx in range(len(tensor[batch])):
            curr.append(vocab.idx2word(tensor[batch, idx]))
        output.append(curr)
    return output
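A usage sketch for convert_to_str, assuming the snippet's own imports (numpy as np, typing.List, and the project's Vocabulary) are in scope; the toy vocabulary below is a hypothetical stand-in whose idx2word maps an id back to its token:

class ToyVocab:
    # hypothetical stand-in for the project's Vocabulary
    words = ["<pad>", "a", "dog", "runs"]
    def idx2word(self, idx):
        return self.words[int(idx)]

ids = np.array([[1, 2, 3], [1, 3, 0]])
print(convert_to_str(ids, ToyVocab()))
# [['a', 'dog', 'runs'], ['a', 'runs', '<pad>']]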
Example #26
def load_or_create_vocab(trainDataset=None, testDataset=None):
    if os.path.exists(VOCAB_FILE):
        print("loading vocab")
        vocab = torch.load(VOCAB_FILE)
        print("vocab loaded")
        return vocab
    else:
        # build the vocabulary from both splits' annotations
        Texts = list(trainDataset.anns.values()) + list(testDataset.anns.values())
        vocab = Vocabulary()
        vocab.create_from_texts(Texts)
        return vocab
Example #27
    def __init__(self, csv_path, image_path, transform=None, batch_size=4):
        self.captionsfile = pd.read_csv(csv_path)
        self.image_path = image_path
        self.transform = transform
        self.vocab = Vocabulary(vocab_threshold=2)
        self.batch_size = batch_size
        all_tokens = [
            nltk.tokenize.word_tokenize(
                str(self.captionsfile.iloc[index, 2]).lower())
            for index in range(len(self.captionsfile))
        ]
        self.caption_lengths = [len(token) for token in all_tokens]
Example #28
class TextDataset(Dataset):
    def __init__(self, data_dir, mode, vocab_size):

        self.df = pd.read_csv(os.path.join(data_dir, mode + '.csv'))

        self.sentences = self.df['text'].values
        self.labels = self.df['label'].values

        # Initialize dataset Vocabulary object and build our vocabulary
        self.sentences_vocab = Vocabulary(vocab_size)
        self.labels_vocab = Vocabulary(vocab_size)

        self.sentences_vocab.build_vocabulary(self.sentences)
        self.labels_vocab.build_vocabulary(self.labels, add_unk=False)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label = self.labels[idx]

        # numericalize the sentence, e.g. ['cat', 'in', 'a', 'bag'] -> [2, 3, 9, 24]
        numeric_sentence = self.sentences_vocab.sentence_to_numeric(sentence)
        numeric_label = self.labels_vocab.sentence_to_numeric(label)

        return torch.tensor(numeric_sentence), torch.tensor(numeric_label)
Example #29
def build_vocab(words):
    ''' Build vocabulary and use it to format labels. '''
    vocab = Vocabulary(words)

    # Map each word to a one-hot vector over the vocabulary.
    output_vector = []
    for word in words:
        zeros = np.zeros(len(vocab), dtype=np.float32)
        zeros[vocab[word]] = 1.0

        output_vector.append(zeros)

    return vocab, output_vector
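output_vector here is simply a list of one-hot rows. Assuming vocab[word] returns a unique index in range(len(vocab)), the same vectors can be taken directly from an identity matrix:

import numpy as np

word_index = {"ran": 0, "saw": 1, "dog": 2}   # hypothetical stand-in for vocab
eye = np.eye(len(word_index), dtype=np.float32)
output_vector = [eye[word_index[w]] for w in ["ran", "dog"]]
print(output_vector[0])  # [1. 0. 0.]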
Example #30
def inference():
    with open('train_config.json') as train_config_file:
        train_config = json.load(train_config_file)

    vocab_path = train_config['vocab_path']
    model_save_path = train_config['model_save_path']

    epoch = None
    with open(os.path.join(model_save_path, 'checkpoint.txt')) as f:
        epoch = f.readlines()[0].split(':')[1]
        print(f'Weight is loaded from best checkpoint epoch {epoch}')

    vocab = Vocabulary(vocab_path)

    model = Spacing(vocab_len=len(vocab)).eval()

    trainer = Trainer(model=model,
                      vocab=vocab,
                      config=train_config)
    trainer.load(epoch)

    while True:
        text = input('Enter input text : ')
        words = text.split()
        data = []

        for word in words:
            chars = [char for char in word]
            data.append(chars)
        sorted_data = sorted(data, key=lambda e: len(e), reverse=True)
        idx = sorted(range(len(data)), key=lambda e: len(data[e]), reverse=True)
        batch_data, batch_label, lengths = trainer.make_input_tensor(sorted_data, None)

        outputs, _ = trainer.model.forward(batch_data, lengths)
        outputs = torch.round(outputs)

        results = []
        for output, data in zip(outputs, sorted_data):
            result = ''
            for output_char, char in zip(output, data):
                if output_char == 1:
                    result += (char + ' ')
                else:
                    result += char
            results.append(result)

        sorted_result = ''
        for i in range(len(idx)):
            sorted_result += results[idx.index(i)]

        print(sorted_result)
Example #31
def ngrams(prefix):
    """
    Find n-grams and make a vocabulary from the parsed corpus
    """
    with BZ2File(prefix + 'corpus.bz2', 'r') as corpus:
        vocab = Vocabulary(build_table=False)
        vocab.create(corpus, [(75000, 350), (25000, 350), (10000, 350)])
        vocab.save(prefix + 'vocab.gz')
Example #32
    def __init__(self, data_path, train=False, longest_sequence_length=None):

        data0 = load_sent(data_path + '.0')
        data1 = load_sent(data_path + '.1')
        print(
            f'\n------------------------ Building a Dataset ------------------------'
        )
        # data0 / data1 are lists of tokenized sentences (list of list of words)
        print(f'#sents of {data_path}.0 file 0: {len(data0)}')
        print(f'#sents of {data_path}.1 file 1: {len(data1)}')

        self.data_all = data0 + data1
        self.style_list = [0 for i in data0] + [
            1 for i in data1
        ]  # data0 is all neg, data1 is all pos

        # sorting all the data according to their seq lengths in descending order
        zip_item = zip(self.data_all, self.style_list)
        sorted_item = sorted(zip_item, key=lambda p: len(p[0]), reverse=True)
        tuple_item = zip(*sorted_item)
        self.data_all, self.style_list = [list(t) for t in tuple_item]

        print(f'len(self.data_all)  : {len(self.data_all)}')
        print(f'len(self.style_list): {len(self.style_list)}')

        if train:
            print('\ntrain: True')
            if not os.path.isfile(cfg.vocab):
                print(f'{cfg.vocab} does not exist')
                print('Building Vocab...')
                build_vocab(data0 + data1, cfg.vocab)
            else:
                print(f'{cfg.vocab} already exists')

        self.vocab = Vocabulary(cfg.vocab, cfg.embedding_file, cfg.embed_dim)
        print('\nvocabulary size:', self.vocab.size)
        print(
            f'vocabulary embedding matrix shape: {self.vocab.embedding.shape}')
        # print(type(self.vocab.embedding)) # np array

        self.longest_sequence_length = longest_sequence_length

        if longest_sequence_length is None:
            self.update_the_max_length()

        print(f'self.longest_sequence_length: {self.longest_sequence_length}')
        print(
            f'--------------------------------------------------------------------'
        )
Example #33
def count(prefix):
    """
    Count the number of tokens in the corpus
    """
    vocab = Vocabulary.load(prefix + 'vocab.gz', build_table=False)
    total = 0
    ndocs = 0
    with BZ2File(prefix + 'corpus.bz2', 'r') as corpus:
        for doc in corpus:
            tokens = vocab.tokenize(doc)
            total += len(tokens)
            ndocs += 1
            if ndocs % 10000 == 0:
                logger.info("Processed %s docs." % ndocs)
    logger.info("Total of %s tokens in corpus" % total)
Example #34
input_sentence = 'the dog ran'.split()

rules = [
    ('S', ['NP', 'VP']),
    ('VP', ['V']),
    ('NP', ['DET', 'N']),
    ]
    
words = {
    'N': 'man dog'.split(),
    'DET': 'a the'.split(),
    'V': 'ran saw'.split(),
    }
    
vocab = Vocabulary(D)    

for category, items in words.items():
    for item in items:
        vocab.add(item, vocab.parse('CATEGORY*%s+TEXT*text_%s'%(category, item)))
    
sp_goal = vocab.parse('S')

    
for input in input_sentence:
    print('parsing text:', input)
    sp_lex = vocab.parse(input)
    
    category = sp_lex*vocab.parse('~CATEGORY')
    
    while True:
Example #35
clues1 = clues1[:len(clues4)]
clues2 = clues2[:len(clues4)]
print(len(clues1), len(clues2), len(clues4))
oldclues = clues2 + clues4
clues = clues2 + clues4
#vocab = Vocabulary()
#clues = []
#for i in oldclues:
    #if i not in clues:
        #clues.append(i)
#for clue in oldclues:
    #vocab.add_question(clue)
path = Path(args.foldername)
if not path.exists():
    path.mkdir()
vocab = Vocabulary.load(inputpath3 / "vocab.pkl")
vocab.save(path / "vocab.pkl")
print(vocab.number)
matrix = lil_matrix((len(clues), vocab.number + 1))
matrix = matrix.astype(np.int64)
i = 0
for clue in tqdm(clues):
    vector = vocab.translate(clue)
    matrix[i] = vector
    i += 1
print(matrix.shape)
matrix = csr_matrix(matrix)
#np.save(path / "matrix.npy", matrix)
#matrix = pickle.load(open(inputpath4 / "matrix.pkl", "rb"))
#print matrix.shape
#idx = np.arange(matrix.shape[1])
Example #36
        with open(test_file,'r') as fin:
            if hasTitle:
                fin.readline()
            for line in fin:
                lline = line.strip().split('\t')
                id_1 = self._vocab.word2id(lline[0].lower())
                id_2 = self._vocab.word2id(lline[1].lower())
                score_human.append(float(lline[2]))

                if flag == 'cosine':
                    s = cosine(self.mu[id_1,:], self.mu[id_2,:])
                elif flag == 'IP':
                    s = logIP(self.mu[id_1,:],self.sigma[id_1,:],
                              self.mu[id_2,:],self.sigma[id_2,:])

                score_model.append(s)
        coeff = stats.pearsonr(numpy.array(score_human), numpy.array(score_model))
        print(coeff)


if __name__ == '__main__':
    from vocab import Vocabulary
    import os
    work_dir = os.getcwd()
    test_file = work_dir + '/dataset/wordsim353/combined'
    vocab = Vocabulary.load(work_dir + '/dataset/vocab.new.gz')
    embed = Embedding(work_dir + '/embedding_result/Result/May5/embedding.tar.gz',
                      vocab)


Example #37
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 29 17:02:33 2016

@author: whr94621
"""

import logging
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s',
                    level=logging.INFO)

from gzip import GzipFile

from word2gauss import GaussianEmbedding, iter_pairs
from vocab import Vocabulary

import sys

vocab = Vocabulary.load(sys.argv[2])

embed = GaussianEmbedding(len(vocab), 50, covariance_type='diagonal',
                          energy_type='KL', mu_max=4.0, sigma_min=1,
                          sigma_max=2)

with GzipFile(sys.argv[1], 'r') as corpus:
    for i in range(50):
        embed.train(iter_pairs(corpus, vocab, iterations=20), n_workers=16)

embed.save('embedding.tar.gz', vocab=vocab.id2word, full=True)
Example #38
args = argparser.parse_args()
path = Path(args.foldername)
if not path.exists():
    path.mkdir()
with open("%s/clues.pkl" % args.inputname, "rb") as f:
    clues = pickle.load(f)
docs = np.load("%s/docs.npy" % args.inputname)
topics = np.load("%s/topics.npy" % args.inputname)
for i in range(int(args.numtopics)):
    print "ITERATION %d" % i
    subclues = []
    for num in np.argsort(docs.transpose()[i])[::-1]:
        if docs.transpose()[i][num] < 0.5:
            continue
        subclues.append(clues[num])
    newvocab = Vocabulary()
    for clue in subclues:
        newvocab.add_question(clue)
    print(newvocab.number)
    matrix = lil_matrix((len(subclues), newvocab.number + 1))
    matrix = matrix.astype(np.int64)
    j = 0
    for clue in subclues:
        vector = newvocab.translate(clue)
        matrix[j] = vector
        j += 1
    print(matrix.shape)
    matrix = csr_matrix(matrix)
    #svd = TruncatedSVD(n_components=700)
    #docs = svd.fit_transform(matrix)
    model = lda.LDA(n_topics=10, n_iter=10000)
Example #39
        
def print_tree(s, depth=0):
    x = label(s)
    if x is None:
        x = label_word(s)
        
        if x is not None: 
            print('  ' * depth + x)
        return
    print('  ' * depth + label(s))
    print_tree(s*vocab.parse('~L_'+x), depth+1)
    print_tree(s*vocab.parse('~R_'+x), depth+1)
        
    
    
vocab = Vocabulary(D)    

NEXT = vocab.parse('NEXT')

for category, items in words.items():
    for item in items:
        rules.append((category, [item]))
print(rules)
    
sp_goal = vocab.parse('S')

sp_tree = None
    
for input in input_sentence:
    print('parsing text:', input)