Example #1
def create_vocab(data, cfg, dataset_dir):
    print('[*] Creating word vocab')
    words = Counter()
    for m, d in data.items():
        bar = tqdm(d,
                   desc='[*] Collecting word tokens from {} data'.format(m),
                   dynamic_ncols=True)
        for dd in bar:
            words.update([w.lower() for w in dd['text']])
        bar.close()
    tokens = [w for w, _ in words.most_common(cfg.word.size)]
    word_vocab = Vocab(tokens, **cfg.word)
    word_vocab_path = (dataset_dir / 'word.pkl')
    with word_vocab_path.open(mode='wb') as f:
        pickle.dump(word_vocab, f)
    print('[-] Word vocab saved at {}\n'.format(word_vocab_path))

    print('[*] Creating char vocab')
    char_vocab = Vocab(list(string.printable), **cfg.char)
    char_vocab_path = (dataset_dir / 'char.pkl')
    with char_vocab_path.open(mode='wb') as f:
        pickle.dump(char_vocab, f)
    print('[-] Char vocab saved to {}\n'.format(char_vocab_path))

    return word_vocab, char_vocab
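A minimal driver for the function above, assuming `data` maps split names to records with a tokenized 'text' field and that `cfg.word` / `cfg.char` behave both as attribute namespaces (for `cfg.word.size`) and as mappings (for `**cfg.word`); the `AttrDict` helper and the sizes below are illustrative stand-ins, not taken from the original project:

# Sketch only: AttrDict and the config values are hypothetical stand-ins
# for whatever config object the original project passes in as `cfg`.
from pathlib import Path


class AttrDict(dict):
    """dict that also allows attribute access, so both cfg.word.size and **cfg.word work."""
    __getattr__ = dict.__getitem__


cfg = AttrDict(word=AttrDict(size=30000), char=AttrDict(size=100))
data = {
    'train': [{'text': ['The', 'quick', 'brown', 'fox']}],
    'dev': [{'text': ['jumps', 'over', 'the', 'lazy', 'dog']}],
}
dataset_dir = Path('dataset')
dataset_dir.mkdir(exist_ok=True)
word_vocab, char_vocab = create_vocab(data, cfg, dataset_dir)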
Example #2
def load_word_vectors(path):
    if os.path.isfile(path + '.pth') and os.path.isfile(path + '.vocab'):
        print('==> File found, loading to memory')
        vectors = torch.load(path + '.pth')
        vocab = Vocab(filename=path + '.vocab')
        return vocab, vectors
    # saved file not found, read from txt file
    # and create tensors for word vectors
    print('==> File not found, preparing, be patient')
    # Count lines first so the word list and vector tensor can be preallocated
    with open(path + '.txt', 'r', encoding='utf8', errors='ignore') as f:
        count = sum(1 for line in f)
    # Read the first line to get the embedding dimensionality
    with open(path + '.txt', 'r', encoding='utf8', errors='ignore') as f:
        contents = f.readline().rstrip('\n').split(' ')
        dim = len(contents[1:])
    words = [None] * (count)
    vectors = torch.zeros(count, dim, dtype=torch.float, device='cpu')
    with open(path + '.txt', 'r', encoding='utf8', errors='ignore') as f:
        idx = 0
        for line in f:
            contents = line.rstrip('\n').split(' ')
            words[idx] = contents[0]
            values = list(map(float, contents[1:]))
            vectors[idx] = torch.tensor(values, dtype=torch.float, device='cpu')
            idx += 1
    with open(path + '.vocab', 'w', encoding='utf8', errors='ignore') as f:
        for word in words:
            f.write(word + '\n')
    vocab = Vocab(filename=path + '.vocab')
    torch.save(vectors, path + '.pth')
    return vocab, vectors
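The `Vocab` class itself is not shown in these examples; its constructor is called with either a token list or `filename=`/`data=` arguments, and later snippets use `size()`, `getIndex()`, `convertToIdx()`, and `len()`. A rough, self-contained stand-in with that surface, written only to make the snippets easier to follow (it is not the project's actual implementation), could look like:

# Illustrative stand-in only: the real Vocab class behind these snippets is not
# shown here and may differ; this sketch just mirrors the calls the examples make
# (filename=/data= construction, size(), getIndex(), convertToIdx(), len()).
class Vocab:
    def __init__(self, tokens=None, filename=None, data=None, **kwargs):
        self.idx_to_token = []
        self.token_to_idx = {}
        for tok in (data or []):        # special tokens such as PAD/UNK/BOS/EOS
            self.add(tok)
        if filename is not None:        # one token per line, as written by the snippets above
            with open(filename, 'r', encoding='utf-8') as f:
                for line in f:
                    self.add(line.rstrip('\n'))
        for tok in (tokens or []):
            self.add(tok)

    def add(self, token):
        if token not in self.token_to_idx:
            self.token_to_idx[token] = len(self.idx_to_token)
            self.idx_to_token.append(token)
        return self.token_to_idx[token]

    def size(self):
        return len(self.idx_to_token)

    def __len__(self):
        return self.size()

    def getIndex(self, token, default=None):
        return self.token_to_idx.get(token, default)

    def getLabel(self, idx):
        return self.idx_to_token[idx]

    def convertToIdx(self, tokens, unk):
        # `unk` is the fallback for out-of-vocab tokens; the snippets pass
        # either an index (self.unknown) or a token string ('').
        fallback = unk if isinstance(unk, int) else self.getIndex(unk)
        return [self.getIndex(tok, fallback) for tok in tokens]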
Example #3
    def build_vocab(self):
        utils.build_vocab([self.path], QAConfig.vocab)
        return Vocab(filename=QAConfig.vocab,
                     data=[
                         Constants.PAD_WORD, Constants.UNK_WORD,
                         Constants.BOS_WORD, Constants.EOS_WORD
                     ])
Example #4
def create_vocab(data, cfg):
    print('[*] Creating word vocab')
    words = Counter()

    bar = tqdm(data, desc='[*] Collecting word tokens', dynamic_ncols=True)

    for dd in bar:
        words.update([w for w in dd])
    bar.close()

    tokens = [w for w, _ in words.most_common(cfg.word.size)]
    word_vocab = Vocab(
        tokens,
        **cfg.word,
    )
    char_vocab = Vocab(list(string.printable), **cfg.char)

    return word_vocab, char_vocab
Example #5
    def __init__(self,
                 trainset_path,
                 testset_path,
                 vocab_path,
                 dataset_name='',
                 remove_entity_mention=False,
                 remove_stop_words=False):
        self.config = config[dataset_name]
        self.train_set, self.train_corpus = self.load_dataset(
            trainset_path, remove_entity_mention, remove_stop_words)
        self.test_set, self.test_corpus = self.load_dataset(
            testset_path, remove_entity_mention, remove_stop_words)

        self.corpus = self.train_corpus + self.test_corpus
        # if not os.path.isfile(vocab_path):
        #     self.__build_vocab(self.corpus, vocab_path)
        if os.path.isfile(vocab_path):
            self.vocab = Vocab(filename=vocab_path,
                               data=['<ukn>', '<ent>', '<num>'])
            self.unknown = self.vocab.getIndex('<ukn>')
            self.word_vectorizer = Glove(self.vocab, config['glove_path'],
                                         self.config['emb'])

            for qa_row in self.train_set + self.test_set:
                for relation in qa_row.sparql.relations:
                    relation.coded = self.decode(relation)
            # self.__update_relations_emb()

            self.coded_train_corpus = [[
                self.vocab.getIndex(word, self.unknown) for word in tokens
            ] for tokens in self.train_corpus]
            self.coded_test_corpus = [[
                self.vocab.getIndex(word, self.unknown) for word in tokens
            ] for tokens in self.test_corpus]
            self.vocab_path = vocab_path

        self.one_hop = None
        if os.path.isfile(self.config['entity_one_hop']):
            with open(self.config['entity_one_hop'], 'rb') as f:
                self.one_hop = pk.load(f)
Example #6
def create_vocab(data, cfg, dataset_dir):
    print('[*] Creating word vocab')
    dict_words = Counter()
    for m, d in data.items():
        for words in tqdm(d, desc='[*] creating word vocab', dynamic_ncols=True):
            dict_words.update(words)
    # Drop rare words: keep only those seen more than 3 times
    dict_words = Counter({word: dict_words[word] for word in dict_words if dict_words[word] > 3})
    tokens = [w for w, _ in dict_words.most_common(cfg.word.size)]
    word_vocab = Vocab(tokens, **cfg.word)
    print('[*] The word vocabulary size is {}'.format(len(word_vocab)))
    word_vocab_path = (dataset_dir / 'word.pkl')
    with word_vocab_path.open(mode='wb') as f:
        pickle.dump(word_vocab, f)
    print('[-] Word vocab saved at {}\n'.format(word_vocab_path))

    print('[*] Creating char vocab')
    dict_chars = Counter()
    for m, d in data.items():
        for words in tqdm(d, desc='[*] creating char vocab', dynamic_ncols=True):
            for word in words:
                # Keep <BOS>/<EOS> markers as single tokens instead of splitting into characters
                if word == '<BOS>' or word == '<EOS>':
                    dict_chars.update([word])
                    continue
                dict_chars.update(word)
    # Drop rare characters: keep only those seen more than 1000 times
    dict_chars = Counter({char: dict_chars[char] for char in dict_chars if dict_chars[char] > 1000})
    tokens = [c for c, _ in dict_chars.most_common(cfg.char.size)]
    char_vocab = Vocab(tokens, **cfg.char)
    print('[*] The char vocabulary size is {}'.format(len(char_vocab)))
    char_vocab_path = (dataset_dir / 'char.pkl')
    with char_vocab_path.open(mode='wb') as f:
        pickle.dump(char_vocab, f)
    print('[-] Char vocab saved to {}\n'.format(char_vocab_path))

    return word_vocab, char_vocab
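The two dictionary comprehensions above act as frequency cutoffs (words kept only if seen more than 3 times, characters only if seen more than 1000 times). A tiny standalone illustration of that filtering pattern, with made-up counts:

from collections import Counter

counts = Counter({'the': 120, 'of': 80, 'zyzzyva': 2})
min_count = 3  # plays the role of the > 3 / > 1000 thresholds above
filtered = Counter({tok: c for tok, c in counts.items() if c > min_count})
print(filtered.most_common())  # [('the', 120), ('of', 80)]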
Example #7
 def load_word_vectors(self, path):
     """
     loading GLOVE word vectors
         if .pth file is found, will load that
         else will load from .txt file & save
     :param path:
     :return:
     """
     if os.path.isfile(path + '.pth') and os.path.isfile(path + '.vocab'):
         print('==> File found, loading to memory')
         vectors = torch.load(path + '.pth')
         vocab = Vocab(filename=path + '.vocab')
         return vocab, vectors
     # saved file not found, read from txt file
     # and create tensors for word vectors
     print('==> File not found, preparing, be patient')
     print(path + '.txt')
     with open(path + '.txt', 'r', encoding="utf-8") as f:
         count = sum(1 for line in f)
     with open(path + '.txt', 'r', encoding="utf-8") as f:
         contents = f.readline().rstrip('\n').split(' ')
         dim = len(contents[1:])
     words = [None] * (count)
     vectors = torch.zeros(count, dim)
     with open(path + '.txt', 'r', encoding="utf-8") as f:
         idx = 0
         for line in f:
             contents = line.rstrip('\n').split(' ')
             words[idx] = contents[0]
             vectors[idx] = torch.Tensor(list(map(float, contents[1:])))
             idx += 1
     with open(path + '.vocab', 'w', encoding="utf-8") as f:
         for word in words:
             f.write(word + '\n')
     vocab = Vocab(filename=path + '.vocab')
     torch.save(vectors, path + '.pth')
     return vocab, vectors
Example #8
class Base_Dataset:
    def __init__(self,
                 trainset_path,
                 testset_path,
                 vocab_path,
                 dataset_name='',
                 remove_entity_mention=False,
                 remove_stop_words=False):
        self.config = config[dataset_name]
        self.train_set, self.train_corpus = self.load_dataset(
            trainset_path, remove_entity_mention, remove_stop_words)
        self.test_set, self.test_corpus = self.load_dataset(
            testset_path, remove_entity_mention, remove_stop_words)

        self.corpus = self.train_corpus + self.test_corpus
        # if not os.path.isfile(vocab_path):
        #     self.__build_vocab(self.corpus, vocab_path)
        if os.path.isfile(vocab_path):
            self.vocab = Vocab(filename=vocab_path,
                               data=['<ukn>', '<ent>', '<num>'])
            self.unknown = self.vocab.getIndex('<ukn>')
            self.word_vectorizer = Glove(self.vocab, config['glove_path'],
                                         self.config['emb'])

            for qa_row in self.train_set + self.test_set:
                for relation in qa_row.sparql.relations:
                    relation.coded = self.decode(relation)
            # self.__update_relations_emb()

            self.coded_train_corpus = [[
                self.vocab.getIndex(word, self.unknown) for word in tokens
            ] for tokens in self.train_corpus]
            self.coded_test_corpus = [[
                self.vocab.getIndex(word, self.unknown) for word in tokens
            ] for tokens in self.test_corpus]
            self.vocab_path = vocab_path

        self.one_hop = None
        if os.path.isfile(self.config['entity_one_hop']):
            with open(self.config['entity_one_hop'], 'rb') as f:
                self.one_hop = pk.load(f)

    def decode(self, relation, max_length=3):
        idxs = self.vocab.convertToIdx(
            map(str.lower, relation.tokens[:max_length]), self.unknown)
        length = len(idxs)
        if len(idxs) < max_length:
            idxs = idxs + [0] * (max_length - len(idxs))
        return torch.LongTensor(idxs), length

    def load_dataset(self, dataset_path, remove_entity_mention,
                     remove_stop_words):
        return [], []

    def __load_candidate_relations(self):
        vocab = set()
        # if not os.path.exists(self.config['rel2id']):
        #     for qa_row in self.train_set + self.test_set:
        #         for relation in qa_row.sparql.relations:
        #             vocab |= set(map(str.lower, relation.tokens))
        #     return vocab

        with open(self.config['rel2id'], 'rb') as f_h:
            rel2id = pk.load(f_h, encoding='latin1')

        for item_id, item in rel2id.items():
            words = [word.lower().replace('.', '') for word in item[2]]
            vocab |= set(words)

        if os.path.isfile(self.config['entity_one_hop']):
            with open(self.config['entity_one_hop'], 'rb') as f:
                one_hop = pk.load(f)

            print(len(vocab))
            for entity, uris in one_hop.items():
                for idx in range(len(uris)):
                    uri, label = uris[idx][:2]
                    label = re.sub(r"([A-Z])", r" \1",
                                   label).replace('_', ' ').replace('.', ' ')
                    words = list(map(str.lower, label.split(' ')))
                    vocab |= set(words)
            print(len(vocab))

        return vocab

    def __update_relations_emb(self):
        emb_shape = self.word_vectorizer.emb.shape
        emb = nn.Embedding(emb_shape[0],
                           emb_shape[1],
                           padding_idx=0,
                           sparse=False)
        emb.weight.data.copy_(self.word_vectorizer.emb)
        if torch.cuda.is_available():
            emb.cuda()

        with open(self.config['rel2id'], 'rb') as f_h:
            rel2id = pk.load(f_h, encoding='latin1')

        ## Need to fix cases where there are non-alphabet chars in the label
        max_length = 3
        for item_id, item in rel2id.items():
            if len(item[2]) > max_length:
                idxs = []
            else:
                idxs = [
                    self.vocab.getIndex(
                        word.lower().replace('.', '') if not word.replace(
                            '.', '').replace('(', '').isdigit() else '<num>')
                    for word in item[2]
                ]
                idxs = [id for id in idxs if id is not None]
            length = len(idxs)
            if length == 0:
                length = 1
            if len(idxs) < max_length:
                idxs = idxs + [0] * (max_length - len(idxs))
            idxs = torch.LongTensor(idxs)
            item[5] = idxs
            if len(item) == 6:
                item.append(length)
            else:
                item[6] = length
        with open(self.config['rel2id'], 'wb') as f_h:
            pk.dump(rel2id, f_h)

    def __build_vocab(self, lines, vocab_path):
        vocab = set()
        for tokens in lines:
            vocab |= set(tokens)
        relations_vocab = self.__load_candidate_relations()
        vocab |= relations_vocab
        vocab = [
            w for w in vocab
            if not w.replace('.', '').replace('(', '').isdigit()
        ]
        if '<ent>' in vocab:
            vocab.remove('<ent>')
        with open(vocab_path, 'w', encoding='utf-8') as f:
            for token in sorted(vocab):
                f.write(token + '\n')

    def find_one_hop_relations(self, entities):
        extra_candidates = []
        if self.one_hop is not None:
            for entity in entities:
                if entity in self.one_hop:
                    extra_candidates.extend(self.one_hop[entity])
        return extra_candidates
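`load_dataset` above returns two empty lists, which suggests concrete datasets subclass `Base_Dataset` and override it; a purely illustrative override (the file format, tokenization, and empty rows list are assumptions, not project code) might look like:

# Hypothetical subclass, only to show the apparent extension point.
class ToyDataset(Base_Dataset):
    def load_dataset(self, dataset_path, remove_entity_mention, remove_stop_words):
        # Contract inferred from __init__: return (rows, corpus), where corpus is a
        # list of token lists that is later encoded with vocab.getIndex().
        rows, corpus = [], []
        with open(dataset_path, 'r', encoding='utf-8') as f:
            for line in f:
                tokens = line.strip().lower().split()
                if tokens:
                    corpus.append(tokens)
        return rows, corpus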
Example #9
        logger.error('Sparsity and weight decay are incompatible, pick one!')
        exit()
    # debugging args
    logger.debug(args)
    # set seed for reproducibility
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True

    # get vocab object from vocab file previously written
    imdb_vocab_file = classificationConfig.vocab
    vocab = Vocab(filename=imdb_vocab_file,
                  data=[
                      Constants.PAD_WORD, Constants.UNK_WORD,
                      Constants.BOS_WORD, Constants.EOS_WORD
                  ])
    logger.debug('==> imdb vocabulary size : %d ' % vocab.size())
    emb_file = classificationConfig.embed
    emb = torch.load(emb_file)

    ## build treeLSTM model
    tree_model = TreeLSTM(vocab.size(), args.input_dim, args.mem_dim,
                          args.hidden_dim, args.num_classes, args.sparse,
                          args.freeze_embed, device)
    criterion = nn.CrossEntropyLoss()
    tree_model.to(device), criterion.to(device)
    tree_model.emb.weight.data.copy_(emb)
    with open('%s.pt' % os.path.join(args.save, args.expname), 'rb') as f:
        tree_model.load_state_dict(torch.load(f)['model'])
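The load above expects a checkpoint dict with a 'model' key holding a state dict; a matching save call would look roughly like this (the commented extra fields are plausible but not confirmed by the snippet):

    # Sketch of the saving side implied by torch.load(f)['model'] above.
    checkpoint = {
        'model': tree_model.state_dict(),
        # 'optim': optimizer.state_dict(),  # plausible extra field, not shown in the snippet
        # 'epoch': epoch,
    }
    torch.save(checkpoint, '%s.pt' % os.path.join(args.save, args.expname))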
Example #10
        logger.error('Sparsity and weight decay are incompatible, pick one!')
        exit()
    # debugging args
    logger.debug(args)
    # set seed for reproducibility
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True

    # get vocab object from vocab file previously written
    imdb_vocab_file = classificationConfig.vocab
    vocab = Vocab(filename=imdb_vocab_file,
                  data=[
                      Constants.PAD_WORD, Constants.UNK_WORD,
                      Constants.BOS_WORD, Constants.EOS_WORD
                  ])
    logger.debug('==> imdb vocabulary size : %d ' % vocab.size())
    emb_file = classificationConfig.embed
    emb = torch.load(emb_file)

    # dev_dir = classificationConfig.token_file_labels[1]
    # dev_file = os.path.join(Global.external_tools, 'imdb_end2end_dev.pth')
    # if os.path.isfile(dev_file):
    #     dev_data = torch.load(dev_file)
    # else:
    #     dev_data = CommonDataset(dev_dir, vocab, device)
    #     torch.save(dev_data, dev_file)
    # logger.debug('==> Size of dev data     : %d ' % len(dev_data))
Example #11
    ## build vocab
    token_files = []
    for k in ['pos', 'neg']:
        token_files.extend([
            os.path.join(token_file_label, k + ".json")
            for token_file_label in classificationConfig.token_file_labels
        ])
    # imdb_vocab_file = os.path.join(args.data, 'imdb.vocab')
    print('token_files', token_files)
    imdb_vocab_file = classificationConfig.vocab
    utils.build_vocab(token_files, imdb_vocab_file)
    # get vocab object from vocab file previously written
    vocab = Vocab(filename=imdb_vocab_file,
                  data=[
                      Constants.PAD_WORD, Constants.UNK_WORD,
                      Constants.BOS_WORD, Constants.EOS_WORD
                  ])
    logger.debug('==> imdb vocabulary size : %d ' % vocab.size())

    ## build embedding of vocab
    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    # emb_file = os.path.join(Global.external_tools, 'imdb_embed.pth')
    emb_file = classificationConfig.embed
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = utils.load_word_vectors(
            classificationConfig.glove)
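Example #11 is cut off inside the `else:` branch; the comments describe what follows: copy GloVe rows for words found in both vocabularies, draw random normal vectors for the rest, and cache the tensor at `emb_file`. A rough sketch of that step, assuming the vocab can map an index back to its token and that `getIndex` returns None for missing words (the project's actual code is not shown):

        # Sketch continuing the else-branch above; not the project's exact code.
        emb = torch.empty(vocab.size(), glove_emb.size(1), dtype=torch.float).normal_(0, 0.05)
        for idx in range(vocab.size()):
            word = vocab.getLabel(idx)            # assumes an index -> token accessor
            glove_idx = glove_vocab.getIndex(word)
            if glove_idx is not None:
                emb[idx] = glove_emb[glove_idx]   # reuse the GloVe vector when available
        torch.save(emb, emb_file)                 # cache so later runs take the torch.load branch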
Example #12
        with open(config['dbpedia']['relations'], 'r',
                  encoding='utf-8') as file_handler:
            for line in tqdm(file_handler):
                json_object = json.loads(line)['_source']
                uri = json_object['uri']
                if 'http://dbpedia.org/' in uri:
                    uri = URI(uri)
                    vocab |= set(uri.tokens)
        print(len(vocab))
        vocab_list = [URI.normalize(word) for word in vocab]
        vocab = set([word for words in vocab_list for word in words])
        with open(config['vocab'], 'w', encoding='utf-8') as f:
            for token in sorted(vocab):
                f.write(token + '\n')

    vocab = Vocab(config['vocab'], data=['<ukn>', '<ent>', '<num>'])
    word_vectorizer = Glove(vocab, config['glove_path'], config['emb'])

    coded_labels = {}
    max_length = 3
    with open(config['dbpedia']['relations'], 'r',
              encoding='utf-8') as file_handler:
        for line in tqdm(file_handler):
            json_object = json.loads(line)['_source']
            uri = json_object['uri']
            if 'http://dbpedia.org/' in uri:
                uri = URI(uri)
                if uri.raw_uri not in coded_labels:
                    idxs = vocab.convertToIdx(uri.tokens, '')[:max_length]
                    length = len(idxs)
                    if len(idxs) < max_length:
Example #13
                           config['dbpedia']['relations'],
                           index_name=args.index_name)
            bulk_data = []
            manual_list = [{
                'uri': 'http://dbpedia.org/ontology/TelevisionShow',
                'label': 'show'
            }]
            for item in manual_list:
                data_dict = {
                    'key': item['uri'],
                    'dtype': 'uri',
                    'label': item['label']
                }
                op_dict = {
                    "index": {
                        "_index": args.index_name,
                        "_type": 'resources'
                    }
                }
                bulk_data.append(op_dict)
                bulk_data.append(data_dict)
            e.bulk_indexing(args.index_name,
                            delete_index=False,
                            index_config=index_config,
                            bulk_data=bulk_data)

            vocab = Vocab(filename=config['lc_quad']['vocab'],
                          data=['<ent>', '<num>'])

    print(e.search_index(args.search, args.index_name, size=args.size))