Example #1
 def configuration(cls,
                   plm=None,
                   method='lgesql',
                   table_path='data/tables.json',
                   tables='data/tables.bin',
                   db_dir='data/database'):
     cls.plm, cls.method = plm, method
     cls.grammar = ASDLGrammar.from_filepath(GRAMMAR_FILEPATH)
     cls.trans = TransitionSystem.get_class_by_lang('sql')(cls.grammar)
     cls.tables = pickle.load(open(tables, 'rb')) if isinstance(tables, str) else tables
     cls.evaluator = Evaluator(cls.trans, table_path, db_dir)
     if plm is None:
         cls.word2vec = Word2vecUtils()
         cls.tokenizer = lambda x: x
         cls.word_vocab = Vocab(
             padding=True,
             unk=True,
             boundary=True,
             default=UNK,
             filepath='./pretrained_models/glove.42b.300d/vocab.txt',
             specials=SCHEMA_TYPES)  # word vocab for glove.42B.300d
     else:
         cls.tokenizer = AutoTokenizer.from_pretrained(
             os.path.join('./pretrained_models', plm))
         cls.word_vocab = cls.tokenizer.get_vocab()
     cls.relation_vocab = Vocab(padding=False,
                                unk=False,
                                boundary=False,
                                iterable=RELATIONS,
                                default=None)
     cls.graph_factory = GraphFactory(cls.method, cls.relation_vocab)
Example #2
def get_dataloader_for_train(args, tokenizer):
    data_path, raw_data_path = args.data_path, args.raw_data_path
    batch_size = args.batch_size
    if args.load_userdict:
        jieba.load_userdict(args.userdict)

    domain_map = Vocab.from_file(os.path.join(data_path, "domains.txt"))
    intent_map = Vocab.from_file(os.path.join(data_path, "intents.txt"))
    slots_map = Vocab.from_file(os.path.join(data_path, "slots.txt"))
    label_vocab = Vocab.from_file(os.path.join(data_path, "label_vocab.txt"))
    bin_label_vocab = Vocab.from_file(os.path.join(data_path, "bin_label_vocab.txt"))

    # train 
    all_train_data = [] 
    
    train_dom_data = read_all_train_data(
        os.path.join(raw_data_path, "source.json"), tokenizer,
        domain_map, intent_map, slots_map, label_vocab, bin_label_vocab)

    for dom_data in train_dom_data.values():
        all_train_data.extend(dom_data)

    dev_sup_dom_data = read_support_data(
        os.path.join(raw_data_path, "dev", "support"), 
        tokenizer, domain_map, intent_map, slots_map, 
        label_vocab, bin_label_vocab)

    for dom_data in dev_sup_dom_data.values():
        all_train_data.extend(dom_data)

    dataloader = thdata.DataLoader(dataset=Dataset(all_train_data), 
        batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    return dataloader
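
Example #2 wraps a flat list of examples in a torch DataLoader together with the project's Dataset and collate_fn. As a rough, self-contained sketch of that batching pattern (ListDataset and pad_collate below are hypothetical stand-ins, not the project's classes):

import torch
from torch.utils.data import DataLoader, Dataset as TorchDataset

class ListDataset(TorchDataset):
    """Wraps a plain Python list, much like Dataset(all_train_data) above."""
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx]

def pad_collate(batch):
    # Hypothetical collate_fn: pad variable-length token-id lists to the
    # longest sequence in the batch and stack them into one LongTensor.
    max_len = max(len(seq) for seq in batch)
    padded = [seq + [0] * (max_len - len(seq)) for seq in batch]
    return torch.tensor(padded, dtype=torch.long)

loader = DataLoader(ListDataset([[1, 2, 3], [4, 5], [6]]),
                    batch_size=2, shuffle=True, collate_fn=pad_collate)
for batch in loader:
    print(batch.shape)  # e.g. torch.Size([2, 3]) then torch.Size([1, 2])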
Example #3
def get_dataloader_for_support(args, tokenizer, sep_dom=False):
    data_path, fin_data_path = args.data_path, args.fin_data_path
    batch_size = args.batch_size
    if args.load_userdict:
        jieba.load_userdict(args.userdict)

    domain_map = Vocab.from_file(os.path.join(data_path, "domains.txt"))
    intent_map = Vocab.from_file(os.path.join(data_path, "intents.txt"))
    slots_map = Vocab.from_file(os.path.join(data_path, "slots.txt"))
    label_vocab = Vocab.from_file(os.path.join(data_path, "label_vocab.txt"))
    bin_label_vocab = Vocab.from_file(os.path.join(data_path, "bin_label_vocab.txt"))

    sup_dom_data = read_support_data(
        os.path.join(fin_data_path, "support"), 
        tokenizer, domain_map, intent_map, slots_map, 
        label_vocab, bin_label_vocab)

    if not sep_dom:
        sup_data = []
        for dom_data in sup_dom_data.values():
            sup_data.extend(dom_data)
        
        suploader = thdata.DataLoader(
                            dataset=Dataset(sup_data), 
                            batch_size=batch_size, shuffle=True, 
                            collate_fn=collate_fn)
        return suploader
    else:
        suploaders = {}
        for dom, dom_data in sup_dom_data.items():
            suploaders[dom] = thdata.DataLoader(
                            dataset=Dataset(dom_data),
                            batch_size=batch_size, shuffle=True,
                            collate_fn=collate_fn)
        return suploaders
Example #4
    def create_vocab(self):

        if self.is_training:
            if not os.path.exists(self.vocab_file_path):
                print("Creating vocab")
                self.vocab = Vocab(add_bos=False,
                                   add_eos=False,
                                   add_padding=False,
                                   min_count=self.min_count)

                for example in self.dataset:
                    self.vocab.add_tokenized_sentence(
                        example['tokens'][:self.train_max_length])

                self.vocab.finish()

                with open(self.vocab_file_path, 'wb') as f:
                    pickle.dump(self.vocab, f)
            else:
                with open(self.vocab_file_path, 'rb') as f:
                    self.vocab = pickle.load(f)

        else:
            print("Cargando vocab")
            with open(self.vocab_file_path, 'rb') as f:
                self.vocab = pickle.load(f)
Example #5
def main(args):
    print("Load Tokenizer and Define Variables.")
    ## by arguments
    if args.lang == 'ko':
        tokenizer = ko.Tokenizer()
    else:
        raise ValueError(
            "Wrong arguments for --lang. Please pass 'ko' for --lang arguments."
        )
    processed_path = args.path

    ## etc
    emo = emoji.get_emoji_regexp()
    now = datetime.now()

    ## Load data for synthesio
    cols = ['Mention Title', 'Mention Content']
    df = pd.read_parquet('data/Korean.parquet', columns=cols)
    df = df.fillna('')
    docs = [doc for doc in df['Mention Title'] + ' ' + df['Mention Content']]

    print("Tokenize the documents and build the vocab.")
    with Pool(processes=os.cpu_count()) as pool:
        tokenized_docs = pool.map(tokenizer.tokenize, docs)

    token_counts = Counter(list(zip(*chain(*tokenized_docs)))[0]).most_common()
    vocab = Vocab(list_of_tokens=[
        token for token, count in token_counts if count >= int(args.min_count)
    ],
                  token_to_idx={
                      '[PAD]': 0,
                      '[UNK]': 1
                  })
    vocab.lexeme['is_Emoji'] = [
        emo.fullmatch(term) is not None for term in vocab.idx_to_token
    ]
    vocab.lexeme['is_Digit'] = [
        re.fullmatch(r'[\d\,\.]+', term) is not None
        for term in vocab.idx_to_token
    ]
    vocab.lexeme['is_Punct'] = [
        re.fullmatch(rf'[{string.punctuation}]+', term) is not None
        for term in vocab.idx_to_token
    ]

    print(f"Build the new vocab vocab-size : {len(vocab)}")
    with open(f"{processed_path}/vocab-{now:%Y%m%d}.pkl", 'wb') as f:
        pickle.dump(vocab, f)
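
The Counter line in Example #5 appears to assume that tokenizer.tokenize returns (token, tag) pairs: chaining all documents, transposing with zip, and taking row 0 yields the raw token stream. A small self-contained illustration of that counting and min_count filtering step, with made-up tokens:

from collections import Counter
from itertools import chain

# toy stand-in for the (token, POS-tag) pairs produced per document
tokenized_docs = [
    [("김치", "NNG"), ("먹다", "VV")],
    [("김치", "NNG"), ("좋다", "VA")],
]

tokens = list(zip(*chain(*tokenized_docs)))[0]   # ('김치', '먹다', '김치', '좋다')
token_counts = Counter(tokens).most_common()     # [('김치', 2), ('먹다', 1), ('좋다', 1)]

min_count = 2
kept = [token for token, count in token_counts if count >= min_count]
print(kept)  # ['김치']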
Example #6
def train(params):
    assert params["mode"].lower() == "train", "change training mode to 'train'"

    vocab = Vocab(params["vocab_path"], params["vocab_size"])
    params['vocab_size'] = vocab.count
    # params["trained_epoch"] = get_train_msg()
    params["learning_rate"] *= np.power(0.9, params["trained_epoch"])

    # Build the model
    print("Building the model ...")
    model = Seq2Seq(params)
    # Get the checkpoint manager
    checkpoint = tf.train.Checkpoint(Seq2Seq=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint,
                                                    SEQ2SEQ_CKPT,
                                                    max_to_keep=5)

    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    if checkpoint_manager.latest_checkpoint:
        print("Restored from {}".format(checkpoint_manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")

    # Train the model
    print("Start training the model...")
    print("trained_epoch:", params["trained_epoch"])
    print("mode:", params["mode"])
    print("epochs:", params["epochs"])
    print("batch_size:", params["batch_size"])
    print("max_enc_len:", params["max_enc_len"])
    print("max_dec_len:", params["max_dec_len"])
    print("learning_rate:", params["learning_rate"])

    train_model(model, vocab, params, checkpoint_manager)
Example #7
def build_vocab(df, vocab_path):
    print(f"building vocab ...")

    vocab_dict = {"<unk>": 1, "<eos>": 2, "<pad>": 3}
    vocab_set = []

    for row in tqdm(df.itertuples()):
        text = row.text.replace(" ", "")  # remove spaces

        phones = pyopenjtalk.g2p(text, join=False)
        # remove pause
        phones = [phone for phone in phones if phone != "pau"]

        for phone in phones:
            if phone not in vocab_set:
                vocab_set.append(phone)

    # alphabetical order
    vocab_set.sort()

    wlines = []
    for v in vocab_set:
        index = len(vocab_dict) + 1
        vocab_dict[v] = index

    for v, index in vocab_dict.items():
        wlines.append(f"{v} {index:d}\n")

    with open(vocab_path, "w", encoding="utf-8") as f:
        f.writelines(wlines)

    print(f"vocabulary saved to {vocab_path}")

    return Vocab(vocab_path)
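
The file written by Example #7 contains one "symbol index" pair per line, which the project's Vocab(vocab_path) constructor presumably reads back in. A minimal stand-in parser for that format (purely illustrative, not the project's class):

import tempfile

# write a tiny vocab file in the same "symbol index" format as build_vocab
lines = ["<unk> 1\n", "<eos> 2\n", "<pad> 3\n", "a 4\n", "k 5\n"]
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False,
                                 encoding="utf-8") as f:
    f.writelines(lines)
    vocab_path = f.name

def read_vocab(path):
    """Parse 'symbol index' lines into a dict (a stand-in for Vocab(vocab_path))."""
    table = {}
    with open(path, encoding="utf-8") as fin:
        for line in fin:
            symbol, index = line.split()
            table[symbol] = int(index)
    return table

print(read_vocab(vocab_path))
# {'<unk>': 1, '<eos>': 2, '<pad>': 3, 'a': 4, 'k': 5}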
Example #8
    def __init__(self, config):
        super(XfmrDecoder, self).__init__()

        self.vocab = Vocab.load(config["vocab_file"])
        with open(config["typelib_file"]) as type_f:
            self.typelib = TypeLibCodec.decode(type_f.read())
            self.typelib = self.typelib.fix()
        self.target_embedding = nn.Embedding(len(self.vocab.subtypes),
                                             config["target_embedding_size"])
        self.target_transform = nn.Linear(
            config["target_embedding_size"] + config["hidden_size"],
            config["hidden_size"],
        )
        # self.cached_decode_mask: Dict[int, torch.Tensor] = {}
        # self.size = torch.zeros(len(self.vocab.types), dtype=torch.long)

        # concat variable encoding and previous target token embedding as input
        decoder_layer = TransformerDecoderLayer(
            config["hidden_size"],
            1,
            config["hidden_size"],
            config["dropout"],
            activation="gelu",
        )
        decoder_norm = LayerNorm(config["hidden_size"])
        self.decoder = TransformerDecoder(decoder_layer, config["num_layers"],
                                          decoder_norm)
        self.output = nn.Linear(config["hidden_size"],
                                len(self.vocab.subtypes))

        self.config: Dict = config
Example #9
    def __init__(self,
                 url: str,
                 config: Optional[Dict] = None,
                 percent: float = 1.0):
        # support wildcards
        urls = sorted(glob.glob(url))
        urls = urls[:int(percent * len(urls))]
        super().__init__(urls)
        if config:
            # annotate example for training
            from utils.vocab import Vocab

            self.vocab = Vocab.load(config["vocab_file"])
            with open(config["typelib_file"]) as type_f:
                self.typelib = TypeLibCodec.decode(type_f.read())
            self.max_src_tokens_len = config["max_src_tokens_len"]
            self.max_num_var = config["max_num_var"]
            annotate = self._annotate
            self.rename = config.get("rename", False)
            # sort = Dataset._sort
            sort = identity
        else:
            # for creating the vocab
            annotate = identity
            sort = identity
        self = (self.pipe(Dataset._file_iter_to_line_iter).map(
            Example.from_json).map(annotate).shuffle(
                Dataset.SHUFFLE_BUFFER).pipe(sort))
Example #10
    def __init__(self, config):
        super(XfmrDecoder, self).__init__()

        self.vocab = Vocab.load(config["vocab_file"])
        with open(config["typelib_file"]) as type_f:
            self.typelib = TypeLibCodec.decode(type_f.read())

        retype_vocab_size = len(self.vocab.types)
        rename_vocab_size = len(self.vocab.names)
        self.target_embedding = nn.Embedding(
            retype_vocab_size + rename_vocab_size, config["target_embedding_size"]
        )
        self.target_transform = nn.Linear(
            config["target_embedding_size"] + config["hidden_size"],
            config["hidden_size"],
        )

        # concat variable encoding and previous target token embedding as input
        decoder_layer = TransformerDecoderLayer(
            config["hidden_size"],
            1,
            config["hidden_size"],
            config["dropout"],
            activation="gelu",
        )
        decoder_norm = LayerNorm(config["hidden_size"])
        self.decoder = TransformerDecoder(
            decoder_layer, config["num_layers"], decoder_norm
        )
        self.output = nn.Linear(
            config["hidden_size"], retype_vocab_size + rename_vocab_size
        )
        self.mem_mask = config["mem_mask"]
        self.config: Dict = config
        self.retype_vocab_size = retype_vocab_size
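
In Example #10 a single embedding table and a single output layer cover both the retype and the rename vocabularies, so name indices presumably get shifted by retype_vocab_size elsewhere in the model before lookup. A toy sketch of that index-offset trick (sizes invented for illustration):

import torch
import torch.nn as nn

retype_vocab_size, rename_vocab_size, embed_dim = 5, 7, 4
shared_embedding = nn.Embedding(retype_vocab_size + rename_vocab_size, embed_dim)

type_ids = torch.tensor([0, 3])   # indices into the type vocabulary
name_ids = torch.tensor([2, 6])   # indices into the name vocabulary

# name ids live in the upper part of the shared table
shifted_name_ids = name_ids + retype_vocab_size

type_vecs = shared_embedding(type_ids)           # shape (2, 4)
name_vecs = shared_embedding(shifted_name_ids)   # shape (2, 4)
print(type_vecs.shape, name_vecs.shape)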
Example #11
    def __init__(self, test=False, data_dir="data", vocab_path='data/vocab'):
        super(Articles, self).__init__()
        '''Initialization'''
        self.vocab = Vocab(vocab_path, voc_size)
        self.tokenizer = data.get_tokenizer('basic_english')
        self.max_len_story = MAX_LEN_STORY
        self.max_len_highlight = MAX_LEN_HIGHLIGHT

        is_test = {
            False: os.path.join(data_dir, "train.pkl"),
            True: os.path.join(data_dir, "test.pkl")
        }
        self.data_path = is_test.get(test, "Wrong set name.")

        with open(self.data_path, 'rb') as f:
            self.data = load(f)
Example #12
def get_dataloader_for_fs_eval(data_path,
                               raw_data_path,
                               eval_domains: list,
                               batch_size,
                               max_sup_ratio,
                               max_sup_size,
                               n_shots,
                               tokenizer,
                               return_suploader=False):
    domain_map = Vocab.from_file(os.path.join(data_path, "domains.txt"))
    intent_map = Vocab.from_file(os.path.join(data_path, "intents.txt"))
    slots_map = Vocab.from_file(os.path.join(data_path, "slots.txt"))
    label_vocab = Vocab.from_file(os.path.join(data_path, "label_vocab.txt"))
    bin_label_vocab = Vocab.from_file(
        os.path.join(data_path, "bin_label_vocab.txt"))

    # train
    all_train_data = read_all_train_data(
        os.path.join(raw_data_path, "source.json"), tokenizer, domain_map,
        intent_map, slots_map, label_vocab, bin_label_vocab)
    data = {k: v for k, v in all_train_data.items() if k in eval_domains}

    # eval support & query
    fs_data = []
    fs_sup_data = []
    for dom, dom_data in data.items():
        sup_size = max(min(int(max_sup_ratio * len(dom_data)), max_sup_size),
                       n_shots)
        sup_data, qry_data = separate_data_to_support_and_query(
            dom_data, sup_size)
        dom_data = collect_support_instances(sup_data, qry_data, int(n_shots))
        fs_data.extend(dom_data)
        if return_suploader:
            fs_sup_data.extend(sup_data)

    dataloader = thdata.DataLoader(dataset=Dataset(fs_data),
                                   batch_size=batch_size,
                                   shuffle=False,
                                   collate_fn=collate_fn)
    if return_suploader:
        suploader = thdata.DataLoader(dataset=Dataset(fs_sup_data),
                                      batch_size=batch_size,
                                      shuffle=True,
                                      collate_fn=collate_fn)
        return dataloader, suploader
    else:
        return dataloader
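
The support-set size in Example #12 is the fraction max_sup_ratio of the domain data, capped at max_sup_size and floored at n_shots. A quick illustration of that clamping arithmetic with toy numbers:

def support_size(num_examples, max_sup_ratio, max_sup_size, n_shots):
    # min(...) caps the ratio-based size, max(...) guarantees at least n_shots
    return max(min(int(max_sup_ratio * num_examples), max_sup_size), n_shots)

print(support_size(1000, 0.1, 50, 5))  # 50 -> capped by max_sup_size
print(support_size(200, 0.1, 50, 5))   # 20 -> ratio-based size
print(support_size(30, 0.1, 50, 5))    # 5  -> floored at n_shots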
Example #13
def vocabs_init(train_data: List[str]) -> Vocab:
    print("Constructing vocabularies...", flush=True)

    vocab = Vocab(train_data)

    print('len(labels_vocab): %d' % len(vocab))

    return vocab
Example #14
    def __init__(self, args):
        super().__init__()
        self.args = args
        self.K = args.K
        self.rnn_hidden = args.rnn_hidden
        self.max_sent_len = args.max_sent_len
        print("loading pretrained emb......")
        self.emb_matrix = np.load(args.dset_dir + '/' + args.dataset +
                                  '/embedding.npy')
        print("loading dataset vocab......")
        self.vocab = Vocab(args.dset_dir + '/' + args.dataset + '/vocab.pkl')

        # create embedding layers
        self.emb = nn.Embedding(self.vocab.size,
                                args.emb_dim,
                                padding_idx=constant.PAD_ID)
        self.pos_emb = nn.Embedding(len(constant.POS_TO_ID),
                                    args.pos_dim) if args.pos_dim > 0 else None

        # initialize embedding with pretrained word embeddings
        self.init_embeddings()

        # dropout
        self.input_dropout = nn.Dropout(args.input_dropout)

        # GRU for P(Trc|S,Y')
        self.GRU_mean_rc = torch.nn.GRUCell(
            len(constant.BIO_TO_ID) + len(constant.LABEL_TO_ID),
            self.rnn_hidden * 2)
        self.GRU_std_rc = torch.nn.GRUCell(
            len(constant.BIO_TO_ID) + len(constant.LABEL_TO_ID),
            self.rnn_hidden * 2)

        # GRU for P(Tner|S,Y')
        self.GRU_mean_ner = torch.nn.GRUCell(
            len(constant.BIO_TO_ID) + len(constant.LABEL_TO_ID),
            self.rnn_hidden * 2)
        self.GRU_std_ner = torch.nn.GRUCell(
            len(constant.BIO_TO_ID) + len(constant.LABEL_TO_ID),
            self.rnn_hidden * 2)

        # define r
        self.r_mean_rc = nn.Parameter(torch.randn(self.max_sent_len, self.K))
        self.r_std_rc = nn.Parameter(torch.randn(self.max_sent_len, self.K))
        self.r_mean_ner = nn.Parameter(torch.randn(self.max_sent_len, self.K))
        self.r_std_ner = nn.Parameter(torch.randn(self.max_sent_len, self.K))

        # define encoder for the sharing representations S
        self.BiLSTM = LSTMRelationModel(args)

        # classifer
        self.Lr = nn.Linear(4 * self.rnn_hidden, 2 * self.rnn_hidden)
        self.Cr = nn.Linear(2 * self.rnn_hidden, len(constant.LABEL_TO_ID))
        self.Cg = nn.Linear(2 * self.rnn_hidden, len(constant.BIO_TO_ID))

        # Fn
        self.logsoft_fn1 = nn.LogSoftmax(dim=2)
        self.logsoft_fn2 = nn.LogSoftmax(dim=3)
Example #15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', required=True)
    parser.add_argument('--vocab', required=True)
    parser.add_argument('--vocab-size', required=True, type=int)
    parser.add_argument('--max-length', required=True, type=int)
    parser.add_argument('--out', required=True)
    args = parser.parse_args()

    word_vocab = Vocab.from_file(path=args.vocab, add_pad=True, add_unk=True,
                                 max_size=args.vocab_size)
    label_dict = {'neutral': 0, 'entailment': 1, 'contradiction': 2}
    label_vocab = Vocab(vocab_dict=label_dict, add_pad=False, add_unk=False)
    data_reader = SNLIDataset(
        data_path=args.data, word_vocab=word_vocab, label_vocab=label_vocab,
        max_length=args.max_length)
    with open(args.out, 'wb') as f:
        pickle.dump(data_reader, f)
Example #16
    def build(cls, config):
        params = util.update(cls.default_params(), config)

        vocab = Vocab.load(params['vocab_file'])
        model = cls(params['ast_node_encoding_size'], params['hidden_size'],
                    params['dropout'], vocab)
        model.config = params

        return model
Example #17
    def __init__(self, config):
        super().__init__()

        self.vocab = vocab  = Vocab.load(config['vocab_file'])
        self.src_word_embed = nn.Embedding(len(vocab.source_tokens), config['source_embedding_size'])
        self.config = config

        self.decoder_cell_init = nn.Linear(config['source_encoding_size'], config['decoder_hidden_size'])

        if self.config['transformer'] == 'none':
            dropout = config['dropout']
            self.lstm_encoder = nn.LSTM(input_size=self.src_word_embed.embedding_dim,
                                        hidden_size=config['source_encoding_size'] // 2, num_layers=config['num_layers'],
                                        batch_first=True, bidirectional=True, dropout=dropout)

            self.dropout = nn.Dropout(dropout)

        elif self.config['transformer'] == 'bert':
            self.vocab_size = len(self.vocab.source_tokens) + 1

            state_dict = torch.load('saved_checkpoints/bert_2604/bert_pretrained_epoch_23_batch_140000.pth')

            keys_to_delete = ["cls.predictions.bias", "cls.predictions.transform.dense.weight", "cls.predictions.transform.dense.bias", "cls.predictions.transform.LayerNorm.weight",
                            "cls.predictions.transform.LayerNorm.bias", "cls.predictions.decoder.weight", "cls.predictions.decoder.bias",
                            "cls.seq_relationship.weight", "cls.seq_relationship.bias"]

            from collections import OrderedDict
            new_state_dict = OrderedDict()
            for k, v in state_dict['model'].items():
                if k in keys_to_delete: continue
                name = k[5:] # remove `bert.`
                new_state_dict[name] = v

            bert_config = BertConfig(vocab_size=self.vocab_size, max_position_embeddings=512, num_hidden_layers=6, hidden_size=256, num_attention_heads=4)
            self.bert_model = BertModel(bert_config)
            self.bert_model.load_state_dict(new_state_dict)

        elif self.config['transformer'] == 'xlnet':
            self.vocab_size = len(self.vocab.source_tokens) + 1

            state_dict = torch.load('saved_checkpoints/xlnet_2704/xlnet1_pretrained_epoch_13_iter_500000.pth')

            keys_to_delete = ["lm_loss.weight", "lm_loss.bias"]

            from collections import OrderedDict
            new_state_dict = OrderedDict()
            for k, v in state_dict['model'].items():
                if k in keys_to_delete: continue
                if k[:12] == 'transformer.': name = k[12:]
                else:                       name = k
                new_state_dict[name] = v

            xlnet_config = XLNetConfig(vocab_size=self.vocab_size, d_model=256, n_layer=12)
            self.xlnet_model = XLNetModel(xlnet_config)
            self.xlnet_model.load_state_dict(new_state_dict)
        else:
            print("Error! Unknown transformer type '{}'".format(self.config['transformer']))
Example #18
def covar_analysis(args):
    model = GaussianBilinearModel.load_model(args.model)
    rel_vocab = Vocab.load(args.relation)
    rel_mats = model.relation_mats
    scores = [abs(np.linalg.det(mat)) for mat in rel_mats]

    sort_idxs = np.argsort(scores)[::-1]
    for idx in sort_idxs:
        print('{} : {}'.format(rel_vocab.get_word(idx), scores[idx]))
Example #19
    def build(self, corpus, min_freq=1, embed=None):
        sequences = getattr(corpus, self.name)
        counter = Counter(char for sequence in sequences for token in sequence
                          for char in self.transform(token))
        self.vocab = Vocab(counter, min_freq, self.specials)

        if not embed:
            self.embed = None
        else:
            tokens = self.transform(embed.tokens)
            # if the `unk` token already exists in the pretrained vocab,
            # then replace it with the self-defined one
            if embed.unk:
                tokens[embed.unk_index] = self.unk

            self.vocab.extend(tokens)
            self.embed = torch.zeros(len(self.vocab), embed.dim)
            self.embed[self.vocab.token2id(tokens)] = embed.vectors
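
The last line of Example #19 relies on index assignment: each pretrained vector is written to the row of its (possibly newly extended) vocabulary id, while tokens without a pretrained vector keep a zero row. A self-contained sketch of that alignment step with toy data:

import torch

# toy vocabulary (token -> id) and pretrained vectors for a subset of it
token2id = {"<pad>": 0, "<unk>": 1, "a": 2, "b": 3, "c": 4}
pretrained = {"a": torch.tensor([1.0, 1.0]), "c": torch.tensor([3.0, 3.0])}

embed = torch.zeros(len(token2id), 2)            # all rows start at zero
ids = torch.tensor([token2id[t] for t in pretrained])
embed[ids] = torch.stack(list(pretrained.values()))

print(embed)  # rows 2 and 4 hold the pretrained vectors, the rest stay zero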
Example #20
def create_vocab(data_path):
    wd_vocab = Vocab(min_count=3, bos=None, eos=None)
    lbl_vocab = Vocab(pad=None, unk=None, bos=None, eos=None)
    assert os.path.exists(data_path)
    with open(data_path, 'r', encoding='utf-8') as fin:
        loader = map(lambda x: x.strip().split('|||'), fin)
        for lbl, data_item in loader:
            wds = data_item.strip().split(' ')
            wd_vocab.add(wds)
            lbl_vocab.add(lbl.strip())
    return MultiVocab({'word': wd_vocab, 'label': lbl_vocab})
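
Example #20 expects each data line to look like "label ||| token token ...", which x.strip().split('|||') splits into exactly two fields. A tiny illustration of parsing that format from in-memory lines instead of a file:

lines = [
    "positive ||| this movie was great\n",
    "negative ||| a total waste of time\n",
]

loader = map(lambda x: x.strip().split('|||'), lines)
for lbl, data_item in loader:
    wds = data_item.strip().split(' ')
    print(lbl.strip(), wds)
# positive ['this', 'movie', 'was', 'great']
# negative ['a', 'total', 'waste', 'of', 'time']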
Example #21
    def __init__(self, args):
        self.args = args

        self.epoch = args.epoch
        self.batch_size = args.batch_size
        self.lr = args.lr
        self.K = args.K
        self.num_avg = args.num_avg
        self.global_iter = 0
        self.global_epoch = 0
        self.log_file = args.log_file

        # Network & Optimizer
        self.toynet = ToyNet(args).cuda()
        self.optim = optim.Adam(self.toynet.parameters(), lr=self.lr)

        self.ckpt_dir = Path(args.ckpt_dir)
        if not self.ckpt_dir.exists():
            self.ckpt_dir.mkdir(parents=True, exist_ok=True)
        self.load_ckpt = args.load_ckpt
        if self.load_ckpt != '': self.load_checkpoint(self.load_ckpt)

        # loss function
        self.ner_lossfn = nn.NLLLoss(reduction='sum')
        self.rc_lossfn = nn.BCELoss(reduction='sum')

        # History
        self.history = dict()
        # class loss
        self.history['ner_train_loss1'] = []
        self.history['rc_train_loss1'] = []
        self.history['ner_test_loss1'] = []
        self.history['rc_test_loss1'] = []
        self.history['ner_train_loss2'] = []
        self.history['rc_train_loss2'] = []
        self.history['ner_test_loss2'] = []
        self.history['rc_test_loss2'] = []
        self.history['precision_test'] = []
        self.history['recall_test'] = []
        self.history['F1_test'] = []
        # info loss
        self.history['info_train_loss'] = []
        self.history['info_test_loss'] = []

        # Dataset
        vocab = Vocab(args.dset_dir + '/' + args.dataset + '/vocab.pkl')
        self.data_loader = dict()
        self.data_loader['train'] = Dataloader(
            args.dset_dir + '/' + args.dataset + '/train.json',
            args.batch_size, vars(args), vocab)
        self.data_loader['test'] = Dataloader(args.dset_dir + '/' +
                                              args.dataset + '/test.json',
                                              args.batch_size,
                                              vars(args),
                                              vocab,
                                              evaluation=True)
Example #22
def evaluate_model(evalparams):

    torch.manual_seed(evalparams.seed)
    random.seed(1234)
    if evalparams.cpu:
        evalparams.cuda = False
    elif evalparams.cuda:
        torch.cuda.manual_seed(evalparams.seed)

    # load opt
    print(evalparams.model_dir, evalparams.model)
    model_file = evalparams.model_dir + "/" + evalparams.model
    print("Loading model from {}".format(model_file))
    opt = torch_utils.load_config(model_file)
    model = RelationModel(opt)
    model.load(model_file)

    # load vocab
    vocab_file = evalparams.model_dir + '/vocab.pkl'
    vocab = Vocab(vocab_file, load=True)
    assert opt[
        'vocab_size'] == vocab.size, "Vocab size must match that in the saved model."

    # load data
    data_file = opt['data_dir'] + '/{}.json'.format(evalparams.dataset)
    print("Loading data from {} with batch size {}...".format(
        data_file, opt['batch_size']))
    batch = DataLoader(data_file,
                       opt['batch_size'],
                       opt,
                       vocab,
                       evaluation=True)

    helper.print_config(opt)
    id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])

    predictions = []
    all_probs = []
    for i, b in enumerate(batch):
        preds, probs, _ = model.predict(b)
        predictions += preds
        all_probs += probs
    predictions = [id2label[p] for p in predictions]
    p, r, f1 = scorer.score(batch.gold(), predictions, verbose=True)

    # save probability scores
    if len(evalparams.out) > 0:
        helper.ensure_dir(os.path.dirname(evalparams.out))
        with open(evalparams.out, 'wb') as outfile:
            pickle.dump(all_probs, outfile)
        print("Prediction scores saved to {}.".format(evalparams.out))

    print("Evaluation ended.")

    return (batch.gold(), predictions, model)
Example #23
    def build(cls, config):
        params = util.update(cls.default_params(), config)

        vocab = Vocab.load(params['vocab_file'])
        model = cls(params['variable_encoding_size'], params['hidden_size'],
                    params['dropout'], params['tie_embedding'],
                    params['input_feed'], vocab)
        model.config = params

        return model
Example #24
def load_word_vector(path):
    """
    loading word vector(this project employs GLOVE word vector), save GLOVE word, vector as file
    respectively
    :param path: GLOVE word vector path
    :return: glove vocab,: vocab object, vector(numpy array, of shape(words_num, word_dim))
    """
    base = os.path.splitext(os.path.basename(path))[0]
    glove_vocab_path = os.path.join('../data/glove/', base + '.vocab')
    glove_vector_path = os.path.join('../data/glove/', base + '.npy')
    # the vocab and vectors have already been cached
    if os.path.isfile(glove_vocab_path) and os.path.isfile(glove_vector_path):
        print('=====> Cache files found, loading from disk <=====')
        vocab = Vocab(glove_vocab_path)
        vector = np.load(glove_vector_path)
        return vocab, vector

    print('=====>Loading glove word vector<=====')
    with open(path, 'r', encoding='utf8', errors='ignore') as f:
        contents = f.readline().rstrip('\n').split(' ')
        word_dim = len(contents[1:])
        count = 1
        for line in f:
            count += 1

    vocab = [None] * count
    vector = np.zeros((count, word_dim))
    with open(path, 'r', encoding='utf8', errors='ignore') as f:
        idx = 0
        for line in f:
            contents = line.rstrip('\n').split(' ')
            vocab[idx] = contents[0]
            vector[idx] = np.array(list(map(float, contents[1:])), dtype=float)
            idx += 1
    assert count == idx
    with open(glove_vocab_path, 'w', encoding='utf8', errors='ignore') as f:
        for token in vocab:
            f.write(token + '\n')

    vocab = Vocab(glove_vocab_path)
    np.save(glove_vector_path, vector)
    return vocab, vector
Example #25
def train_model(db_file, entity_db_file, vocab_file, word2vec, **kwargs):
    db = AbstractDB(db_file, 'r')
    entity_db = EntityDB.load(entity_db_file)
    vocab = Vocab.load(vocab_file)

    if word2vec:
        w2vec = ModelReader(word2vec)
    else:
        w2vec = None

    train.train(db, entity_db, vocab, w2vec, **kwargs)
Example #26
class Articles(torch.utils.data.Dataset):
    def __init__(self, test=False, data_dir="data", vocab_path='data/vocab'):
        super(Articles, self).__init__()
        '''Initialization'''
        self.vocab = Vocab(vocab_path, voc_size)
        self.tokenizer = data.get_tokenizer('basic_english')
        self.max_len_story = MAX_LEN_STORY
        self.max_len_highlight = MAX_LEN_HIGHLIGHT

        is_test = {
            False: os.path.join(data_dir, "train.pkl"),
            True: os.path.join(data_dir, "test.pkl")
        }
        self.data_path = is_test.get(test, "Wrong set name.")

        with open(self.data_path, 'rb') as f:
            self.data = load(f)

    def __len__(self):
        '''return the number of articles'''
        return len(self.data)

    def __getitem__(self, idx):
        '''generates one sample of data'''
        X, y = self.data[idx]['story'], self.data[idx]['highlights']
        X_tokenized, y_tokenized = list(map(lambda x: self.tokenize(x),
                                            [X, y]))
        X_padded = self.padding(X_tokenized)
        y_padded = self.padding(y_tokenized, sequence_type="highlight")
        return X_padded, y_padded

    def tokenize(self, sequence):
        '''tokenize a sequence'''
        tokenized_sequence = []
        tokenized_sequence.extend(
            [token for token in self.tokenizer(sequence)])
        tokenized_sequence.append(STOP_TOKEN)
        return tokenized_sequence

    def words_to_index(self, tokenized_sequence):
        '''return list of index of tokens in the sequence'''
        return self.vocab.sequence_2_id(tokenized_sequence)

    def padding(self, sequence, sequence_type="story"):
        '''pad the sequence with the corresponding length'''
        if sequence_type == "story":
            max_len = self.max_len_story
        else:
            max_len = self.max_len_highlight
        if len(sequence) > max_len:
            sequence = sequence[:max_len]
        else:
            sequence += [PAD_TOKEN] * (max_len - len(sequence))
        return sequence
Example #28
    def __init__(self, args):
        super().__init__()
        self.args = args
        self.K = args.K
        self.rnn_hidden = args.rnn_hidden
        self.max_sent_len = args.max_sent_len
        print("loading pretrained emb......")
        self.emb_matrix = np.load(args.dset_dir+'/'+args.dataset+'/embedding.npy')
        print("loading dataset vocab......")
        self.vocab = Vocab(args.dset_dir+'/'+args.dataset+'/vocab.pkl')

        # create embedding layers
        self.emb = nn.Embedding(self.vocab.size, args.emb_dim, padding_idx=constant.PAD_ID)
        self.pos_emb = nn.Embedding(len(constant.POS_TO_ID), args.pos_dim) if args.pos_dim > 0 else None

        # initialize embedding with pretrained word embeddings
        self.init_embeddings()

        # dropout
        self.input_dropout = nn.Dropout(args.input_dropout)

        # define r rc distribution
        self.r_mean_rc = nn.Parameter(torch.randn(self.max_sent_len, self.K))
        self.r_std_rc = nn.Parameter(torch.randn(self.max_sent_len, self.K, self.K))
        self.r_diag_rc = nn.Parameter(torch.randn(self.max_sent_len, self.K))
        # orthogonal initialization r_std_rc
        for i in range(self.max_sent_len):
            nn.init.orthogonal_(self.r_std_rc[i], gain=1)

        # define r ner distribution
        self.r_mean_ner = nn.Parameter(torch.randn(self.max_sent_len, self.K))
        self.r_std_ner = nn.Parameter(torch.randn(self.max_sent_len, self.K, self.K))
        self.r_diag_ner = nn.Parameter(torch.randn(self.max_sent_len, self.K))
        # orthogonal initialization r_std_ner
        for i in range(self.max_sent_len):
            nn.init.orthogonal_(self.r_std_ner[i], gain=1)

        # define encoder
        self.BiLSTM = LSTMRelationModel(args)
        self.hidden2mean_rc = nn.Linear(self.rnn_hidden*2, self.K)
        self.hidden2std_rc = nn.Linear(self.rnn_hidden*2, self.K)
        # ner encoder
        self.hidden2mean_ner = nn.Linear(self.rnn_hidden*2, self.K)
        self.hidden2std_ner = nn.Linear(self.rnn_hidden*2, self.K)

        # decoder
        self.rc_lr = nn.Linear(args.K*2, args.K)
        self.rc_cla = nn.Linear(args.K, len(constant.LABEL_TO_ID))
        self.ner_cla = nn.Linear(args.K, len(constant.BIO_TO_ID))
        self.logsoft_fn = nn.LogSoftmax(dim=3)

        # mse loss 
        self.loss_fn = torch.nn.MSELoss(reduction='sum')
Example #29
    def __init__(self,
                 source_name,
                 target_name,
                 max_length=300,
                 source_vocab=None,
                 target_vocab=None):

        self.data_source = self.read_file(source_name)
        self.data_target = self.read_file(target_name)

        self.max_length = max_length

        self.source_vocab = source_vocab
        if source_vocab is None:
            self.source_vocab = Vocab()
            self.source_vocab.build_vocab([source_name])

        self.target_vocab = target_vocab
        if target_vocab is None:
            self.target_vocab = Vocab()
            self.target_vocab.build_vocab([target_name])
Example #30
    def make_vocab_label(self,
                         sents,
                         vocab_label_init=None):
        if len(sents) == 0:
            return None

        if vocab_label_init:
            vocab_label = deepcopy(vocab_label_init)
        else:
            vocab_label = Vocab()
            none_label = 'O'
            vocab_label.add_word(none_label)

        labels = []
        for sent in sents:
            if sent.has_prds:
                for prop in sent.prd_bio_labels:
                    labels += prop
        cnt = Counter(labels)
        labels = [(w, c) for w, c in cnt.most_common()]

        for label, count in labels:
            vocab_label.add_word(label)

        return vocab_label
Example #31
    def make_vocab_label(self,
                         sents,
                         vocab_label_init=None):
        if len(sents) == 0:
            return None

        if vocab_label_init:
            vocab_label = deepcopy(vocab_label_init)
        else:
            vocab_label = Vocab()
            if self.argv.data_type == 'conll05':
                core_labels = ["A0", "A1", "A2", "A3", "A4", "A5"]
            else:
                core_labels = ["ARG0", "ARG1", "ARG2", "ARG3", "ARG4", "ARG5"]
            for label in core_labels:
                vocab_label.add_word(label)

        bio_labels = []
        for sent in sents:
            for props in sent.prd_bio_labels:
                bio_labels += props
        cnt = Counter(bio_labels)
        bio_labels = [(w, c) for w, c in cnt.most_common()]

        for label, count in bio_labels:
            if not label.endswith('-V') and len(label) > 1:
                vocab_label.add_word(label[2:])

        return vocab_label
Example #32
def build_vocab(db_file, entity_db_file, out_file, **kwargs):
    db = AbstractDB(db_file, 'r')
    entity_db = EntityDB.load(entity_db_file)
    vocab = Vocab.build(db, entity_db, **kwargs)
    vocab.save(out_file)
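
The Vocab classes used across these examples come from different projects and expose different entry points (Vocab.load, Vocab.from_file, Vocab.build, keyword-configured constructors). As a rough orientation only, a minimal token-to-id vocabulary with the pieces most of them assume might look like this (illustrative, not any of the project-specific classes above):

class MinimalVocab:
    """Illustrative only; not one of the project-specific Vocab classes above."""

    def __init__(self, tokens=(), unk_token="<unk>"):
        self.unk_token = unk_token
        self.id_to_token = []
        self.token_to_id = {}
        self.add(unk_token)
        for token in tokens:
            self.add(token)

    @classmethod
    def from_file(cls, path):
        with open(path, encoding="utf-8") as f:
            return cls(line.strip() for line in f if line.strip())

    def add(self, token):
        if token not in self.token_to_id:
            self.token_to_id[token] = len(self.id_to_token)
            self.id_to_token.append(token)

    def __len__(self):
        return len(self.id_to_token)

    def __getitem__(self, token):
        return self.token_to_id.get(token, self.token_to_id[self.unk_token])


vocab = MinimalVocab(["cat", "dog"])
vocab.add("fish")
print(len(vocab), vocab["dog"], vocab["unknown-word"])  # 4 2 0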