Example 1
def _get_data(file, classes, is_file_content=False):
    if not is_file_content:
        with open(file) as f:
            file_content = f.read()
    else:
        file_content = file
    data = [line.strip() for line in file_content.split('\n') if line.strip()]

    rel_labels = {}
    for line in data:
        rel_str = line.rsplit('(', 1)[0]

        entity_str = line.rsplit('(', 1)[-1][:-1]
        ent0_str, ent1_str, *reverse = entity_str.split(',')
        doc_id = ent0_str.split('.', 1)[0]
        ent0 = int(ent0_str.split('.', 1)[-1]) - 1
        ent1 = int(ent1_str.split('.', 1)[-1]) - 1
        if reverse:
            key = (doc_id, (ent1, ent0))
        else:
            key = (doc_id, (ent0, ent1))
        if rel_str not in classes:
            from efficiency.log import show_var
            show_var(['file', 'data'])
            print('[Warn] rel_str({}) is not among pre-defined classes({})'.
                  format(line, classes))
        else:
            rel_labels[key] = rel_str
    # assert len(rel_labels) == len(data), "# lines does not match with # data"

    return rel_labels
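A minimal, hypothetical invocation of _get_data; the line format "Label(docID.i,docID.j[,R])" is inferred from the parsing above, and the sample labels are made up:

content = "Works_For(d1.3,d1.5)\nLives_In(d1.2,d1.7,R)"
labels = _get_data(content, classes={'Works_For', 'Lives_In'}, is_file_content=True)
# -> {('d1', (2, 4)): 'Works_For', ('d1', (6, 1)): 'Lives_In'}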
Example 2
 def rename_files(self, prefix='My_mp3_'):
     from os import path
     from os.path import dirname, basename
     from efficiency.function import shell
     from efficiency.log import show_var

     for f in self.files:
         dir = dirname(f)
         fname = basename(f)
         new_fname = prefix + fname
         new_f = path.join(dir, new_fname)
         cmd = 'mv "{f}" "{new_f}"'.format(f=f, new_f=new_f)
         show_var(['cmd'])
         shell(cmd)
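For comparison, a shell-free sketch of the same rename using os.rename; it assumes, as above, that the files are given as a list of paths:

import os

def rename_files(files, prefix='My_mp3_'):
    for f in files:
        new_f = os.path.join(os.path.dirname(f), prefix + os.path.basename(f))
        os.rename(f, new_f)  # same effect as the quoted `mv` command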
Example 3
 def count_authors(self):
     authors = [i['authors'].split(', ') for i in self.papers]
     from efficiency.function import flatten_list
     authors = flatten_list(authors)
     from collections import Counter
     cnt = Counter(authors)
     from efficiency.log import show_var
     show_var(['cnt'])
     import pdb
     pdb.set_trace()
Example 4
def get_total_num_lines_in_large_files(file_list, verbose=True):
    from efficiency.function import shell
    num_lines = []
    for f in sorted(file_list):
        cmd = "wc -l {} | cut -d ' ' -f 1".format(f)
        stdout, stderr = shell(cmd)
        num_line = int(stdout)
        num_lines.append(num_line)
        if verbose:
            from efficiency.log import show_var
            show_var(['cmd', 'num_line'], joiner='\t')
    return sum(num_lines)
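A dependency-free sketch that computes the same total without shelling out to `wc -l` (a rough equivalent, not the function above):

def get_total_num_lines(file_list):
    total = 0
    for path in sorted(file_list):
        with open(path, 'rb') as f:
            # note: unlike `wc -l`, this also counts a final line with no trailing newline
            total += sum(1 for _ in f)
    return total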
Example 5
    def _get_rnn_enc2(self, encoding, length, mask, hx, show_net=False):
        if show_net:
            print("<")
            print("[Net] _get_rnn_enc2")
            show_var(["self.dropout_rnn2_in"])
        # prepare packed_sequence
        encoding = self.dropout_rnn2_in(encoding)

        if length is not None:
            seq_input, hx, rev_order, mask = utils.prepare_rnn_seq(
                encoding, length, hx=hx, masks=mask, batch_first=True)
            seq_output, hn = self.rnn2(seq_input, hx=hx)
            output, hn = utils.recover_rnn_seq(seq_output,
                                               rev_order,
                                               hx=hn,
                                               batch_first=True)
            if show_net:
                print("utils.prepare_rnn_seq()")
                show_var(["self.rnn2"])
        else:
            # output from rnn_out [batch, length, hidden_size]
            output, hn = self.rnn2(encoding, hx=hx)
            if show_net:
                show_var(["self.rnn2"])

        output = self.dropout_rnn2_out(output)
        if show_net:
            show_var(["self.dropout_rnn2_out"])

        return output, hn
Example 6
def check():
    import json

    file = '/home/ubuntu/proj/1908_clickbait/bitly/bitly.json'
    with open(file) as f:
        data = json.load(f)
    show_var(['len(data)', 'list(data.items())[99]'])
    titles = []
    for item in data.values():
        titles.append(item['title'])


    get_most_common_in_list(titles, most_common_n=10)

    bad_titles = {None, '金沙澳门官方网址_', 'Featured Content on Myspace',
                  'Google Maps', 'Games | SYFY WIRE', 'Trending on Offbeat',
                  'Page Not Found', 'YouTube',
                  'QuickSnapper.com domain is for sale | Buy with Epik.com',
                  'Twitter / Account Suspended',
                  'Jason, 443 AI (@JasonInAI) | Twitter',
                  'interactive investor: low cost online trading & investment platform',
                  'Yahoo',
                  'TechRapidly- Blog Provide Tech and Business Tips & Solutions',
                  'Tech & Startup Events In New York - GarysGuide (#1 Resource for NYC Tech)',
                  'The Marmoset Song | quietube', 'Login on Twitter',
                  'Prepare your taste buds...', 'Good night, Posterous',
                  'MSN | Outlook, Office, Skype, Bing, Breaking News, and Latest Videos',
                  'Venture Capitalists Need Money, Too – Gigaom',
                  '502 Bad Gateway', '2008Q4a Home Tour Survey',
                  'Are you human, bot or alien? | mobile9', 'Twitpic',
                  'When Robot Programmers get bored - YouTube',
                  'Account Suspended',
                  'Free Web Hosting - Your Website need to be migrated',
                  '404 Not Found - Web Partner',
                  'Resort | Free Parking | Trump Las Vegas, NV - Booking.com',
                  'TVShowsOnDVD.com - Goodbye',
                  'Get Satisfaction - Customer Communities For Social Support, Social Marketing & Customer Feedback',
                  'Google',
                  'Warning! | There might be a problem with the requested link',
                  'Designer Clothes, Shoes & Bags for Women | SSENSE',
                  'NRKbeta',
                  "Movie Review: Paul Farhi Reviews 'Yoo-Hoo, Mrs. Goldberg' - washingtonpost.com",
                  'ogmaciel.com is coming soon',
                  'Nico Lumma - Hamburg, Deutschland | about.me',
                  'Abattement Fiscal'}
    good_data = {k:v for k,v in data.items() if 'nytimes' in v['long_url']}
    # good_data = {k:v for k,v in data.items() if v['title'] not in bad_titles}
    show_var(['len(good_data)'])
    import pdb;
    pdb.set_trace()
Example 7
    def _get_rnn_enc(self, input, length, mask, hx, show_net=False):
        if show_net:
            print('[Net] _get_rnn_enc')
            show_var(["self.dropout_rnn_in"])

        # apply dropout rnn input
        input = self.dropout_rnn_in(input)

        # use lstm or cnn to encode the sentence at token level
        if self.encoder_mode == 'lstm':
            # prepare packed_sequence
            if length is not None:
                seq_input, hx, rev_order, mask = utils.prepare_rnn_seq(
                    input, length, hx=hx, masks=mask, batch_first=True)
                seq_output, hn = self.sent_rnn(seq_input, hx=hx)
                output, hn = utils.recover_rnn_seq(seq_output,
                                                   rev_order,
                                                   hx=hn,
                                                   batch_first=True)
                if show_net:
                    print("utils.prepare_rnn_seq()")
                    show_var(["self.sent_rnn"])
            else:
                output, hn = self.sent_rnn(input, hx=hx)
                if show_net:
                    show_var(["self.sent_rnn"])
        else:
            _, _, _, mask = utils.prepare_rnn_seq(input,
                                                  length,
                                                  hx=hx,
                                                  masks=mask,
                                                  batch_first=True)
            if length is not None:
                max_len = length.max()
                input = input[:, :max_len, :]

            # first transpose to [batch, hidden_size, length]
            input = input.transpose(1, 2)
            # then send into the first cnn layer
            output = torch.relu(self.sent_conv1d_layer1(input))
            # then second cnn layer
            output = torch.relu(self.sent_conv1d_layer2(output))
            # transpose to [batch, length, hidden_size]
            output = output.transpose(1, 2)

            # output = torch.cat([input.transpose(1, 2), output], dim=2)

            hn = None

        # apply dropout for the output of rnn
        output = self.dropout_rnn_out(output)
        if show_net:
            show_var(["self.dropout_rnn_out"])

        return output, hn
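A hedged sketch of the pack/unpack round-trip that utils.prepare_rnn_seq and utils.recover_rnn_seq appear to wrap, written with torch's built-in utilities; the sizes and the LSTM are illustrative only:

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

batch, max_len, dim, hidden = 3, 6, 8, 16
x = torch.randn(batch, max_len, dim)
length = torch.tensor([6, 4, 2])               # sorted descending, as enforce_sorted expects
rnn = nn.LSTM(dim, hidden, batch_first=True)

packed = pack_padded_sequence(x, length, batch_first=True, enforce_sorted=True)
packed_out, hn = rnn(packed)
output, _ = pad_packed_sequence(packed_out, batch_first=True)
print(output.shape)                            # torch.Size([3, 6, 16])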
Example 8
def main():
    import os
    import json
    from efficiency.log import fwrite

    data = {}
    dir = '/home/ubuntu/proj/1908_clickbait/bitly'
    file_filter = lambda f: f.startswith('bitly_') and f.endswith('.json')

    fm = FileManager(dir=dir, file_filter=file_filter)
    print(json.dumps(fm.files, indent=4))
    for file in fm.files:
        with open(file) as f: content = json.load(f)
        data.update(content)
        show_var(
            ["file", "len(content)", "len(data)", "list(content.keys())[:3]"])

    data = dict(sorted(data.items()))
    fwrite(json.dumps(data, indent=4), os.path.join(dir, 'bitly.json'))
Example 9
    def _get_rnn_output(self,
                        input_word_orig,
                        input_word,
                        input_char,
                        mask=None,
                        length=None,
                        hx=None,
                        show_net=False):

        input, length = self._get_word_enc(input_word_orig,
                                           input_word,
                                           input_char,
                                           mask=mask,
                                           length=length,
                                           show_net=show_net)

        output, hn = self._get_rnn_enc(input,
                                       length,
                                       mask,
                                       hx,
                                       show_net=show_net)

        if self.tag_space:
            # [batch, length, tag_space]
            output = self.dropout_tag(F.elu(self.lstm_to_tag_space(output)))
            if show_net:
                print("[Net] to_tag")
                show_var(["self.lstm_to_tag_space"])
                show_var(["F.elu"])
                show_var(["self.dropout_tag"])

        return output, hn, mask, length
Example 10
def example():
    soup = BeautifulSoup(features="html.parser")
    ls = []
    ls += ["Hello world "]

    tag = soup_tag(soup, "red", 'font', style="border:2px solid Tomato;")
    ls += [tag]

    tag = soup_tag(soup, "maggie", "sub")
    ls += [tag]

    tag = soup_tag(soup, "important things", "b")
    ls += [tag]

    ls += [soup_tag(soup, None, "br")]
    ls += [" are transient."]
    ls += [soup.new_tag("br")]
    ls += ["end."]

    soup = soupls2html(soup, ls)

    show_var(["soup"])  # , "soup.prettify()"

    return soup
Example 11
    def forward(self, q, k, v, attn_mask=None, show_net=False):
        assert len(q.size()) == 3

        if self.ff_layers == 1:
            q_out, k_out = self._fc(self.linear1, q, k, use_elu=False)
            if show_net:
                show_var(["self.linear1"])

        elif self.ff_layers == 2:
            q_out, k_out = self._fc(self.linear1, q, k)
            q_out, k_out = self._fc(self.linear2, q_out, k_out, use_elu=False)
            if show_net:
                show_var(["self.linear1", "self.linear2"])


        if show_net:
            print("bmm --> self.dropout")
            show_var(["self.comb_att_n_init_adj"])

        attn = torch.bmm(q_out, k_out.transpose(1, 2))  # / self.temper

        # cos sim needs normalization

        if attn_mask is not None:
            assert attn_mask.size() == attn.size(), \
                'Attention mask shape {} mismatch ' \
                'with Attention logit tensor shape ' \
                '{}.'.format(attn_mask.size(), attn.size())

            attn.masked_fill_(attn_mask, -float('inf'))

        if attn_mask is not None:
            attn.data.masked_fill_(attn_mask, 0)  # convert NaN to 0

        attn = self.dropout(attn)
        import pdb; pdb.set_trace()
        output = self.comb_att_n_init_adj(attn, v)

        return output, attn
Example 12
# Assumed context, inferred from the loop below: `counter` maps (x, y) pairs to counts.
import numpy as np
import matplotlib.pyplot as plt
from efficiency.log import show_var

x, y = zip(*counter)  # scatter coordinates from the counter keys
cnt = counter.values()
radius = np.array(list(cnt)) / 2
area = radius**2 * np.pi
# Fixing random state for reproducibility
np.random.seed(19680801)

colors = np.random.rand(len(counter))

plt.scatter(x, y, s=area, c=colors, alpha=0.5)
plt.xlabel('# Triples of the Input Graph in WebNLG 2017')
plt.ylabel('# Variations of Generated Text')
plt.title("Text Diversity Generated by CycleCVAE")

x2argmax_y = {}
for (x, y), cnt in counter.items():
    if cnt < 10:
        continue
    if x in x2argmax_y:
        prev_y, prev_cnt = x2argmax_y[x]
        if cnt < prev_cnt:
            continue
    x2argmax_y[x] = (y, cnt)
x = list(x2argmax_y.keys())
y, cnt = zip(*x2argmax_y.values())
show_var(['x', 'y'])
import pdb
pdb.set_trace()
plt.plot(x, y)

plt.show()
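For reference, a hypothetical `counter` that would make the snippet above runnable; the real one presumably counts (number-of-triples, number-of-text-variations) pairs over the WebNLG outputs:

from collections import Counter
counter = Counter({(1, 2): 40, (1, 3): 12, (2, 5): 25, (3, 4): 9})  # made-up frequencies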
Example 13
    stdout, stderr = shell(cmd)

    stdout = stdout.strip()
    mem = int(stdout) if stdout != b'' else None

    return mem


def debug(what_to_debug=''):
    if what_to_debug:
        print("[Info] start debugging {}".format(what_to_debug))

    import pdb
    pdb.set_trace()

    # you can use "c" to continue, "p variable", "p locals()", "n" for next
    # you can use "!a += 1" to change a variable
    # you can use "import code; code.interact(local=locals())" to drop into an
    # interactive console with all variables


if __name__ == "__main__":
    a = "something"
    b = 1
    show_var(["a", "b"], joiner=', ')

    debug("show_var")

    show_var(["c"])
Example 14
def main():
    # Arguments parser
    parser = argparse.ArgumentParser(description='Tuning with DNN Model for NER')
    # Model Hyperparameters
    parser.add_argument('--mode', choices=['RNN', 'LSTM', 'GRU'], help='architecture of rnn', default='LSTM')
    parser.add_argument('--encoder_mode', choices=['cnn', 'lstm'], help='Encoder type for sentence encoding',
                        default='lstm')
    parser.add_argument('--char_method', choices=['cnn', 'lstm'], help='Method to create character-level embeddings',
                        required=True)
    parser.add_argument('--hidden_size', type=int, default=128, help='Number of hidden units in RNN for sentence level')
    parser.add_argument('--char_hidden_size', type=int, default=30, help='Output character-level embeddings size')
    parser.add_argument('--char_dim', type=int, default=30, help='Dimension of Character embeddings')
    parser.add_argument('--tag_space', type=int, default=0, help='Dimension of tag space')
    parser.add_argument('--num_layers', type=int, default=1, help='Number of layers of RNN')
    parser.add_argument('--dropout', choices=['std', 'gcn'], help='Dropout method',
                        default='gcn')
    parser.add_argument('--p_em', type=float, default=0.33, help='dropout rate for input embeddings')
    parser.add_argument('--p_in', type=float, default=0.33, help='dropout rate for input of RNN model')
    parser.add_argument('--p_rnn', nargs=3, type=float, required=True, help='dropout rate for RNN')
    parser.add_argument('--p_tag', type=float, default=0.33, help='dropout rate for output layer')
    parser.add_argument('--bigram', action='store_true', help='bi-gram parameter for CRF')

    parser.add_argument('--adj_attn', choices=['cossim', 'flex_cossim', 'flex_cossim2', 'concat', '', 'multihead'],
                        default='')

    # Data loading and storing params
    parser.add_argument('--embedding_dict', help='path for embedding dict')
    parser.add_argument('--dataset_name', type=str, default='alexa', help='Which dataset to use')
    parser.add_argument('--train', type=str, required=True, help='Path of train set')
    parser.add_argument('--dev', type=str, required=True, help='Path of dev set')
    parser.add_argument('--test', type=str, required=True, help='Path of test set')
    parser.add_argument('--results_folder', type=str, default='results', help='The folder to store results')
    parser.add_argument('--alphabets_folder', type=str, default='data/alphabets',
                        help='The folder to store alphabets files')

    # Training parameters
    parser.add_argument('--cuda', action='store_true', help='whether using GPU')
    parser.add_argument('--num_epochs', type=int, default=100, help='Number of training epochs')
    parser.add_argument('--batch_size', type=int, default=16, help='Number of sentences in each batch')
    parser.add_argument('--learning_rate', type=float, default=0.001, help='Base learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.95, help='Decay rate of learning rate')
    parser.add_argument('--schedule', type=int, default=3, help='schedule for learning rate decay')
    parser.add_argument('--gamma', type=float, default=0.0, help='weight for l2 regularization')
    parser.add_argument('--max_norm', type=float, default=1., help='Max norm for gradients')
    parser.add_argument('--gpu_id', type=int, nargs='+', required=True, help='which gpu to use for training')

    parser.add_argument('--learning_rate_gcn', type=float, default=5e-4, help='Base learning rate for the GCN')
    parser.add_argument('--gcn_warmup', type=int, default=200, help='Number of warm-up steps for the GCN learning rate')
    parser.add_argument('--pretrain_lstm', type=float, default=10, help='Number of epochs to train the LSTM before the GCN is updated')

    parser.add_argument('--adj_loss_lambda', type=float, default=0.)
    parser.add_argument('--lambda1', type=float, default=1.)
    parser.add_argument('--lambda2', type=float, default=0.)
    parser.add_argument('--seed', type=int, default=None)

    # Misc
    parser.add_argument('--embedding', choices=['glove', 'senna', 'alexa'], help='Embedding for words', required=True)
    parser.add_argument('--restore', action='store_true', help='whether restore from stored parameters')
    parser.add_argument('--save_checkpoint', type=str, default='', help='the path to save the model')
    parser.add_argument('--o_tag', type=str, default='O', help='The default tag for outside tag')
    parser.add_argument('--unk_replace', type=float, default=0., help='The rate to replace a singleton word with UNK')
    parser.add_argument('--evaluate_raw_format', action='store_true', help='The tagging format for evaluation')


    parser.add_argument('--eval_type', type=str, default="micro_f1", choices=['micro_f1', 'acc'])
    parser.add_argument('--show_network', action='store_true', help='whether to display the network structure')
    parser.add_argument('--smooth', action='store_true', help='whether to skip all pdb break points')

    parser.add_argument('--uid', type=str, default='temp')
    parser.add_argument('--misc', type=str, default='')

    args = parser.parse_args()
    show_var(['args'])

    uid = args.uid
    results_folder = args.results_folder
    dataset_name = args.dataset_name
    use_tensorboard = True

    save_dset_dir = '{}../dset/{}/graph'.format(results_folder, dataset_name)
    result_file_path = '{}/{dataset}_{uid}_result'.format(results_folder, dataset=dataset_name, uid=uid)

    save_loss_path = '{}/{dataset}_{uid}_loss'.format(results_folder, dataset=dataset_name, uid=uid)
    save_lr_path = '{}/{dataset}_{uid}_lr'.format(results_folder, dataset=dataset_name, uid='temp')
    save_tb_path = '{}/tensorboard/'.format(results_folder)

    logger = get_logger("NERCRF")
    loss_recorder = LossRecorder(uid=uid)
    record = TensorboardLossRecord(use_tensorboard, save_tb_path, uid=uid)

    # rename the parameters
    mode = args.mode
    encoder_mode = args.encoder_mode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    char_hidden_size = args.char_hidden_size
    char_method = args.char_method
    learning_rate = args.learning_rate
    momentum = 0.9
    decay_rate = args.decay_rate
    gamma = args.gamma
    max_norm = args.max_norm
    schedule = args.schedule
    dropout = args.dropout
    p_em = args.p_em
    p_rnn = tuple(args.p_rnn)
    p_in = args.p_in
    p_tag = args.p_tag
    unk_replace = args.unk_replace
    bigram = args.bigram
    embedding = args.embedding
    embedding_path = args.embedding_dict
    evaluate_raw_format = args.evaluate_raw_format
    o_tag = args.o_tag
    restore = args.restore
    save_checkpoint = args.save_checkpoint
    alphabets_folder = args.alphabets_folder
    use_elmo = False
    p_em_vec = 0.
    graph_model = 'gnn'
    coref_edge_filt = ''

    learning_rate_gcn = args.learning_rate_gcn
    gcn_warmup = args.gcn_warmup
    pretrain_lstm = args.pretrain_lstm

    adj_loss_lambda = args.adj_loss_lambda
    lambda1 = args.lambda1
    lambda2 = args.lambda2

    if args.smooth:
        import pdb
        pdb.set_trace = lambda: None

    misc = "{}".format(str(args.misc))

    score_file = "{}/{dataset}_{uid}_score".format(results_folder, dataset=dataset_name, uid=uid)

    for folder in [results_folder, alphabets_folder, save_dset_dir]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    def set_seed(seed):
        if not seed:
            seed = int(show_time())
        print("[Info] seed set to: {}".format(seed))
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True

    set_seed(args.seed)

    embedd_dict, embedd_dim = utils.load_embedding_dict(embedding, embedding_path)

    logger.info("Creating Alphabets")
    word_alphabet, char_alphabet, ner_alphabet = conll03_data.create_alphabets(
        "{}/{}/".format(alphabets_folder, dataset_name), train_path, data_paths=[dev_path, test_path],
        embedd_dict=embedd_dict, max_vocabulary_size=50000)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())

    logger.info("Reading Data")
    device = torch.device('cuda') if args.cuda else torch.device('cpu')
    print(device)

    data_train = conll03_data.read_data(train_path, word_alphabet, char_alphabet,
                                        ner_alphabet,
                                        graph_model, batch_size, ori_order=False,
                                        total_batch="{}x".format(num_epochs + 1),
                                        unk_replace=unk_replace, device=device,
                                        save_path=save_dset_dir + '/train', coref_edge_filt=coref_edge_filt
                                        )
    # , shuffle=True,
    num_data = data_train.data_len
    num_labels = ner_alphabet.size()
    graph_types = data_train.meta_info['graph_types']

    data_dev = conll03_data.read_data(dev_path, word_alphabet, char_alphabet,
                                      ner_alphabet,
                                      graph_model, batch_size, ori_order=True, unk_replace=unk_replace, device=device,
                                      save_path=save_dset_dir + '/dev',
                                      coref_edge_filt=coref_edge_filt)

    data_test = conll03_data.read_data(test_path, word_alphabet, char_alphabet,
                                       ner_alphabet,
                                       graph_model, batch_size, ori_order=True, unk_replace=unk_replace, device=device,
                                       save_path=save_dset_dir + '/test',
                                       coref_edge_filt=coref_edge_filt)

    writer = CoNLL03Writer(word_alphabet, char_alphabet, ner_alphabet)

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[conll03_data.UNK_ID, :] = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if word in embedd_dict:
                embedding = embedd_dict[word]
            elif word.lower() in embedd_dict:
                embedding = embedd_dict[word.lower()]
            else:
                embedding = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    logger.info("constructing network...")

    char_dim = args.char_dim
    window = 3
    num_layers = args.num_layers
    tag_space = args.tag_space
    initializer = nn.init.xavier_uniform_

    p_gcn = [0.5, 0.5]

    d_graph = 256
    d_out = 256
    d_inner_hid = 128
    d_k = 32
    d_v = 32
    n_head = 4
    n_gcn_layer = 1

    p_rnn2 = [0.0, 0.5, 0.5]

    adj_attn = args.adj_attn
    mask_singles = True
    post_lstm = 1
    position_enc_mode = 'none'

    adj_memory = False

    if dropout == 'gcn':
        network = BiRecurrentConvGraphCRF(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(),
                                          char_hidden_size, window, mode, encoder_mode, hidden_size, num_layers,
                                          num_labels,
                                          graph_model, n_head, d_graph, d_inner_hid, d_k, d_v, p_gcn, n_gcn_layer,
                                          d_out, post_lstm=post_lstm, mask_singles=mask_singles,
                                          position_enc_mode=position_enc_mode, adj_attn=adj_attn,
                                          adj_loss_lambda=adj_loss_lambda,
                                          tag_space=tag_space, embedd_word=word_table,
                                          use_elmo=use_elmo, p_em_vec=p_em_vec, p_em=p_em, p_in=p_in, p_tag=p_tag,
                                          p_rnn=p_rnn, p_rnn2=p_rnn2,
                                          bigram=bigram, initializer=initializer)

    elif dropout == 'std':
        network = BiRecurrentConvCRF(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(), char_hidden_size,
                                     window, mode, encoder_mode, hidden_size, num_layers, num_labels,
                                     tag_space=tag_space, embedd_word=word_table, use_elmo=use_elmo, p_em_vec=p_em_vec,
                                     p_em=p_em, p_in=p_in, p_tag=p_tag, p_rnn=p_rnn, bigram=bigram,
                                     initializer=initializer)

    # whether restore from trained model
    if restore:
        network.load_state_dict(torch.load(save_checkpoint + '_best.pth'))  # load trained model

    logger.info("cuda()ing network...")

    network = network.to(device)

    if dataset_name == 'conll03' and data_dev.data_len > 26:
        sample = data_dev.pad_batch(data_dev.dataset[25:26])
    else:
        sample = data_dev.pad_batch(data_dev.dataset[:1])
    plot_att_change(sample, network, record, save_tb_path + 'att/', uid='temp', epoch=0, device=device,
                    word_alphabet=word_alphabet, show_net=args.show_network,
                    graph_types=data_train.meta_info['graph_types'])

    logger.info("finished cuda()ing network...")

    lr = learning_rate
    lr_gcn = learning_rate_gcn
    optim = Optimizer('sgd', 'adam', network, dropout, lr=learning_rate,
                      lr_gcn=learning_rate_gcn,
                      wd=0., wd_gcn=0., momentum=momentum, lr_decay=decay_rate, schedule=schedule,
                      gcn_warmup=gcn_warmup,
                      pretrain_lstm=pretrain_lstm)
    nn.utils.clip_grad_norm_(network.parameters(), max_norm)
    logger.info(
        "Network: %s, encoder_mode=%s, num_layer=%d, hidden=%d, char_hidden_size=%d, char_method=%s, tag_space=%d, crf=%s" % \
        (mode, encoder_mode, num_layers, hidden_size, char_hidden_size, char_method, tag_space,
         'bigram' if bigram else 'unigram'))
    logger.info("training: l2: %f, (#training data: %d, batch: %d, unk replace: %.2f)" % (
        gamma, num_data, batch_size, unk_replace))
    logger.info("dropout(in, out, rnn): (%.2f, %.2f, %s)" % (p_in, p_tag, p_rnn))

    num_batches = num_data // batch_size + 1
    dev_f1 = 0.0
    dev_acc = 0.0
    dev_precision = 0.0
    dev_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0
    best_test_f1 = 0.0
    best_test_acc = 0.0
    best_test_precision = 0.0
    best_test_recall = 0.0
    best_test_epoch = 0.0

    loss_recorder.start(save_loss_path, mode='w', misc=misc)
    fwrite('', save_lr_path)
    fwrite(json.dumps(vars(args)) + '\n', result_file_path)

    for epoch in range(1, num_epochs + 1):
        show_var(['misc'])

        lr_state = 'Epoch %d (uid=%s, lr=%.2E, lr_gcn=%.2E, decay rate=%.4f): ' % (
            epoch, uid, Decimal(optim.curr_lr), Decimal(optim.curr_lr_gcn), decay_rate)
        print(lr_state)
        fwrite(lr_state[:-2] + '\n', save_lr_path, mode='a')

        train_err = 0.
        train_err2 = 0.
        train_total = 0.

        start_time = time.time()
        num_back = 0
        network.train()
        for batch_i in range(1, num_batches + 1):

            batch_doc = data_train.next()
            char, word, posi, labels, feats, adjs, words_en = [batch_doc[i] for i in [
                "chars", "word_ids", "posi", "ner_ids", "feat_ids", "adjs", "words_en"]]

            sent_word, sent_char, sent_labels, sent_mask, sent_length, _ = network._doc2sent(
                word, char, labels)

            optim.zero_grad()

            adjs_into_model = adjs if adj_memory else adjs.clone()

            loss, (ner_loss, adj_loss) = network.loss(None, word, char, adjs_into_model, labels,
                                                      graph_types=graph_types, lambda1=lambda1, lambda2=lambda2)

            # loss = network.loss(_, sent_word, sent_char, sent_labels, mask=sent_mask)
            loss.backward()
            optim.step()

            with torch.no_grad():
                num_inst = sent_mask.size(0)
                train_err += ner_loss * num_inst
                train_err2 += adj_loss * num_inst
                train_total += num_inst

            time_ave = (time.time() - start_time) / batch_i
            time_left = (num_batches - batch_i) * time_ave

            # update log
            if batch_i % 20 == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss1: %.4f, loss2: %.4f, time left (estimated): %.2fs' % (
                    batch_i, num_batches, train_err / train_total, train_err2 / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

            optim.update(epoch, batch_i, num_batches, network)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print('train: %d loss: %.4f, loss2: %.4f, time: %.2fs' % (
            num_batches, train_err / train_total, train_err2 / train_total, time.time() - start_time))

        # evaluate performance on dev data
        with torch.no_grad():
            network.eval()
            tmp_filename = "{}/{dataset}_{uid}_output_dev".format(results_folder, dataset=dataset_name, uid=uid)

            writer.start(tmp_filename)

            for batch in data_dev:
                char, word, posi, labels, feats, adjs, words_en = [batch[i] for i in [
                    "chars", "word_ids", "posi", "ner_ids", "feat_ids", "adjs", "words_en"]]
                sent_word, sent_char, sent_labels, sent_mask, sent_length, _ = network._doc2sent(
                    word, char, labels)

                preds, _ = network.decode(
                    None, word, char, adjs.clone(), target=labels, leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS,
                    graph_types=graph_types)
                # preds, _ = network.decode(_, sent_word, sent_char, target=sent_labels, mask=sent_mask,
                #                           leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)
                writer.write(sent_word.cpu().numpy(), preds.cpu().numpy(), sent_labels.cpu().numpy(),
                             sent_length.cpu().numpy())
            writer.close()


            if args.eval_type == "acc":
                acc, precision, recall, f1 = evaluate_tokenacc(tmp_filename)
                f1 = acc
            else:
                acc, precision, recall, f1 = evaluate(tmp_filename, score_file, evaluate_raw_format, o_tag)

            print('dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (acc, precision, recall, f1))

            # plot loss and attention
            record.plot_loss(epoch, train_err / train_total, f1)

            plot_att_change(sample, network, record, save_tb_path + 'att/', uid="{}_{:03d}".format(uid, epoch),
                            epoch=epoch, device=device,
                            word_alphabet=word_alphabet, show_net=False, graph_types=graph_types)

            if dev_f1 < f1:
                dev_f1 = f1
                dev_acc = acc
                dev_precision = precision
                dev_recall = recall
                best_epoch = epoch

                # evaluate on test data when better performance detected
                tmp_filename = "{}/{dataset}_{uid}_output_test".format(results_folder, dataset=dataset_name,
                                                                       uid=uid)
                writer.start(tmp_filename)

                for batch in data_test:
                    char, word, posi, labels, feats, adjs, words_en = [batch[i] for i in [
                        "chars", "word_ids", "posi", "ner_ids", "feat_ids", "adjs", "words_en"]]
                    sent_word, sent_char, sent_labels, sent_mask, sent_length, _ = network._doc2sent(
                        word, char, labels)

                    preds, _ = network.decode(
                        None, word, char, adjs.clone(), target=labels, leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS,
                        graph_types=graph_types)
                    # preds, _ = network.decode(_, sent_word, sent_char, target=sent_labels, mask=sent_mask,
                    #                           leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS)

                    writer.write(sent_word.cpu().numpy(), preds.cpu().numpy(), sent_labels.cpu().numpy(),
                                 sent_length.cpu().numpy())
                writer.close()

                if args.eval_type == "acc":
                    test_acc, test_precision, test_recall, test_f1 = evaluate_tokenacc(tmp_filename)
                    test_f1 = test_acc
                else:
                    test_acc, test_precision, test_recall, test_f1 = evaluate(tmp_filename, score_file, evaluate_raw_format, o_tag)

                if best_test_f1 < test_f1:
                    best_test_acc, best_test_precision, best_test_recall, best_test_f1 = test_acc, test_precision, test_recall, test_f1
                    best_test_epoch = epoch

                # save the model parameters
                if save_checkpoint:
                    torch.save(network.state_dict(), save_checkpoint + '_best.pth')

            print("best dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (
                dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
            print("best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (
                test_acc, test_precision, test_recall, test_f1, best_epoch))
            print("overall best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (
                best_test_acc, best_test_precision, best_test_recall, best_test_f1, best_test_epoch))

        # optim.update(epoch, 1, num_batches, network)
        loss_recorder.write(epoch, train_err / train_total, train_err2 / train_total,
                            Decimal(optim.curr_lr), Decimal(optim.curr_lr_gcn), f1, best_test_f1, test_f1)
    with open(result_file_path, 'a') as ofile:
        ofile.write("best dev  acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)\n" % (
            dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
        ofile.write("best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)\n" % (
            test_acc, test_precision, test_recall, test_f1, best_epoch))
        ofile.write("overall best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)\n\n" % (
            best_test_acc, best_test_precision, best_test_recall, best_test_f1, best_test_epoch))

    record.close()

    print('Training finished!')
Example 15
    def _get_gcn_output(self,
                        input_word_orig,
                        input_word,
                        input_char,
                        adjs,
                        target=None,
                        mask=None,
                        length=None,
                        hx=None,
                        leading_symbolic=0,
                        return_edge=False,
                        show_net=False,
                        graph_types=['coref']):
        if "wonderful" in graph_types:
            gold_adj = adjs[:, -1, :].clone()
            gnn_adjs = adjs[:, :-1, :]

        mask_singles = self.mask_singles

        assert len(input_word.size()) == 3, "the input is not document level"
        # input_word is the packed sents [n_sent, sent_len]
        input_word, input_char, target, sent_mask, length, doc_n_sent = self._doc2sent(
            input_word, input_char, target, show_net=show_net)

        # input: [n_sent, sent_len, enc_dim]
        input, length = self._get_word_enc(input_word_orig,
                                           input_word,
                                           input_char,
                                           mask=sent_mask,
                                           length=length,
                                           show_net=show_net)

        # output from rnn [n_sent, sent_len, enc_dim]
        sent_output, hn = self._get_rnn_enc(input,
                                            length,
                                            sent_mask,
                                            hx,
                                            show_net=show_net)

        # flatten sents to words [batch, n_word, dim]
        # mask for packed_doc [batch, n_word]
        output, doc_word_mask = self._sent2word(sent_output,
                                                sent_mask,
                                                doc_n_sent,
                                                show_net=show_net)

        # enc for non-repetitive words

        if mask_singles:
            if show_net:
                print("[Net] Block singles from here.")

            coref_ix = 0
            # single is 1, repetitive word is 0
            single_mask = gnn_adjs[:,
                                   coref_ix].sum(-1,
                                                 keepdim=True).eq(0).float()
            sent_single_mask = self._word2sent(single_mask,
                                               doc_word_mask,
                                               length,
                                               sent_mask,
                                               show_net=show_net)
            singles = sent_output * sent_single_mask.expand_as(sent_output)
            if self.tag_space:
                # [batch, length, tag_space]
                singles = self.dropout_tag(
                    F.elu(self.lstm_to_tag_space(singles)))
                if show_net:
                    print("singles -> self.lstm_to_tag_space")
                singles = singles * sent_single_mask.expand_as(singles)

            # [batch, n_word, d_graph]
            output = output * (1 - single_mask).expand_as(output)

        # go thru gcn [batch, n_word, d_graph]
        h_gcn, *_ = self.gcn(output,
                             gnn_adjs,
                             doc_word_mask,
                             return_edge=return_edge,
                             show_net=show_net)

        output = self._word2sent(h_gcn,
                                 doc_word_mask,
                                 length,
                                 sent_mask,
                                 show_net=show_net)

        if self.post_lstm:
            # output from rnn [n_sent, sent_len, enc_dim]
            output, hn = self._get_rnn_enc2(output,
                                            length,
                                            sent_mask,
                                            hx,
                                            show_net=show_net)

        # output from rnn_out [batch, length, tag_space]
        output = self.dropout_tag(F.elu(self.to_tag_space(output)))
        if show_net:
            print("<")
            print("[Net] to_tag")
            show_var(["self.to_tag_space"])
            show_var(["F.elu"])
            show_var(["self.dropout_tag"])
            print(">")

        if mask_singles:
            output = output * (1 - sent_single_mask).expand_as(output)
            output = output + singles  # repetitive word enc + single word enc
            if show_net:
                print("[Net] output + singles")

        if length is not None:
            max_len = length.max()
            target = target[:, :max_len]

        adj_loss = self._adj_loss(
            gnn_adjs[:, 0, :], gold_adj) if "wonderful" in graph_types else 0
        return output, target, sent_mask, length, adj_loss
Example 16
    def _get_word_enc(self,
                      input_word_orig,
                      input_word,
                      input_char,
                      mask=None,
                      length=None,
                      show_net=False):
        # derive length from mask
        # we do not derive mask from length, for special reasons;
        # thus, always provide mask when it is needed.
        if length is None and mask is not None:
            length = mask.sum(dim=1).long()

        if self.p_em_vec:
            word = embedded_dropout(
                self.word_embedd,
                input_word,
                dropout=self.p_em_vec if self.training else 0)
            char = embedded_dropout(
                self.char_embedd,
                input_char,
                dropout=self.p_em_vec if self.training else 0)
        else:
            # [batch, length, word_dim]
            word = self.word_embedd(input_word)
            # [batch, length, char_length, char_dim]
            char = self.char_embedd(input_char)

        char_size = char.size()

        if self.char_method == 'cnn':
            # first transform to [batch *length, char_length, char_dim]
            # then transpose to [batch * length, char_dim, char_length]
            char = char.view(char_size[0] * char_size[1], char_size[2],
                             char_size[3]).transpose(1, 2)
            # put into cnn [batch*length, char_filters, char_length]
            # then put into maxpooling [batch * length, char_filters]
            char, _ = self.char_conv1d(char).max(dim=2)
            # reshape to [batch, length, char_filters]
            char = torch.tanh(char).view(char_size[0], char_size[1], -1)
        else:
            # first transform to [batch *length, char_length, char_dim]
            char = char.view(char_size[0] * char_size[1], char_size[2],
                             char_size[3])
            # put into rnn module and get the last hidden state
            _, (char, _) = self.char_rnn(char)
            # reshape to [batch, length, char_hidden_size]
            char = char.view(char_size[0], char_size[1], -1)

        # apply dropout word on input
        word = self.dropout_em(word)
        char = self.dropout_em(char)

        # concatenate word and char [batch, length, word_dim+char_filter]
        # choose whether to concatenate the ELMO embeddings
        if self.elmo:
            elmo_embeddings = self.elmo(batch_to_ids(input_word_orig))
            # TODO: the coefficient for elmo needs to be tuned
            input = torch.cat([word, char, 0.1 * elmo_embeddings], dim=2)
        else:
            input = torch.cat([word, char], dim=2)

        if show_net:
            print("[Net] _get_word_enc: torch.cat([word {}, char {}]".format(
                word.shape[-1], char.shape[-1]))
            show_var(["self.dropout_em"])
        return input, length
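A minimal shape walk-through of the char-CNN branch above, with hypothetical sizes; char_conv1d here stands in for the model's Conv1d over char_dim channels:

import torch
import torch.nn as nn

batch, length, char_length, char_dim, char_filters = 2, 5, 7, 30, 30
char = torch.randn(batch, length, char_length, char_dim)
char_conv1d = nn.Conv1d(char_dim, char_filters, kernel_size=3, padding=1)

x = char.view(batch * length, char_length, char_dim).transpose(1, 2)  # [batch*length, char_dim, char_length]
x, _ = char_conv1d(x).max(dim=2)                                      # max-pool over character positions
x = torch.tanh(x).view(batch, length, -1)
print(x.shape)                                                        # torch.Size([2, 5, 30])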
Example 17
# As in the earlier scatter snippet: `counter` maps (x, y) pairs to counts.
import numpy as np
import matplotlib.pyplot as plt
from efficiency.log import show_var

x, y = zip(*counter)
cnt = counter.values()
radius = np.array(list(cnt)) / 2
area = radius**2 * np.pi
# Fixing random state for reproducibility
np.random.seed(19680801)

colors = np.random.rand(len(counter))

plt.scatter(x, y, s=area, c=colors, alpha=0.5)
plt.xlabel("# Triples of the Input Graph in WebNLG 2017")
plt.ylabel("# Variations of Generated Text")
plt.title("Text Diversity Generated by CycleCVAE")

x2argmax_y = {}
for (x, y), cnt in counter.items():
    if cnt < 10:
        continue
    if x in x2argmax_y:
        prev_y, prev_cnt = x2argmax_y[x]
        if cnt < prev_cnt:
            continue
    x2argmax_y[x] = (y, cnt)
x = list(x2argmax_y.keys())
y, cnt = zip(*x2argmax_y.values())
show_var(["x", "y"])
import pdb

pdb.set_trace()
plt.plot(x, y)

plt.show()
Example 18
def main():
    from efficiency.nlp import NLP
    from efficiency.log import show_var

    raw_text = 'Hello, world. Here are two people with M.A. degrees from UT Austin. This is Mr. Mike.'
    nlp = NLP()
    sentences = nlp.sent_tokenize(raw_text)
    words = nlp.word_tokenize(sentences[0], lower=True)
    show_var(['sentences', 'words'])
Example 19
    def forward(self,
                h_gcn,
                adjs,
                doc_word_mask,
                return_edge=False,
                show_net=False):

        gcn_model = self.gcn_model

        posi = [sent.nonzero().view(-1) for sent in doc_word_mask]
        posi = torch.stack(posi) + 1

        batch_size, n_node = h_gcn.size()[:2]

        slf_attn_pad_mask = get_attn_padding_mask(doc_word_mask, doc_word_mask)

        if len(self.gcn_layers) > 0:
            if show_net:
                print("<")
                print("[Net_gcn] gcn prep:")
                show_var([
                    "self.to_graph", "F.elu", "self.dropout_gcn_in",
                    "self.position_enc"
                ])

            h_gcn = self.dropout_gcn_in(F.elu(self.to_graph(h_gcn)))

            # h_gcn = torch.cat((init_enc, pos_enc), dim=2)
            h_gcn = h_gcn + self.position_enc(posi, h_gcn)

            if self.adj_attn_type:
                if show_net:
                    show_var(["self.adj_attn_type"])
                old_adjs = adjs.clone()
                coref_adj = old_adjs[:, 0]
                new_adj, attns = self.adj_attn(h_gcn,
                                               h_gcn,
                                               coref_adj,
                                               show_net=show_net)
                adjs[:, 0] = new_adj

        if return_edge:
            edge_weights = []
        adjs = adjs if gcn_model in ['gnn', 'gnn1'] else adjs.squeeze(1)

        slf_attn_adj_mask = get_attn_adj_mask(adjs)

        opts = (adjs, doc_word_mask) if gcn_model in ['gnn', 'gnn1'] \
            else (adjs,) if gcn_model == 'gnnattn' \
            else (slf_attn_pad_mask,) if gcn_model == 'transformer' \
            else (slf_attn_adj_mask,) if gcn_model == 'transformer_graph' \
            else None

        for layer_i, layer in enumerate(self.gcn_layers):
            if show_net:
                print("gcn [{m}] layer {i}: {l}".format(m=gcn_model,
                                                        i=layer_i,
                                                        l=layer))
                print(">gcn")

            h_gcn, edge_weig = layer(h_gcn, *opts)
            if return_edge:
                edge_weights += [edge_weig]
        if self.ga_heads > 0:
            h_gcn = self.ga_layer(h_gcn, doc_word_mask)
            # if return_edge:
            #     edge_weights += [edge_weig]

        h_gcn = h_gcn.view(batch_size, n_node, -1)

        if return_edge:
            # stack the per-layer edge weights into one tensor
            return h_gcn, torch.stack(edge_weights)
        else:
            return h_gcn,