def __init__(self):
        # Reset the default TensorFlow graph
        tf.reset_default_graph()
        #### Embedding Model ####
        set_gpus(0)
        self.elmo = hub.Module("https://tfhub.dev/google/elmo/1",
                               trainable=True)
        self.sentences = tf.placeholder('string', shape=(None, None))
        self.text_len = tf.placeholder('int32', shape=(None,))

        lm_embeddings = self.elmo(inputs={
            "tokens": self.sentences,
            "sequence_len": self.text_len
        },
                                  signature="tokens",
                                  as_dict=True)

        word_emb = tf.expand_dims(lm_embeddings["word_emb"],
                                  3)  # [B, slen, 512, 1]
        self.lm_emb_op = tf.concat(
            [
                tf.concat([word_emb, word_emb], 2),  # [B, slen, 1024, 1]
                tf.expand_dims(lm_embeddings["lstm_outputs1"], 3),
                tf.expand_dims(lm_embeddings["lstm_outputs2"], 3)
            ],
            3)  # [B, slen, 1024, 3]
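
The constructor above only builds the graph; nothing is computed until the op is run in a session. Below is a minimal sketch of the evaluation step, assuming a padded token batch `sent` and true sentence lengths `slen` (both hypothetical names), following the same feed_dict pattern as Examples #9 and #11 further down the page:

# Sketch (assumption): evaluating the ELMo op built in __init__ above.
emb = ...  # instance of the class above (hypothetical)
with tf.Session() as session:
    # The TF-Hub ELMo module needs both variable and table initializers.
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    lm_emb = session.run(emb.lm_emb_op,
                         feed_dict={emb.sentences: sent,   # [batch, max_len] token strings
                                    emb.text_len: slen})   # [batch] true lengths
    # lm_emb has shape [batch, max_len, 1024, 3]
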
Example #2
	def _load_model(self, experiment):
		util.set_gpus()
		print("Running experiment: {}.".format(experiment))
		config = util.get_config("experiments.conf")[experiment]
		config["log_dir"] = util.mkdirs(os.path.join(config["log_root"], experiment))
		
		util.print_config(config)
		model = cm.CorefModel(config)
		
		saver = tf.train.Saver()
		log_dir = config["log_dir"]
		
		# Keep the session open after this method returns; a `with` block would close it on exit.
		session = tf.Session()
		checkpoint_path = os.path.join(log_dir, "model.max.ckpt")
		saver.restore(session, checkpoint_path)
		self.model = model
		self.session = session
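
Every listing on this page routes GPU selection through util.set_gpus, but the helper itself is not shown. As a rough sketch, and only an assumption about its behaviour, it typically just pins CUDA_VISIBLE_DEVICES before any TensorFlow session is created:

# Sketch (assumption): a typical set_gpus helper in these repositories.
import os

def set_gpus(*gpus):
    # Restrict TensorFlow to the listed device ids; with no arguments all GPUs are hidden (CPU only).
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(g) for g in gpus)
    print("Setting CUDA_VISIBLE_DEVICES to: {}".format(os.environ["CUDA_VISIBLE_DEVICES"]))
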
Example #3
    def __init__(self, bert_model="bert_large", num_gpus=0):
        # BERT Tokenizer
        indent = "========"
        proj_path = os.path.abspath(os.path.dirname(__file__)).split("src")[0]

        print(indent + " loading BERT Tokenizer " + indent)
        sys.path.insert(1, proj_path + 'models/coref/')
        # from models.coref.bert import tokenization
        from bert import tokenization
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=proj_path + 'models/' + bert_model + '/vocab.txt',
            do_lower_case=False)

        # load NER model
        print(indent + " loading Flair NER model " + indent)
        self.ner_tagger = SequenceTagger.load('ner')

        # load spacy dependency parser
        print(indent + " loading Spacy Dependency Parser ===" + indent)
        self.dep_parser = spacy.load("en_core_web_sm")

        # initialise coref environment
        print(indent + " Initialising coref environment " + indent)
        import util
        # Setting os.environ is sufficient; the value is inherited by any child process.
        os.environ['data_dir'] = proj_path + "models/"

        util.set_gpus(num_gpus)
        print("Running experiment: {}".format(bert_model))
        config = pyhocon.ConfigFactory.parse_file(
            proj_path + "models/coref/experiments.conf")[bert_model]

        config["log_dir"] = util.mkdirs(
            os.path.join(config["log_root"], bert_model))
        print(pyhocon.HOCONConverter.convert(config, "hocon"))
        log_dir = config["log_dir"]

        self.model = util.get_model(config)
        self.session = tf.Session()
        self.model.restore(self.session)

        print("===========================")
        print("=== Initialisation Done ===")
        print("===========================")
Example #4
import os
import sys
import time
import json
import numpy as np

import tensorflow as tf

import coref_model as cm
import inference_utils
import input_utils
import srl_model as srl
import util

if __name__ == "__main__":
    util.set_gpus()

    name = sys.argv[1]
    output_filename = sys.argv[2]

    print("Running experiment: {}.".format(name))
    config = util.get_config("experiments.conf")[name]
    config["log_dir"] = util.mkdirs(os.path.join(config["log_root"], name))

    util.print_config(config)
    #model = cm.CorefModel(config)
    model = srl.SRLModel(config)

    model.load_eval_data()

    saver = tf.train.Saver()
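
The listing is cut off after the Saver is created. A hedged sketch of how such an evaluation script usually continues, reusing the model.max.ckpt checkpoint name from Example #2; the evaluate call is a hypothetical stand-in for whatever evaluation entry point the model exposes:

    # Sketch (assumption): restore the checkpoint and run evaluation.
    log_dir = config["log_dir"]
    with tf.Session() as session:
        checkpoint_path = os.path.join(log_dir, "model.max.ckpt")  # name taken from Example #2
        saver.restore(session, checkpoint_path)
        model.evaluate(session)  # hypothetical entry point; varies by repository
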
Example #5
import os
import sys

import tensorflow as tf

import coref_model as cm
import util

if __name__ == "__main__":
    if len(sys.argv) > 1:
        name = sys.argv[1]
    else:
        name = os.environ["EXP"]
    config = util.get_config("experiments.conf")[name]
    report_frequency = config["report_frequency"]

    config["log_dir"] = util.mkdirs(os.path.join(config["log_root"], name))
    util.print_config(config)

    if "GPU" in os.environ:
        util.set_gpus(int(os.environ["GPU"]))
    else:
        util.set_gpus()

    model = cm.CorefModel(config)
    saver = tf.train.Saver()
    init_op = tf.global_variables_initializer()

    log_dir = config["log_dir"]
    writer = tf.summary.FileWriter(os.path.join(log_dir, "train"),
                                   flush_secs=20)

    # Create a "supervisor", which oversees the training process.
    sv = tf.train.Supervisor(logdir=log_dir,
                             init_op=init_op,
                             saver=saver,
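
The Supervisor construction is truncated here (and again in Examples #8, #10 and #16). Below is a minimal sketch of the training loop that typically follows it; start_enqueue_thread and the loss/global_step/train_op names are assumptions about the model's interface, while managed_session and should_stop are standard tf.train.Supervisor calls:

    # Sketch (assumption): the loop that usually follows the Supervisor setup.
    with sv.managed_session() as session:
        model.start_enqueue_thread(session)  # hypothetical; depends on the input pipeline
        while not sv.should_stop():
            tf_loss, tf_step, _ = session.run(
                [model.loss, model.global_step, model.train_op])  # assumed op names
            if tf_step % report_frequency == 0:
                print("[{}] loss={:.2f}".format(tf_step, tf_loss))
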
Example #6
import os
import sys
import time

import tensorflow as tf
import coref_model as cm
import util

if __name__ == "__main__":
    config = util.initialize_from_env()
    task_index = int(os.environ["TASK"])

    report_frequency = config["report_frequency"]
    cluster_config = config["cluster"]

    util.set_gpus(cluster_config["gpus"][task_index])

    cluster = tf.train.ClusterSpec(cluster_config["addresses"])
    server = tf.train.Server(cluster, job_name="worker", task_index=task_index)

    # Assigns ops to the local worker by default.
    with tf.device(
            tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % task_index,
                cluster=cluster)):
        model = cm.CorefModel(config)
        saver = tf.train.Saver()
        init_op = tf.global_variables_initializer()

    log_dir = config["log_dir"]
    writer = tf.summary.FileWriter(os.path.join(log_dir,
Example #7
            name)

    config = util.get_config("experiments.conf")[name]
    config["log_dir"] = util.mkdirs(os.path.join(config["log_root"], name))

    config["batch_size"] = -1
    config["max_tokens_per_batch"] = -1

    # Use dev lm, if provided.
    if config["lm_path"] and "lm_path_dev" in config and config["lm_path_dev"]:
        config["lm_path"] = config["lm_path_dev"]

    util.print_config(config)

    if len(sys.argv) > 3 and sys.argv[2] == '-gpu':
        util.set_gpus(sys.argv[3])

    data = LSGNData(config)
    model = SRLModel(data, config)
    evaluator = LSGNEvaluator(config)

    variables_to_restore = []
    for var in tf.global_variables():
        print(var.name)
        if "module/" not in var.name:
            variables_to_restore.append(var)

    saver = tf.train.Saver(variables_to_restore)
    log_dir = config["log_dir"]

    with tf.Session() as session:
Example #8
    if len(sys.argv) > 1:
        name = sys.argv[1]
    else:
        name = os.environ["EXP"]
    config = util.get_config("experiments.conf")[name]
    print('config')
    report_frequency = config["report_frequency"]

    config["log_dir"] = util.mkdirs(os.path.join(config["log_root"], name))
    util.print_config(config)
    print(os.environ)
    # if "GPU" in os.environ:
    #   gpus = [int(g) for g in os.environ["GPU"].split(",")]
    #   util.set_gpus(*gpus)
    # else:
    util.set_gpus(0)

    data = LSGNData(config)
    model = SRLModel(data, config)
    saver = tf.train.Saver()
    init_op = tf.global_variables_initializer()

    log_dir = config["log_dir"]
    assert not ("final" in name
                )  # Make sure we don't override a finalized checkpoint.
    writer = tf.summary.FileWriter(log_dir, flush_secs=20)

    # Create a "supervisor", which oversees the training process.
    sv = tf.train.Supervisor(logdir=log_dir,
                             init_op=init_op,
                             saver=saver,
Example #9
                                          sentences: sent,
                                          text_len: slen
                                    }
                              )
                              sentence_id = docid + '_' + str(j)
                              ds = fout.create_dataset(
                                    sentence_id, lm_emb.shape[1:], dtype='float32',
                                    data=lm_emb[0, :, :, :]  # [slen, lm_size, lm_layers]
                              )
                  fout.close()


#### Model #####


set_gpus(0)
elmo = hub.Module("https://tfhub.dev/google/elmo/1", trainable=True)
sentences = tf.placeholder('string', shape=(None, None))
text_len = tf.placeholder('int32', shape=(None,))

lm_embeddings = elmo(
    inputs={
        "tokens": sentences,
        "sequence_len": text_len
    },
        signature="tokens", as_dict=True)

word_emb = tf.expand_dims(lm_embeddings["word_emb"], 3)  # [B, slen, 512, 1]
lm_emb_op = tf.concat([
        tf.concat([word_emb, word_emb], 2),  # [B, slen, 1024, 1]
        tf.expand_dims(lm_embeddings["lstm_outputs1"], 3),
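
Examples #9 and #11 cache one ELMo tensor per sentence into an HDF5 file keyed by docid and sentence index. A brief sketch of reading such a cache back with h5py; the file name and key are illustrative assumptions:

# Sketch (assumption): reading the cached embeddings back.
import h5py

with h5py.File("elmo_cache.hdf5", "r") as fin:   # assumed file name
    lm_emb = fin["somedoc_0"][...]               # [slen, lm_size, lm_layers]; assumed key
    print(lm_emb.shape)
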
Example #10
import os
import sys

import tensorflow as tf

# LSGNData and SRLModel come from the project's own modules; their imports were cut off in this listing.
import util

if __name__ == "__main__":
    if len(sys.argv) > 1:
        name = sys.argv[1]
    else:
        name = os.environ["EXP"]
    config = util.get_config("experiments.conf")[name]
    report_frequency = config["report_frequency"]

    config["log_dir"] = util.mkdirs(os.path.join(config["log_root"], name))
    util.print_config(config)

    if "GPU" in os.environ:
        gpus = [int(g) for g in os.environ["GPU"].split(",")]
        util.set_gpus(*gpus)
    else:
        util.set_gpus()

    data = LSGNData(config)
    model = SRLModel(data, config)
    saver = tf.train.Saver()
    init_op = tf.global_variables_initializer()

    log_dir = config["log_dir"]
    assert not ("final" in name
                )  # Make sure we don't override a finalized checkpoint.
    writer = tf.summary.FileWriter(log_dir, flush_secs=20)

    # Create a "supervisor", which oversees the training process.
    sv = tf.train.Supervisor(logdir=log_dir,
Example #11
                                          sentences: sent,
                                          text_len: slen
                                    }
                              )
                              sentence_id = docid + '_' + str(j)
                              print(sentence_id)
                              ds = fout.create_dataset(
                                    sentence_id, lm_emb.shape[1:], dtype='float32',
                                    data=lm_emb[0, :, :, :]  # [slen, lm_size, lm_layers]
                              )
                  fout.close()


#### Model #####

set_gpus(sys.argv[1]) # set the gpu id
elmo = hub.Module("https://tfhub.dev/google/elmo/1", trainable=True)
sentences = tf.placeholder('string', shape=(None, None))
text_len = tf.placeholder('int32', shape=(None,))

lm_embeddings = elmo(
    inputs={
        "tokens": sentences,
        "sequence_len": text_len
    },
        signature="tokens", as_dict=True)

word_emb = tf.expand_dims(lm_embeddings["word_emb"], 3)  # [B, slen, 512, 1]
lm_emb_op = tf.concat([
        tf.concat([word_emb, word_emb], 2),  # [B, slen, 1024, 1]
        tf.expand_dims(lm_embeddings["lstm_outputs1"], 3),
Example #12
                    fl += 1
            else:
                if antecedent == (-1, -1):
                    fn += 1
                elif span_cluster_id != gold_to_cluster_id[i][antecedent]:
                    wl += 1
                else:
                    correct += 1

    return num_clusters, num_singular_clusters, num_plural_clusters, num_mixed_clusters, num_mixed_ambiguous, fl, fn, wl, correct, \
           num_non_gold, num_total_spans, s_to_p, p_to_s


if __name__ == '__main__':
    gpu_id = 6
    util.set_gpus(gpu_id)

    experiments = [('train_spanbert_large_ee', 'May14_06-02-15'),
                   ('train_spanbert_large_ee', 'May14_06-05-42'),
                   ('train_spanbert_large_lr2e-4_ee', 'May14_06-03-24'),
                   ('train_spanbert_large_lr2e-4_ee', 'May14_06-10-51')]

    results_final = None
    for experiment in experiments:
        results = analyze(*experiment)
        if results_final is None:
            results_final = results
        else:
            results_final = [r + results[i] for i, r in enumerate(results_final)]

        # print('%s_%s: # clusters: %d; # singular clusters: %d; # plural clusters: %d; # mixed clusters: %d; '
Example #13
import os
import sys
sys.path.append(os.getcwd())
import collections
import operator

import numpy as np
import tensorflow as tf
import coref_model as cm
import util
import conll
import metrics

if __name__ == "__main__":
  if "GPU" in os.environ:
    util.set_gpus(int(os.environ["GPU"]))
  else:
    util.set_gpus()

  names = sys.argv[1:]
  print("Ensembling models from {}.".format(names))

  configs = util.get_config("experiments.conf")

  main_config = configs[names[0]]
  model = cm.CorefModel(main_config)
  model.load_eval_data()

  saver = tf.train.Saver()

  with tf.Session() as session:
Example #14
  if len(sys.argv) > 1:
    name = sys.argv[1]
  else:
    name = os.environ["EXP"]
  config = util.get_config("experiments.conf")[name]
  print('config')
  report_frequency = config["report_frequency"]

  config["log_dir"] = util.mkdirs(os.path.join(config["log_root"], name))
  util.print_config(config)
  print(os.environ)
  # if "GPU" in os.environ:
  #   gpus = [int(g) for g in os.environ["GPU"].split(",")]
  #   util.set_gpus(*gpus)
  # else:
  util.set_gpus(config['gpu'])

  data = LSGNData(config)
  coref_config = copy.deepcopy(config)
  coref_config['train_path'] = config['train_path_coref']
  coref_config['lm_path'] = config['lm_path_coref']
  coref_config['eval_path'] = config['eval_path_coref']
  coref_config['lm_path_dev'] = config['lm_path_dev_coref']
  coref_config['ner_weight'] = 0
  coref_config['coref_weight'] = 1
  coref_config['relation_weight'] = 0
  # coref_config['batch_size'] = 30
  coref_config['coref_depth'] = 0
  model = SRLModel(data, config)
  if config['coref_freq']:
    coref_data = LSGNData(coref_config)
Example #15
import os
import sys
sys.path.append(os.getcwd())
import time
import random
import shutil

import numpy as np
import tensorflow as tf
import coref_model as cm
import util

def copy_checkpoint(source, target):
  for ext in (".index", ".data-00000-of-00001"):
    shutil.copyfile(source + ext, target + ext)

if __name__ == "__main__":
  util.set_gpus()

  if len(sys.argv) > 1:
    name = sys.argv[1]
    print("Running experiment: {} (from command-line argument).".format(name))
  else:
    name = os.environ["EXP"]
    print("Running experiment: {} (from environment variable).".format(name))

  config = util.get_config("experiments.conf")[name]
  config["log_dir"] = util.mkdirs(os.path.join(config["log_root"], name))

  util.print_config(config)
  model = cm.CorefModel(config)

  saver = tf.train.Saver()
Example #16
        name = sys.argv[1]
    else:
        name = os.environ["EXP"]

    config = util.get_config("experiments.conf")[name]
    print('config')
    report_frequency = config["report_frequency"]

    config["log_dir"] = util.mkdirs(os.path.join(config["log_root"], name))
    util.print_config(config)
    print(os.environ)
    # if "GPU" in os.environ:
    #   gpus = [int(g) for g in os.environ["GPU"].split(",")]
    #   util.set_gpus(*gpus)
    # else:
    util.set_gpus(config['gpu_id'])

    data = LSGNData(config)
    model = SRLModel(data, config)
    saver = tf.train.Saver()
    init_op = tf.global_variables_initializer()

    log_dir = config["log_dir"]
    assert not ("final" in name
                )  # Make sure we don't override a finalized checkpoint.
    writer = tf.summary.FileWriter(log_dir, flush_secs=20)

    # Create a "supervisor", which oversees the training process.
    sv = tf.train.Supervisor(logdir=log_dir,
                             init_op=init_op,
                             saver=saver,