Example #1
            WikiTitles.append(wits)

        return WikiTitles

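    # Map a candidate entity's index back to its Wikipedia title (idx -> wid -> title)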
    def widIdx2WikiTitle(self, widIdx):
        wid = self.idx2knwid[widIdx]
        wikiTitle = self.wid2WikiTitle[wid]
        return wikiTitle


if __name__ == '__main__':
    sttime = time.time()
    batch_size = 1
    num_batch = 1000
    configpath = "configs/config.ini"
    config = Config(configpath, verbose=False)
    vocabloader = VocabLoader(config)
    b = TestDataReader(config=config,
                       vocabloader=vocabloader,
                       test_mens_file=config.test_file,
                       num_cands=30,
                       batch_size=batch_size,
                       strict_context=False,
                       pretrain_wordembed=False,
                       coherence=False)

    stime = time.time()

    i = 0
    kwn = 0
    total_instances = 0
Example #2
    def addToTypeCount(self, mens):
        for m in mens:
            types = m.types
            for t in types:
                self.typeCount[t] += 1

    def convertTypeCountToFraction(self, decSorted, numMens):
        typeCount = []
        for (t, c) in decSorted:
            typeCount.append((t, float(c) / numMens))
        return typeCount

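    # Count type labels over all training mention files and print each type's fraction of mentions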
    def makeLabelCount(self):
        totalMentions = 0
        for (i, mens_file) in enumerate(self.tr_mens_files):
            print("File Num : {}".format(i))
            file = os.path.join(self.tr_mens_dir, mens_file)
            mens = utils.make_mentions_from_file(file)
            self.addToTypeCount(mens)
            totalMentions += len(mens)
        decSorted = utils.decrSortedDict(self.typeCount)
        decSorted = self.convertTypeCountToFraction(decSorted, totalMentions)
        print("Total Mentions : {}".format(totalMentions))
        pp.pprint(decSorted)


if __name__ == '__main__':
    config = Config("configs/vocab_config.ini")
    vocabloader = VocabLoader(config)
    a = typeCountDistribution(config, vocabloader)
Example #3
import os
import readers.utils as utils
from readers.Mention import Mention
from readers.config import Config
from readers.vocabloader import VocabLoader

config = Config("configs/all_mentions_config.ini")
vocabloader = VocabLoader(config)

(knwid2idx, idx2knwid) = vocabloader.getKnwnWidVocab()
wid2WikiTitle = vocabloader.getWID2Wikititle()

print("Known {} total {}".format(len(knwid2idx), len(wid2WikiTitle)))

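# Collect the set of wids for which Wikipedia page text is available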
widswithtext = set()
with open("/save/ngupta19/wikipedia/wiki_kb/widswithtext", 'r') as f:
    docswithtext = f.readlines()
    for l in docswithtext:
        widswithtext.add(l.strip())

print("Total docs with text : {}".format(len(widswithtext)))

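# Count known entities whose wid has no page text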
missing = 0
total = 0
for wid in knwid2idx:
    total += 1
    if str(wid) not in widswithtext:
        print(wid)
        missing += 1

print("Known Total : {} Missing {} ".format(total, missing))
Example #4
def main(_):
    pp.pprint(flags.FLAGS.__flags)

    FLAGS_check(FLAGS)

    config = Config(FLAGS.config, verbose=False)
    vocabloader = VocabLoader(config)

    FLAGS.dropout_keep_prob = 1.0
    FLAGS.wordDropoutKeep = 1.0
    FLAGS.cohDropoutKeep = 1.0

    (input_ta_files, output_ta_files) = getAllTAFilePaths(FLAGS)

    print("TOTAL NUMBER OF TAS : {}".format(len(input_ta_files)))

    reader = TextAnnoTestReader(
        config=config,
        vocabloader=vocabloader,
        # test_mens_file=config.test_file,
        num_cands=30,
        batch_size=FLAGS.batch_size,
        strict_context=FLAGS.strict_context,
        pretrain_wordembed=FLAGS.pretrain_wordembed,
        coherence=FLAGS.coherence)
    model_mode = 'test'

    config_proto = tf.ConfigProto()
    config_proto.allow_soft_placement = True
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)

    with sess.as_default():
        model = ELModel(
            sess=sess,
            reader=reader,
            dataset=FLAGS.dataset,
            max_steps=FLAGS.max_steps,
            pretrain_max_steps=FLAGS.pretraining_steps,
            word_embed_dim=FLAGS.word_embed_dim,
            context_encoded_dim=FLAGS.context_encoded_dim,
            context_encoder_num_layers=FLAGS.context_encoder_num_layers,
            context_encoder_lstmsize=FLAGS.context_encoder_lstmsize,
            coherence_numlayers=FLAGS.coherence_numlayers,
            jointff_numlayers=FLAGS.jointff_numlayers,
            learning_rate=FLAGS.learning_rate,
            dropout_keep_prob=FLAGS.dropout_keep_prob,
            reg_constant=FLAGS.reg_constant,
            checkpoint_dir=FLAGS.checkpoint_dir,
            optimizer=FLAGS.optimizer,
            mode=model_mode,
            strict=FLAGS.strict_context,
            pretrain_word_embed=FLAGS.pretrain_wordembed,
            typing=FLAGS.typing,
            el=FLAGS.el,
            coherence=FLAGS.coherence,
            textcontext=FLAGS.textcontext,
            useCNN=FLAGS.useCNN,
            WDLength=FLAGS.WDLength,
            Fsize=FLAGS.Fsize,
            entyping=FLAGS.entyping)

        model.load_ckpt_model(ckptpath=FLAGS.model_path)

        print("Total files: {}".format(len(output_ta_files)))
        erroneous_files = 0
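        # Annotate each input TextAnnotation file; files that fail to load are counted and skipped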
        for in_ta_path, out_ta_path in zip(input_ta_files, output_ta_files):
            # print("Running the inference for : {}".format(in_ta_path))
            try:
                reader.new_test_file(in_ta_path)
            except Exception:
                print("Error reading : {}".format(in_ta_path))
                erroneous_files += 1
                continue

            (predTypScNPmat_list, widIdxs_list, priorProbs_list,
             textProbs_list, jointProbs_list, evWTs_list,
             pred_TypeSetsList) = model.inference_run()

            # model.inference(ckptpath=FLAGS.model_path)

            wiki_view = copy.deepcopy(reader.textanno.get_view("NER_LORELEI"))
            #wiki_view = copy.deepcopy(reader.textanno.get_view("MENTION"))
            #wiki_view = copy.deepcopy(reader.textanno.get_view("NER_CONLL"))
            #wiki_view = copy.deepcopy(reader.textanno.get_view("English_NERVIEW"))
            # wiki_view_json = copy.deepcopy(reader.textanno.get_view("NER").as_json)
            docta = reader.textanno

            #el_cons_list = [con for con in wiki_view.cons_list if "NAM" in con['label']]
            el_cons_list = wiki_view.cons_list
            numMentionsInference = len(widIdxs_list)

            # print("Number of mentions in model: {}".format(len(widIdxs_list)))
            # print("Number of NER mention: {}".format(len(el_cons_list)))

            assert len(el_cons_list) == numMentionsInference

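            # For every NER constituent, record candidate score maps and keep the highest joint-probability entity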
            mentionnum = 0
            for ner_cons in el_cons_list:
                priorScoreMap = {}
                contextScoreMap = {}
                jointScoreMap = {}

                (wididxs, pps, mps,
                 jps) = (widIdxs_list[mentionnum], priorProbs_list[mentionnum],
                         textProbs_list[mentionnum],
                         jointProbs_list[mentionnum])

                maxJointProb = 0.0
                maxJointEntity = ""
                for (wididx, prp, mp, jp) in zip(wididxs, pps, mps, jps):
                    wT = reader.widIdx2WikiTitle(wididx)
                    priorScoreMap[wT] = prp
                    contextScoreMap[wT] = mp
                    jointScoreMap[wT] = jp

                    if jp > maxJointProb:
                        maxJointProb = jp
                        maxJointEntity = wT
                ''' add labels2score map here '''
                #ner_cons["jointScoreMap"] = jointScoreMap
                #ner_cons["contextScoreMap"] = contextScoreMap
                #ner_cons["priorScoreMap"] = priorScoreMap

                ner_cons["labelScoreMap"] = jointScoreMap
                # add max scoring entity as label
                ner_cons["label"] = maxJointEntity
                ner_cons["score"] = maxJointProb

                mentionnum += 1

            wiki_view.view_name = "NEUREL"
            docta.view_dictionary["NEUREL"] = wiki_view

            docta_json = docta.as_json
            json.dump(docta_json, open(out_ta_path, "w"), indent=True)

        print("Number of erroneous files: {}".format(erroneous_files))
        print("Annotation completed. Program can be exited safely.")
    sys.exit()
Example #5
                sys.exit()
            print("Loading wid2TypeLabels")
            self.wid2TypeLabels = utils.load(
                self.config.wid2typelabels_vocab_pkl)
        return self.wid2TypeLabels

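    # Lazily load the pickled GloVe vectors on first request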
    def loadGloveVectors(self):
        if self.glove2vec is None:
            if not os.path.exists(self.config.glove_pkl):
                print("Glove_Vectors_PKL does not exist")
                sys.exit()
            print("Loading Glove Word Vectors")
            self.glove2vec = utils.load(self.config.glove_pkl)
        return self.glove2vec

    def getGloveWordVocab(self):
        if self.gword2idx is None or self.gidx2word is None:
            if not os.path.exists(self.config.glove_word_vocab_pkl):
                print("Glove Word Vocab PKL missing")
                sys.exit()
            print("Loading Glove Word Vocabulary")
            (self.gword2idx,
             self.gidx2word) = utils.load(self.config.glove_word_vocab_pkl)
        return (self.gword2idx, self.gidx2word)


if __name__ == '__main__':
    config = Config("configs/wcoh_config.ini")
    a = VocabLoader(config)
    a.loadWord2Vec()
Example #6
    def __init__(self):
        print("[#] Launching Training Job. DeviceId : {}".format(device_id))
        self.config = Config(configpath, verbose=False)
        self.vocabloader = VocabLoader(self.config)
        test_file = self.config.aida_kwn_dev_file
        valbs = 2

        self.tr_reader = TrainingDataReader(
            config=self.config,
            vocabloader=self.vocabloader,
            val_file=self.config.aida_kwn_dev_file,
            num_cands=30,
            batch_size=bs,
            wordDropoutKeep=worddropout,
            cohDropoutKeep=cohdropout)
        self.test_reader = TestDataReader(config=self.config,
                                          vocabloader=self.vocabloader,
                                          val_file=test_file,
                                          num_cands=30,
                                          batch_size=valbs,
                                          wordDropoutKeep=1.0,
                                          cohDropoutKeep=1.0)

        self.wvocab_size = len(self.tr_reader.word2idx)
        self.envocab_size = len(self.tr_reader.knwid2idx)
        self.typevocab_size = len(self.tr_reader.label2idx)
        self.cohvocab_size = len(self.tr_reader.cohG92idx)

        print("[#] Word Vocab : {}, Entity Vocab: {}, Type Vocab: {} "
              "CohString Vocab : {}".format(self.wvocab_size,
                                            self.envocab_size,
                                            self.typevocab_size,
                                            self.cohvocab_size))

        if modeltype == 'ELModel':
            print("[#] MODEL : ELModel")
            self.model = ELModel(
                device_id=device_id,
                wordvocab=(self.tr_reader.word2idx, self.tr_reader.idx2word),
                cohvocab=(self.tr_reader.cohG92idx, self.tr_reader.idx2cohG9),
                envocab=(self.tr_reader.knwid2idx, self.tr_reader.idx2knwid),
                typevocab=(self.tr_reader.label2idx, self.tr_reader.idx2label),
                wdim=wdim,
                edim=endim,
                num_cands=30,
                hsize=endim,
                mlp_nlayers=1,
                dropout=dropout,
                init_range=init_range,
                mentyping=mentype,
                entyping=entype,
                descencoding=endesc)
        if modeltype not in MODELTYPES:
            print("Invalid modeltype : {}".format(modeltype))
            sys.exit()

        if device_id is not None:
            self.model.cuda(device_id)

        if optim == 'adam':
            print("[#] OPTIM : ADAM")
            self.optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                                     self.model.parameters()),
                                              lr=lr)
        elif optim == 'sgd':
            print("[#] OPTIM : SGD LR:{} Momentum:{} Nesterov:{}".format(
                lr, momentum, nestrov))
            self.optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                                    self.model.parameters()),
                                             lr=lr,
                                             momentum=momentum,
                                             nesterov=nestrov)
        else:
            print("Wrong Optimizer")
            sys.exit(0)

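        # Wait for input on stdin, or time out after a few seconds, before proceeding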
        timeout = 5
        print("Press any key to run (or wait {} seconds) ... ".format(timeout))
        rlist, wlist, xlist = select.select([sys.stdin], [], [], timeout)
Example #7
def main(_):
    pp.pprint(flags.FLAGS.__flags)

    FLAGS_check(FLAGS)
    # pipeline = LocalPipeline()
    pipeline = RemotePipeline(server_api="http://macniece.seas.upenn.edu:4001")

    config = Config(FLAGS.config, verbose=False)
    vocabloader = VocabLoader(config)

    FLAGS.dropout_keep_prob = 1.0
    FLAGS.wordDropoutKeep = 1.0
    FLAGS.cohDropoutKeep = 1.0

    input_jsonl = FLAGS.input_jsonl
    output_jsonl = FLAGS.output_jsonl

    reader = TextAnnoTestReader(
        config=config,
        vocabloader=vocabloader,
        num_cands=30,
        batch_size=FLAGS.batch_size,
        strict_context=FLAGS.strict_context,
        pretrain_wordembed=FLAGS.pretrain_wordembed,
        coherence=FLAGS.coherence,
    )
    model_mode = "test"

    config_proto = tf.ConfigProto()
    config_proto.allow_soft_placement = True
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)

    with sess.as_default():
        model = ELModel(
            sess=sess,
            reader=reader,
            dataset=FLAGS.dataset,
            max_steps=FLAGS.max_steps,
            pretrain_max_steps=FLAGS.pretraining_steps,
            word_embed_dim=FLAGS.word_embed_dim,
            context_encoded_dim=FLAGS.context_encoded_dim,
            context_encoder_num_layers=FLAGS.context_encoder_num_layers,
            context_encoder_lstmsize=FLAGS.context_encoder_lstmsize,
            coherence_numlayers=FLAGS.coherence_numlayers,
            jointff_numlayers=FLAGS.jointff_numlayers,
            learning_rate=FLAGS.learning_rate,
            dropout_keep_prob=FLAGS.dropout_keep_prob,
            reg_constant=FLAGS.reg_constant,
            checkpoint_dir=FLAGS.checkpoint_dir,
            optimizer=FLAGS.optimizer,
            mode=model_mode,
            strict=FLAGS.strict_context,
            pretrain_word_embed=FLAGS.pretrain_wordembed,
            typing=FLAGS.typing,
            el=FLAGS.el,
            coherence=FLAGS.coherence,
            textcontext=FLAGS.textcontext,
            useCNN=FLAGS.useCNN,
            WDLength=FLAGS.WDLength,
            Fsize=FLAGS.Fsize,
            entyping=FLAGS.entyping,
        )

        model.load_ckpt_model(ckptpath=FLAGS.model_path)

        outf = open(output_jsonl, "w")
        inpf = list(open(input_jsonl, "r"))

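        # Annotate each JSONL example; pipeline misalignment errors are recorded in the output and skipped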
        for line in tqdm(inpf):
            example_json = json.loads(line)
            doctext = ftfy.fix_text(example_json['text'])
            out_dict = {
                'text': doctext,
                'qanta_id': example_json['qanta_id'],
                'sent_id': example_json['sent_id']
            }
            try:
                ta = pipeline.doc(doctext)
            except MisalignedCharError as e:
                out_dict['error'] = str(e)
                outf.write(json.dumps(out_dict))
                outf.write("\n")
                continue
            _ = ta.get_ner_conll

            # Make instances for this document
            reader.new_ta(ta)

            (
                predTypScNPmat_list,
                widIdxs_list,
                priorProbs_list,
                textProbs_list,
                jointProbs_list,
                evWTs_list,
                pred_TypeSetsList,
            ) = model.inference_run()
            if pred_TypeSetsList is None:
                continue

            wiki_view = copy.deepcopy(reader.textanno.get_view("NER_CONLL"))
            docta = reader.textanno

            el_cons_list = wiki_view.cons_list
            numMentionsInference = len(widIdxs_list)

            assert len(el_cons_list) == numMentionsInference

            el_mentions = []

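            # Build a mention dict with span info, the candidate score maps, and the best joint entity for each NER constituent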
            mentionnum = 0
            for ner_cons in el_cons_list:
                # ner_cons is a dict
                mentiondict = {}
                mentiondict["tokens"] = ner_cons["tokens"]
                mentiondict["end"] = ner_cons["end"]
                mentiondict["start"] = ner_cons["start"]

                priorScoreMap = {}
                contextScoreMap = {}
                jointScoreMap = {}

                (wididxs, pps, mps, jps) = (
                    widIdxs_list[mentionnum],
                    priorProbs_list[mentionnum],
                    textProbs_list[mentionnum],
                    jointProbs_list[mentionnum],
                )

                maxJointProb = 0.0
                maxJointEntity = ""
                for (wididx, prp, mp, jp) in zip(wididxs, pps, mps, jps):
                    wT = reader.widIdx2WikiTitle(wididx)
                    priorScoreMap[wT] = prp
                    contextScoreMap[wT] = mp
                    jointScoreMap[wT] = jp

                    if jp > maxJointProb:
                        maxJointProb = jp
                        maxJointEntity = wT

                mentiondict["jointScoreMap"] = jointScoreMap
                mentiondict["contextScoreMap"] = contextScoreMap
                mentiondict["priorScoreMap"] = priorScoreMap

                # add max scoring entity as label
                mentiondict["label"] = maxJointEntity
                mentiondict["score"] = maxJointProb

                mentionnum += 1

                el_mentions.append(mentiondict)

            out_dict["nel"] = el_mentions
            outf.write(json.dumps(out_dict))
            outf.write("\n")

        outf.close()

        print("Annotation completed. Program can be exited safely.")
    sys.exit()
Example #8
def main(_):
    pp.pprint(flags.FLAGS.__flags)

    output_file = "data/output.json"  #sys.argv[2]

    range_start = 0  #int(sys.argv[3])
    range_end = 10  #int(sys.argv[4])

    file_name = "data/qanta.train.2018.04.18.json"  #sys.argv[1]
    question_list = json.loads(open(file_name).read())["questions"]
    sentences = question_list[range_start:min(range_end, len(question_list))]

    FLAGS_check(FLAGS)

    config = Config(FLAGS.config, verbose=False)
    vocabloader = VocabLoader(config)

    print("Loading in variables!")
    word2idx, idx2word = vocabloader.getGloveWordVocab()
    wid2WikiTitle = vocabloader.getWID2Wikititle()
    crosswikis = utils.load(config.crosswikis_pruned_pkl)
    word2vec = vocabloader.loadGloveVectors()
    print("DONE LOADING IN VARIABLES!!!")

    all_entities = []

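    # For each question, write its text to a temporary test file and build an InferenceReader over it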
    for sent in sentences:
        tf.reset_default_graph()
        loc = config.test_file.replace(
            "sampletest.txt", "{}_{}.txt".format(range_start, range_end))
        w = open(loc, "w")
        config.test_file = loc
        sent["text"] = decrypt(sent["text"].replace("\xa0", " "))
        w.write(sent["text"].encode("ascii", "ignore").decode("ascii"))
        print(sent["text"].encode("ascii", "ignore").decode("ascii"))
        w.close()
        FLAGS.dropout_keep_prob = 1.0
        FLAGS.wordDropoutKeep = 1.0
        FLAGS.cohDropoutKeep = 1.0
        start = time.time()
        print("Test file {} ".format(config.test_file))
        reader = InferenceReader(config=config,
                                 vocabloader=vocabloader,
                                 test_mens_file=config.test_file,
                                 num_cands=FLAGS.num_cand_entities,
                                 batch_size=FLAGS.batch_size,
                                 word2idx=word2idx,
                                 idx2word=idx2word,
                                 wid2WikiTitle=wid2WikiTitle,
                                 crosswikis=crosswikis,
                                 word2vec=word2vec,
                                 strict_context=FLAGS.strict_context,
                                 pretrain_wordembed=FLAGS.pretrain_wordembed,
                                 coherence=FLAGS.coherence)
        print("Took {} time to create inference reader".format(time.time() -
                                                               start))
        docta = reader.ccgdoc
        model_mode = 'inference'

        config_proto = tf.ConfigProto()
        config_proto.allow_soft_placement = True
        config_proto.gpu_options.allow_growth = True
        sess = tf.Session(config=config_proto)

        print("COHSTR", reader.num_cohstr)
        """with sess.as_default():

            start = time.time()
            model = ELModel(
                sess=sess, reader=reader, dataset=FLAGS.dataset,
                max_steps=FLAGS.max_steps,
                pretrain_max_steps=FLAGS.pretraining_steps,
                word_embed_dim=FLAGS.word_embed_dim,
                context_encoded_dim=FLAGS.context_encoded_dim,
                context_encoder_num_layers=FLAGS.context_encoder_num_layers,
                context_encoder_lstmsize=FLAGS.context_encoder_lstmsize,
                coherence_numlayers=FLAGS.coherence_numlayers,
                jointff_numlayers=FLAGS.jointff_numlayers,
                learning_rate=FLAGS.learning_rate,
                dropout_keep_prob=FLAGS.dropout_keep_prob,
                reg_constant=FLAGS.reg_constant,
                checkpoint_dir=FLAGS.checkpoint_dir,
                optimizer=FLAGS.optimizer,
                mode=model_mode,
                strict=FLAGS.strict_context,
                pretrain_word_embed=FLAGS.pretrain_wordembed,
                typing=FLAGS.typing,
                el=FLAGS.el,
                coherence=FLAGS.coherence,
                textcontext=FLAGS.textcontext,
                useCNN=FLAGS.useCNN,
                WDLength=FLAGS.WDLength,
                Fsize=FLAGS.Fsize,
                entyping=FLAGS.entyping)

            print("Loading EL Model took {} time".format(time.time()-start))

            print("Doing inference")

            try:
                start = time.time()
                (predTypScNPmat_list,
                widIdxs_list,
                priorProbs_list,
                textProbs_list,
                jointProbs_list,
                evWTs_list,
                pred_TypeSetsList) = model.inference(ckptpath=FLAGS.model_path)
                print("Inference took {} time".format(time.time()-start))
            except:
                entity_list = {'qanta_id':sent['qanta_id'],'mentions':[]}
                all_entities.append(entity_list)
                print("No entities")
                continue
 
            start = time.time()
            numMentionsInference = len(widIdxs_list)
            numMentionsReader = 0
            for sent_idx in reader.sentidx2ners:
                numMentionsReader += len(reader.sentidx2ners[sent_idx])
            assert numMentionsInference == numMentionsReader

            mentionnum = 0
            entityTitleList = []
            print("Tokenized sentences {}".format(reader.sentences_tokenized))
            for sent_idx in reader.sentidx2ners:
                nerDicts = reader.sentidx2ners[sent_idx]
                sentence = ' '.join(reader.sentences_tokenized[sent_idx])
                for s, ner in nerDicts:
                    [evWTs, evWIDS, evProbs] = evWTs_list[mentionnum]
                    predTypes = pred_TypeSetsList[mentionnum]

                    entityTitleList.append(evWTs[2])
                    mentionnum += 1

            elview = copy.deepcopy(docta.view_dictionary['NER_CONLL'])
            elview.view_name = 'ENG_NEURAL_EL'
            for i, cons in enumerate(elview.cons_list):
                cons['label'] = entityTitleList[i]

            docta.view_dictionary['ENG_NEURAL_EL'] = elview

            print("Processing took {} time".format(time.time()-start))

            print("List of entities")
            #print(elview.cons_list)
            print("\n")
            
            s = sent["text"]
            print("New S is {}".format(s))
            e = elview.cons_list
            t = reader.sentences_tokenized 
            c = []
            f = []

            print(s)
            #print("E {}".format(e))
            print("T {}".format(t))

            for i in t:
                for j in i:
                    f.append(j)
            i = 0
            token_pointer = 0
            while token_pointer < len(f) and i < len(s):
                token_len = len(f[token_pointer])
                while i+token_len<len(s) and s[i:i+token_len] != f[token_pointer]:
                    i+=1
                c.append((i,token_len+i))
                i+=1
                token_pointer+=1
            if len(c) != len(f):
                print("ERROR in C and F")           
            unflattened_c = []
            c_pointer = 0
            for i in range(len(t)):
                l = c[c_pointer:c_pointer+len(t[i])]
                c_pointer+=len(t[i])
                unflattened_c.append(l)

            #print("C {}".format(c))
            #print("F {}".format(f))
            #print("Unflattened C {}".format(unflattened_c)) 

            entity_list = {'qanta_id':sent['qanta_id'],'mentions':[]}
            sentence_num = 0
               
            UNK = "<unk_wid>"
            for i in range(len(e)):
                if e[i]["label"]!=UNK:
                    all_words = False
                    while not all_words and sentence_num < len(t):
                        all_words = True
                        #print(e[i])
                        for word in range(e[i]["start"],e[i]["end"]+1):
                            if len(t[sentence_num])<=word or t[sentence_num][word] not in e[i]["tokens"]:
                                all_words = False
                        if not all_words:
                            sentence_num+=1
                    if sentence_num == len(t):
                        print("Error with sentence_num")
                    else:
                        entity_list['mentions'].append({'entity':e[i]["label"],'span':[unflattened_c[sentence_num][e[i]['start']][0],unflattened_c[sentence_num][e[i]['end']][1]]})
            #print("Entity list is {}".format(entity_list))

            all_entities.append(entity_list)
            local_vars = list(locals().items())
            del reader
     
            del predTypScNPmat_list
            del widIdxs_list
            del priorProbs_list
            del textProbs_list
            del jointProbs_list
            del evWTs_list
            del model
            del pred_TypeSetsList
            print("Memory usage {}".format(getCurrentMemoryUsage()))
            #print("All entities are {}".format(all_entities))
        del sess"""
        gc.collect()
        tf.reset_default_graph()

    w = open(output_file, "w")
    w.write(json.dumps(all_entities))
    w.close()

    print("Dumped JSON, all done")
    print("Took {} time".format(time.time() - prog_start))
    return
Example #9
        numCands = numCands / float(numMens)

        print("Training Known Candidates Stats : ")
        print("Number of mentions: {}".format(numMens))
        print("Recall @ 1 : {}".format(recallAt1))
        print("Recall @ 30 : {}".format(recallAt30))
        print("No Cands : {}".format(noCands))
        print("Correct WID not in Cands : {}".format(candsButNotCorr))
        print("Num of Cands : {}".format(numCands))

    #enddef


if __name__ == '__main__':
    sttime = time.time()
    batch_size = 1000
    num_batch = 1000
    configpath = "configs/wcoh_config.ini"
    config = Config(configpath)
    vocabloader = VocabLoader(config)
    b = CandidateStats(config=config, vocabloader=vocabloader, num_cands=30)

    stime = time.time()

    #b.allCandidateStats()
    #b.knwnCandsDictStats()
    b.validationKnwnCandsStats()
    b.testKnwnCandsStats()
    #b.trainKnwnCandsStats()
    sys.exit()
Example #10
        mentions = utils.make_mentions_from_file(mens_file=test_file)
        self._addCandidatesForMentions(mentions, cands_dict)

        return cands_dict

    def updateTestCandsDict(self, test_file):

        print("Updating Test Candidates Dict. Size:{}\n"
              "Key:(surface, wid), V: ([CandWids], [PriorProbs])".format(
                  len(self.test_kwnen_cands_dict)))
        print("Test File: {}".format(test_file))
        test_cands_dict = self.make_test_candidates(test_file)

        self.test_kwnen_cands_dict.update(test_cands_dict)

        utils.save(self.config.test_kwnen_cands_pkl,
                   self.test_kwnen_cands_dict)
        print("Test Candidates Dict Saved. Size:{}".format(
            len(self.test_kwnen_cands_dict)))


if __name__ == '__main__':
    config = Config("configs/config.ini")
    vocabloader = VocabLoader(config)
    b = TestCandidateDictionary(config=config, vocabloader=vocabloader)

    b.updateTestCandsDict(test_file=config.aida_kwn_dev_file)
    b.updateTestCandsDict(test_file=config.aida_kwn_test_file)

    sys.exit(0)
Example #11
def main(_):
    pp.pprint(flags.FLAGS.__flags)

    FLAGS_check(FLAGS)

    config = Config(FLAGS.config, verbose=False)
    vocabloader = VocabLoader(config)

    if FLAGS.mode == 'inference':
        FLAGS.dropout_keep_prob = 1.0
        FLAGS.wordDropoutKeep = 1.0
        FLAGS.cohDropoutKeep = 1.0

        reader = InferenceReader(config=config,
                                 vocabloader=vocabloader,
                                 test_mens_file=config.test_file,
                                 num_cands=FLAGS.num_cand_entities,
                                 batch_size=FLAGS.batch_size,
                                 strict_context=FLAGS.strict_context,
                                 pretrain_wordembed=FLAGS.pretrain_wordembed,
                                 coherence=FLAGS.coherence)
        docta = reader.ccgdoc
        model_mode = 'inference'

    elif FLAGS.mode == 'test':
        FLAGS.dropout_keep_prob = 1.0
        FLAGS.wordDropoutKeep = 1.0
        FLAGS.cohDropoutKeep = 1.0

        reader = TestDataReader(config=config,
                                vocabloader=vocabloader,
                                test_mens_file=config.test_file,
                                num_cands=30,
                                batch_size=FLAGS.batch_size,
                                strict_context=FLAGS.strict_context,
                                pretrain_wordembed=FLAGS.pretrain_wordembed,
                                coherence=FLAGS.coherence)
        model_mode = 'test'

    else:
        print("MODE in FLAGS is incorrect : {}".format(FLAGS.mode))
        sys.exit()

    config_proto = tf.ConfigProto()
    config_proto.allow_soft_placement = True
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)

    with sess.as_default():
        model = ELModel(
            sess=sess,
            reader=reader,
            dataset=FLAGS.dataset,
            max_steps=FLAGS.max_steps,
            pretrain_max_steps=FLAGS.pretraining_steps,
            word_embed_dim=FLAGS.word_embed_dim,
            context_encoded_dim=FLAGS.context_encoded_dim,
            context_encoder_num_layers=FLAGS.context_encoder_num_layers,
            context_encoder_lstmsize=FLAGS.context_encoder_lstmsize,
            coherence_numlayers=FLAGS.coherence_numlayers,
            jointff_numlayers=FLAGS.jointff_numlayers,
            learning_rate=FLAGS.learning_rate,
            dropout_keep_prob=FLAGS.dropout_keep_prob,
            reg_constant=FLAGS.reg_constant,
            checkpoint_dir=FLAGS.checkpoint_dir,
            optimizer=FLAGS.optimizer,
            mode=model_mode,
            strict=FLAGS.strict_context,
            pretrain_word_embed=FLAGS.pretrain_wordembed,
            typing=FLAGS.typing,
            el=FLAGS.el,
            coherence=FLAGS.coherence,
            textcontext=FLAGS.textcontext,
            useCNN=FLAGS.useCNN,
            WDLength=FLAGS.WDLength,
            Fsize=FLAGS.Fsize,
            entyping=FLAGS.entyping)

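        # 'inference' mode prints per-mention predictions; 'test' mode writes candidate scores to test_out_fp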
        if FLAGS.mode == 'inference':
            print("Doing inference")
            (predTypScNPmat_list, widIdxs_list, priorProbs_list,
             textProbs_list, jointProbs_list, evWTs_list,
             pred_TypeSetsList) = model.inference(ckptpath=FLAGS.model_path)

            numMentionsInference = len(widIdxs_list)
            numMentionsReader = 0
            for sent_idx in reader.sentidx2ners:
                numMentionsReader += len(reader.sentidx2ners[sent_idx])
            assert numMentionsInference == numMentionsReader

            mentionnum = 0
            entityTitleList = []
            for sent_idx in reader.sentidx2ners:
                nerDicts = reader.sentidx2ners[sent_idx]
                sentence = ' '.join(reader.sentences_tokenized[sent_idx])
                for s, ner in nerDicts:
                    [evWTs, evWIDS, evProbs] = evWTs_list[mentionnum]
                    predTypes = pred_TypeSetsList[mentionnum]
                    print(reader.bracketMentionInSentence(sentence, ner))
                    print("Prior: {} {}, Context: {} {}, Joint: {} {}".format(
                        evWTs[0], evProbs[0], evWTs[1], evProbs[1], evWTs[2],
                        evProbs[2]))

                    entityTitleList.append(evWTs[2])
                    print("Predicted Entity Types : {}".format(predTypes))
                    print("\n")
                    mentionnum += 1

            elview = copy.deepcopy(docta.view_dictionary['NER_CONLL'])
            elview.view_name = 'ENG_NEURAL_EL'
            for i, cons in enumerate(elview.cons_list):
                cons['label'] = entityTitleList[i]

            docta.view_dictionary['ENG_NEURAL_EL'] = elview

            print("elview.cons_list")
            print(elview.cons_list)
            print("\n")

            for v in docta.as_json['views']:
                print(v)
                print("\n")

        elif FLAGS.mode == 'test':
            print("Testing on Data ")
            (widIdxs_list, condProbs_list, contextProbs_list,
             condContextJointProbs_list, evWTs,
             sortedContextWTs) = model.dataset_test(ckptpath=FLAGS.model_path)

            print(len(widIdxs_list))
            print(len(condProbs_list))
            print(len(contextProbs_list))
            print(len(condContextJointProbs_list))
            print(len(reader.mentions))

            print("Writing Test Predictions: {}".format(FLAGS.test_out_fp))
            with open(FLAGS.test_out_fp, 'w') as f:
                for (wididxs, pps, mps,
                     jps) in zip(widIdxs_list, condProbs_list,
                                 contextProbs_list,
                                 condContextJointProbs_list):

                    mentionPred = ""

                    for (wididx, prp, mp, jp) in zip(wididxs, pps, mps, jps):
                        wit = reader.widIdx2WikiTitle(wididx)
                        mentionPred += wit + " " + str(prp) + " " + \
                            str(mp) + " " + str(jp)
                        mentionPred += "\t"

                    mentionPred = mentionPred.strip() + "\n"

                    f.write(mentionPred)

            print("Done writing. Can Exit.")

        else:
            print("WRONG MODE!")
            sys.exit(0)

    sys.exit()
Example #12
def main(_):
    pp.pprint(flags.FLAGS.__flags)

    FLAGS_check(FLAGS)

    config = Config(FLAGS.config, verbose=False)
    vocabloader = VocabLoader(config)

    if FLAGS.mode == 'inference':
        FLAGS.dropout_keep_prob = 1.0
        FLAGS.wordDropoutKeep = 1.0
        FLAGS.cohDropoutKeep = 1.0

        reader = InferenceReader(config=config,
                                 vocabloader=vocabloader,
                                 test_mens_file=config.test_file,
                                 num_cands=FLAGS.num_cand_entities,
                                 batch_size=FLAGS.batch_size,
                                 strict_context=FLAGS.strict_context,
                                 pretrain_wordembed=FLAGS.pretrain_wordembed,
                                 coherence=FLAGS.coherence)
        model_mode = 'test'
    else:
        print("MODE in FLAGS is incorrect : {}".format(FLAGS.mode))
        sys.exit()

    config_proto = tf.ConfigProto()
    config_proto.allow_soft_placement = True
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)

    with sess.as_default():
        model = ELModel(
            sess=sess,
            reader=reader,
            dataset=FLAGS.dataset,
            max_steps=FLAGS.max_steps,
            pretrain_max_steps=FLAGS.pretraining_steps,
            word_embed_dim=FLAGS.word_embed_dim,
            context_encoded_dim=FLAGS.context_encoded_dim,
            context_encoder_num_layers=FLAGS.context_encoder_num_layers,
            context_encoder_lstmsize=FLAGS.context_encoder_lstmsize,
            coherence_numlayers=FLAGS.coherence_numlayers,
            jointff_numlayers=FLAGS.jointff_numlayers,
            learning_rate=FLAGS.learning_rate,
            dropout_keep_prob=FLAGS.dropout_keep_prob,
            reg_constant=FLAGS.reg_constant,
            checkpoint_dir=FLAGS.checkpoint_dir,
            optimizer=FLAGS.optimizer,
            mode=model_mode,
            strict=FLAGS.strict_context,
            pretrain_word_embed=FLAGS.pretrain_wordembed,
            typing=FLAGS.typing,
            el=FLAGS.el,
            coherence=FLAGS.coherence,
            textcontext=FLAGS.textcontext,
            useCNN=FLAGS.useCNN,
            WDLength=FLAGS.WDLength,
            Fsize=FLAGS.Fsize,
            entyping=FLAGS.entyping)

        if FLAGS.mode == 'inference':
            print("Doing inference")
            (predTypScNPmat_list, widIdxs_list, priorProbs_list,
             textProbs_list, evWTs_list,
             pred_TypeSetsList) = model.inference(ckptpath=FLAGS.model_path)

            numMentionsInference = len(widIdxs_list)
            numMentionsReader = 0
            for sent_idx in reader.sentidx2ners:
                numMentionsReader += len(reader.sentidx2ners[sent_idx])
            assert numMentionsInference == numMentionsReader

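            # Print the bracketed mention, the top prior/context/joint entities, and the predicted types for each NER span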
            mentionnum = 0
            for sent_idx in reader.sentidx2ners:
                nerDicts = reader.sentidx2ners[sent_idx]
                sentence = ' '.join(reader.sentences_tokenized[sent_idx])
                for s, ner in nerDicts:
                    [evWTs, evWIDS, evProbs] = evWTs_list[mentionnum]
                    predTypes = pred_TypeSetsList[mentionnum]
                    print(reader.bracketMentionInSentence(sentence, ner))
                    print("Prior: {} {}, Context: {} {}, Joint: {} {}".format(
                        evWTs[0], evProbs[0], evWTs[1], evProbs[1], evWTs[2],
                        evProbs[2]))
                    print("Predicted Entity Types : {}".format(predTypes))
                    print("\n")
                    mentionnum += 1

        else:
            print("WRONG MODE!")
            sys.exit(0)
    sys.exit()
Example #13
def main(_):
    pp.pprint(flags.FLAGS.__flags)

    FLAGS_check(FLAGS)

    config = Config(FLAGS.config, verbose=False)
    vocabloader = VocabLoader(config)

    FLAGS.dropout_keep_prob = 1.0
    FLAGS.wordDropoutKeep = 1.0
    FLAGS.cohDropoutKeep = 1.0

    input_jsonl = FLAGS.input_jsonl
    output_jsonl = FLAGS.output_jsonl
    doc_key = FLAGS.doc_key

    reader = TextAnnoTestReader(config=config,
                                vocabloader=vocabloader,
                                num_cands=30,
                                batch_size=FLAGS.batch_size,
                                strict_context=FLAGS.strict_context,
                                pretrain_wordembed=FLAGS.pretrain_wordembed,
                                coherence=FLAGS.coherence)
    model_mode = 'test'

    config_proto = tf.ConfigProto()
    config_proto.allow_soft_placement = True
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)

    with sess.as_default():
        model = ELModel(
            sess=sess,
            reader=reader,
            dataset=FLAGS.dataset,
            max_steps=FLAGS.max_steps,
            pretrain_max_steps=FLAGS.pretraining_steps,
            word_embed_dim=FLAGS.word_embed_dim,
            context_encoded_dim=FLAGS.context_encoded_dim,
            context_encoder_num_layers=FLAGS.context_encoder_num_layers,
            context_encoder_lstmsize=FLAGS.context_encoder_lstmsize,
            coherence_numlayers=FLAGS.coherence_numlayers,
            jointff_numlayers=FLAGS.jointff_numlayers,
            learning_rate=FLAGS.learning_rate,
            dropout_keep_prob=FLAGS.dropout_keep_prob,
            reg_constant=FLAGS.reg_constant,
            checkpoint_dir=FLAGS.checkpoint_dir,
            optimizer=FLAGS.optimizer,
            mode=model_mode,
            strict=FLAGS.strict_context,
            pretrain_word_embed=FLAGS.pretrain_wordembed,
            typing=FLAGS.typing,
            el=FLAGS.el,
            coherence=FLAGS.coherence,
            textcontext=FLAGS.textcontext,
            useCNN=FLAGS.useCNN,
            WDLength=FLAGS.WDLength,
            Fsize=FLAGS.Fsize,
            entyping=FLAGS.entyping)

        model.load_ckpt_model(ckptpath=FLAGS.model_path)

        erroneous_files = 0

        outf = open(output_jsonl, 'w')
        inpf = open(input_jsonl, 'r')

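        # Annotate each JSONL document and write its entity-linked mentions under the 'nel' key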
        for line in inpf:
            jsonobj = json.loads(line)
            doctext = jsonobj[doc_key]
            ta = localpipeline.doc(doctext, pretokenized=FLAGS.pretokenized)
            _ = ta.get_ner_conll

            # Make instances for this document
            reader.new_ta(ta)

            (predTypScNPmat_list, widIdxs_list, priorProbs_list,
             textProbs_list, jointProbs_list, evWTs_list,
             pred_TypeSetsList) = model.inference_run()

            wiki_view = copy.deepcopy(reader.textanno.get_view("NER_CONLL"))
            docta = reader.textanno

            el_cons_list = wiki_view.cons_list
            numMentionsInference = len(widIdxs_list)

            assert len(el_cons_list) == numMentionsInference

            out_dict = {doc_key: doctext}
            el_mentions = []

            mentionnum = 0
            for ner_cons in el_cons_list:
                # ner_cons is a dict
                mentiondict = {}
                mentiondict['tokens'] = ner_cons['tokens']
                mentiondict['end'] = ner_cons['end']
                mentiondict['start'] = ner_cons['start']

                priorScoreMap = {}
                contextScoreMap = {}
                jointScoreMap = {}

                (wididxs, pps, mps,
                 jps) = (widIdxs_list[mentionnum], priorProbs_list[mentionnum],
                         textProbs_list[mentionnum],
                         jointProbs_list[mentionnum])

                maxJointProb = 0.0
                maxJointEntity = ""
                for (wididx, prp, mp, jp) in zip(wididxs, pps, mps, jps):
                    wT = reader.widIdx2WikiTitle(wididx)
                    priorScoreMap[wT] = prp
                    contextScoreMap[wT] = mp
                    jointScoreMap[wT] = jp

                    if jp > maxJointProb:
                        maxJointProb = jp
                        maxJointEntity = wT

                mentiondict["jointScoreMap"] = jointScoreMap
                mentiondict["contextScoreMap"] = contextScoreMap
                mentiondict["priorScoreMap"] = priorScoreMap

                # add max scoring entity as label
                mentiondict["label"] = maxJointEntity
                mentiondict["score"] = maxJointProb

                mentionnum += 1

                el_mentions.append(mentiondict)

            out_dict['nel'] = el_mentions
            outstr = json.dumps(out_dict)
            outf.write(outstr)
            outf.write("\n")

        outf.close()
        inpf.close()

        print("Number of erroneous files: {}".format(erroneous_files))
        print("Annotation completed. Program can be exited safely.")
    sys.exit()