Example #1
    def run(self):
        # load training instances
        ins_train = Instances.from_json(self.__config["training_set"])

        # Cross Validation
        if "cross_validation" in self.__config:
            cv = CrossValidation(self.__config["cross_validation"]["k"],
                                 ins_train, self.train_model, self.apply_model)
            split_strategy = self.__config["cross_validation"].get(
                "split_strategy", None)
            split_file = self.__config["cross_validation"]["splits_file"]
            # Always create new splits when "create_splits" is set to true
            if self.__config["cross_validation"].get("create_splits", False):
                cv.create_folds(split_strategy)
                cv.save_folds(split_file)
            # New splits will be created only if the provided split_file does not exist.
            else:
                cv.get_folds(split_file, split_strategy)
            inss = cv.run()

        # classic test-train split
        else:
            ins_test = Instances.from_json(self.__config["test_set"])
            model = self.train_model(ins_train)
            inss = self.apply_model(ins_test, model)

        # output results (which are stored in inss)
        inss.to_json(self.__config["output_file"])
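For reference, a minimal config that would drive this run() method might look as follows. The keys mirror exactly what the method reads above; the file paths are placeholders.

    # Hypothetical config for run(); keys taken from the method above, paths invented.
    config = {
        "training_set": "data/train.json",
        "cross_validation": {
            "k": 5,
            "splits_file": "data/splits.json",
            "create_splits": True,    # force creation of fresh folds
            "split_strategy": None    # fall back to the default grouping
        },
        "output_file": "output/results.json"
    }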
Example #2
    def gen_train_set(gt, query_file, train_set):
        """Trains LTR model for entity linking."""
        entity = Entity()
        elastic = ElasticCache(ELASTIC_INDICES[0])
        fcache = FeatureCache()
        inss = Instances()
        positive_annots = set()

        # Adds groundtruth instances (positive instances)
        PLOGGER.info("Adding groundtruth instances (positive instances) ....")
        for item in sorted(gt):  # qid, query, en_id, mention
            ltr = LTR(Query(item[1], item[0]), entity, elastic, fcache)
            ins = ltr.__gen_raw_ins(item[2], item[3])
            ins.features = ltr.get_features(ins)
            ins.target = 1
            inss.add_instance(ins)
            positive_annots.add((item[0], item[2]))

        # Adds all other instances
        PLOGGER.info("Adding all other instances (negative instances) ...")
        for qid, q in sorted(json.load(open(query_file, "r")).items()):
            PLOGGER.info("Query [" + qid + "]")
            ltr = LTR(Query(q, qid), entity, elastic, fcache)
            q_inss = ltr.get_candidate_inss()
            for ins in q_inss.get_all():
                if (qid, ins.get_property("en_id")) in positive_annots:
                    continue
                ins.target = 0
                inss.add_instance(ins)
        inss.to_json(train_set)
Example #3
 def prune_by_score(self, query_inss):
     """ prunes based on a static threshold of ranking score."""
     valid_inss = []
     for ins in query_inss.get_all():
         if ins.score >= self.__score_th:
             valid_inss.append(ins)
     return Instances(valid_inss)
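Since the loop only filters, it can be collapsed into a single comprehension with identical behavior:

    # Equivalent one-liner for the filtering loop above.
    return Instances([ins for ins in query_inss.get_all()
                      if ins.score >= self.__score_th])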
Example #4
    def train(self):
        """Trains the model."""
        self.__ml_config["save_model"] = self.__model_file

        ml = ML(self.__ml_config)
        ins_train = Instances.from_json(self.__train_file)
        ml.train_model(ins_train)
Example #5
    def _disambiguation(self, query, cand):
        """

        :param query:
        :type query: Query
        :return:
        """
        # create instances
        inss = Instances()
        for c in cand:
            ins_id = "-".join([query.qid, "entity_id", "mention"])
            # TODO we need a 'normalized' (lowercased, special chars removed, etc.) but not fully
            # TODO prepocessed query here instead of the raw query
            prop = {
                'qid': query.qid,
                'query': query.raw_query,
                'entity_id': c['entity_id'],
                'mention': c['mention']
            }
            ins = Instance(ins_id, properties=prop)
            # get features and add the instance to the collection
            ins.features = self.get_features(ins)
            inss.add_instance(ins)

        # apply model
        return self.apply_model(inss)
Example #6
 def __items2inss(self, person_items):
     """Converts (person, item) tuples to instances."""
     all_inss = Instances.from_json(self.__inss_file)
     print("Converting items to instances ...")
     inss = Instances()
     for person, item in person_items:
         person_id = self.__wsdmcup_ids.get_id_from_person(person)
         if self.__relation == REL_PROFESSION:
             item_id = self.__wsdmcup_ids.get_id_from_prof(item)
         else:
             item_id = self.__wsdmcup_ids.get_id_from_nation(item)
         ins_id = person_id + "_" + item_id
         ins = all_inss.get_instance(ins_id)
         if ins is None:
             print(person, item, "not found!")
             continue
         inss.add_instance(ins)
     return inss
Example #7
    def train(config):
        LTR.__check_config(config)
        if config.get("gen_training_set", False):
            gt = LTR.load_yerd(config["ground_truth"])
            LTR.gen_train_set(gt, config["query_file"], config["training_set"])

        instances = Instances.from_json(config["training_set"])
        ML(config).train_model(instances)
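A config sketch for this entry point, with keys taken from the code above (ML and __check_config presumably expect additional, model-specific keys not shown here); the paths are placeholders:

    # Hypothetical config for LTR.train(); paths invented.
    config = {
        "gen_training_set": True,
        "ground_truth": "data/yerd_groundtruth.tsv",
        "query_file": "data/queries.json",
        "training_set": "data/ltr_train.json"
    }
    LTR.train(config)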
Example #8
    def get_candidate_inss(self):
        """Detects mentions and their candidate entities (with their commoness scores) and generates instances

        :return: Instances object
        """
        instances = Instances()
        for ngram in self.__query.get_ngrams():
            cand_ens = Mention(ngram, self.__entity,
                               self.__cmns_th).get_cand_ens()
            for en_id, commonness in cand_ens.items():
                if not is_name_entity(en_id):
                    continue
                self.__fcache.set_feature_val("commonness",
                                              en_id + "_" + ngram, commonness)
                ins = self.__gen_raw_ins(en_id, ngram)
                ins.features = self.get_features(ins, cand_ens)
                instances.add_instance(ins)
        return instances
Example #9
    def run(self):
        """Runs cross-validation."""

        # if folds haven't been initialized before (with get_folds or create_folds),
        # they'll be created using the default grouping (i.e., based on instance_id)
        if self.folds is None:
            self.create_folds()

        # this holds the estimated target values (and also the confidence score, if available)
        test_inss = Instances()

        for i, fold in enumerate(self.folds):
            print("=======================================")
            print("Cross validation for fold " + str(i) + " ...")
            model = self.callback_train(self.get_instances(fold, "training"))
            fold_test_inss = self.callback_test(self.get_instances(fold, "testing"), model)
            test_inss.append_instances(fold_test_inss.get_all())

        return test_inss
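Tying this back to Example #1: callback_train and callback_test are the train/apply functions passed positionally to the CrossValidation constructor, so a minimal wiring looks roughly like this (k and the file name are placeholders):

    # Sketch: CrossValidation wiring, mirroring Example #1.
    cv = CrossValidation(5, ins_train, train_model, apply_model)
    cv.get_folds("data/splits.json", None)  # load or create folds
    test_inss = cv.run()  # concatenated held-out predictions across all folds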
Example #10
    def batch_linking(self):
        """Scores queries in a batch and outputs results."""
        results = {}

        if self.__config["step"] == "linking":
            queries = json.load(open(self.__query_file))
            for qid in sorted(queries):
                results[qid] = self.link(queries[qid], qid)
            json.dump(results,
                      open(self.__output_file, "w"),
                      indent=4,
                      sort_keys=True)

        # only ranking step
        elif self.__config["step"] == "ranking":
            queries = json.load(open(self.__query_file))
            for qid in sorted(queries):
                linker = self.__get_linker(Query(queries[qid], qid))
                results[qid] = linker.rank_ens()
            ranked_inss = Instances(
                sum([inss.get_all() for inss in results.values()], []))
            ranked_inss.to_json(self.__output_file)

        # only disambiguation step
        elif self.__config["step"] == "disambiguation":
            inss = Instances.from_json(self.__config["test_set"])
            inss_by_query = inss.group_by_property("qid")
            for qid, q_inss in sorted(inss_by_query.items()):
                linker = self.__get_linker("")
                results[qid] = linker.disambiguate(Instances(q_inss))
            to_elq_eval(results, self.__output_file)

        PLOGGER.info("Output file: " + self.__output_file)
Example #11
    def prune_containment_mentions(self, query_inss):
        """Deletes containment mentions, if they have lower score."""
        if len(query_inss.get_all()) == 0:
            return query_inss

        valid_inss = []
        valid_mens = set()
        for ins in sorted(query_inss.get_all(), key=lambda item: item.score, reverse=True):
            is_contained = False
            cand_men = ins.get_property("mention")
            for men in valid_mens:
                if (cand_men != men) and ((cand_men in men) or (men in cand_men)):
                    is_contained = True
            if not is_contained:
                # valid_inss[ins.get_property("mention")] = ins
                valid_inss.append(ins)
                valid_mens.add(ins.get_property("mention"))  # @todo: This line should be fixed
        return Instances(valid_inss) #list(valid_inss.values()))
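A short walk-through of the pruning order, with invented mentions and scores: instances are visited from the highest score down, and a mention survives only if it neither contains nor is contained in an already-kept mention.

    # Hypothetical trace of prune_containment_mentions:
    #   mention       score
    #   "new york"    0.8   -> kept (nothing kept yet)
    #   "york"        0.5   -> dropped (contained in "new york")
    #   "weather"     0.4   -> kept (no containment relation)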
Example #12
def main(args):
    config = FileUtils.load_config(args.config)
    elastic_term = ElasticCache(config["text_index"])
    lambdas = config.get("lambdas", [0.9, 0.1])

    queries = json.load(open(config["query_file"], "r"))
    mappings = json.load(open(config["mapping_file"], "r"))
    annots = load_annot(config["annot_file"])
    run = load_run(config["run_file"])

    instances = Instances()
    # scores each query's documents and writes the results in TREC format
    out_file = open(config["output_file"], "w")
    qid_int = 0
    for qid, query in sorted(queries.items()):
        print("Scoring ", qid, "...")
        results = {}
        query_len = len(elastic_term.analyze_query(query).split())
        scorer = ScorerELR(ElasticCache(config["uri_index"]), annots[qid],
                           query_len, lambdas)
        for doc_id, p_T_d in sorted(run[qid].items()):
            query_mappings = get_mapping_query(annots[qid], mappings)
            p_E_d = scorer.score_doc(doc_id, query_mappings)
            properties = {
                'doc_id': doc_id,
                'query': query,
                'qid': qid,
                'qid_int': qid_int
            }
            features = {'p_T_d': p_T_d, 'p_E_d': p_E_d}
            ins = Instance(qid + "_" + doc_id,
                           features=features,
                           properties=properties)
            instances.add_instance(ins)
            results[doc_id] = (lambdas[0] * p_T_d) + (lambdas[1] * p_E_d)
        qid_int += 1

        # Write trec format
        out_str = trec_format(results, qid, "elr")
        out_file.write(out_str)

    out_file.close()
    print("Output file:", config["output_file"])
    instances.to_json(config["json_file"])
    print("Output file:", config["json_file"])
Example #13
    def get_instances(self, i, mode, property=None):
        """
        Returns instances from the given fold i \in [0..k-1].

        :param i: fold number
        :param mode: training or testing
        :return Instances object
        """
        inss = Instances()
        if property:
            inss_by_prop = self.__instances.group_by_property(property)
            for l in self.folds[i][mode]:
                for ins in inss_by_prop[l]:
                    inss.add_instance(ins)
        else:
            for l in self.folds[i][mode]:
                inss.add_instance(self.__instances.get_instance(l))
        return inss
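This method implies the layout of self.folds: a list of k dicts, each mapping "training" and "testing" to lists of instance ids (or, when a property is given, to values of that property). A sketch with invented ids:

    # Implied folds structure for k = 3 (instance ids are hypothetical).
    folds = [
        {"training": ["ins_2", "ins_3"], "testing": ["ins_1"]},
        {"training": ["ins_1", "ins_3"], "testing": ["ins_2"]},
        {"training": ["ins_1", "ins_2"], "testing": ["ins_3"]},
    ]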