Beispiel #1
0
    def _disambiguation(self, query, cand):
        """Builds one instance per candidate and applies the model to them.

        :param query: query whose ``qid`` and ``raw_query`` are copied into
            the properties of every instance
        :type query: Query
        :param cand: iterable of candidates; each one is indexed with the
            keys 'entity_id' and 'mention'
        :return: the result of ``self.apply_model`` on the built instances
        """
        # create instances
        inss = Instances()
        for c in cand:
            entity_id, mention = c['entity_id'], c['mention']
            # The ID must embed the candidate's own values; joining the
            # literal key names would give every candidate the same ID.
            ins_id = "-".join(
                str(part) for part in (query.qid, entity_id, mention))
            # TODO we need a 'normalized' (lowercased, special chars removed,
            # TODO etc.) but not fully preprocessed query here instead of the
            # TODO raw query
            prop = {
                'qid': query.qid,
                'query': query.raw_query,
                'entity_id': entity_id,
                'mention': mention
            }
            ins = Instance(ins_id, properties=prop)
            # get features
            ins.features = self.get_features(ins)
            # Register the instance; otherwise the model below would be
            # applied to an empty container.
            inss.add_instance(ins)

        # apply model
        return self.apply_model(inss)
Beispiel #2
0
    def __gen_raw_ins(self, en_id, mention):
        """Creates a feature-less instance for an (entity, mention) pair.

        The instance ID is the query ID, entity ID and mention joined with
        underscores. The 'session' property is the query ID up to (but not
        including) its last underscore, or the whole query ID when it
        contains no underscore.

        :param en_id: entity ID (string)
        :param mention: mention (string)
        :return: Instance carrying the properties 'qid', 'query', 'en_id',
            'mention' and 'session'
        """
        qid = self.__query.qid
        raw_ins = Instance("_".join([qid, en_id, mention]))

        # rpartition yields an empty separator when no "_" is present.
        prefix, underscore, _ = qid.rpartition("_")
        session = prefix if underscore else qid

        for key, value in (("qid", self.__query.qid),
                           ("query", self.__query.query),
                           ("en_id", en_id),
                           ("mention", mention),
                           ("session", session)):
            raw_ins.add_property(key, value)
        return raw_ins
Beispiel #3
0
    def from_json(cls, json_file):
        """Loads instances from a JSON file.

        The top-level JSON value is iterated with ``.items()``; every
        (key, value) pair is turned into an instance through
        ``Instance.from_json(key, value)``.

        :param json_file: (string) path of the JSON file
        :return Instances object (``cls`` built from the list of instances)
        """
        print("Reading JSON file " + json_file + " ...")
        # Context manager so the handle is closed even if parsing fails.
        with open(json_file) as json_data:
            data = json.load(json_data)
        # read instances
        instance_list = [Instance.from_json(ins_id, fields)
                         for ins_id, fields in data.items()]
        return cls(instance_list)
Beispiel #4
0
    def __init__(self, instances=None):
        """Initializes the container, optionally filling it with instances.

        :param instances: instances in a list or dict
            - if list then every element is passed to ``add_instance`` as is
            - if dict then each (key, value) pair is converted with
              ``Instance.from_json(key, value)`` and the result is added
            - any other value (including None) leaves the container empty
        """
        self.__instances = {}

        # isinstance (rather than an exact type comparison) so that list and
        # dict subclasses are not silently ignored.
        if isinstance(instances, list):
            for ins in instances:
                self.add_instance(ins)

        elif isinstance(instances, dict):
            for ins_id, fields in instances.items():
                instance = Instance.from_json(ins_id, fields)
                self.add_instance(instance)
Beispiel #5
0
def main(args):
    """Scores the documents of a run and writes TREC and JSON output.

    Reads the configuration from ``args.config`` and uses its keys
    "text_index", "uri_index", "lambdas" (defaults to [0.9, 0.1]),
    "query_file", "mapping_file", "annot_file", "run_file", "output_file"
    and "json_file". For every (query, document) pair of the run, the run
    score ``p_T_d`` and the scorer output ``p_E_d`` are stored as features of
    an instance and combined as ``lambdas[0] * p_T_d + lambdas[1] * p_E_d``.
    The combined scores go to the output file via ``trec_format``; all
    instances are dumped with ``Instances.to_json``.

    :param args: object whose ``config`` attribute is handed to
        ``FileUtils.load_config``
    """
    config = FileUtils.load_config(args.config)
    elastic_term = ElasticCache(config["text_index"])
    lambdas = config.get("lambdas", [0.9, 0.1])

    # Context managers so the input handles are closed right after parsing.
    with open(config["query_file"], "r") as query_file:
        queries = json.load(query_file)
    with open(config["mapping_file"], "r") as mapping_file:
        mappings = json.load(mapping_file)
    annots = load_annot(config["annot_file"])
    run = load_run(config["run_file"])

    instances = Instances()
    # gets the results; the output file is closed even if scoring raises
    with open(config["output_file"], "w") as out_file:
        # qid_int is the position of the query in sorted order
        for qid_int, (qid, query) in enumerate(sorted(queries.items())):
            print("Scoring ", qid, "...")
            results = {}
            query_len = len(elastic_term.analyze_query(query).split())
            scorer = ScorerELR(ElasticCache(config["uri_index"]),
                               annots[qid], query_len, lambdas)
            for doc_id, p_T_d in sorted(run[qid].items()):
                # NOTE(review): the arguments do not depend on doc_id, so
                # this looks hoistable out of the document loop -- confirm
                # that get_mapping_query is pure and score_doc does not
                # mutate its result before moving it.
                query_mappings = get_mapping_query(annots[qid], mappings)
                p_E_d = scorer.score_doc(doc_id, query_mappings)
                properties = {
                    'doc_id': doc_id,
                    'query': query,
                    'qid': qid,
                    'qid_int': qid_int
                }
                features = {'p_T_d': p_T_d, 'p_E_d': p_E_d}
                ins = Instance(qid + "_" + doc_id,
                               features=features,
                               properties=properties)
                instances.add_instance(ins)
                results[doc_id] = (lambdas[0] * p_T_d) + (lambdas[1] * p_E_d)

            # Write trec format
            out_str = trec_format(results, qid, "elr")
            out_file.write(out_str)

    print("Output file:", config["output_file"])
    instances.to_json(config["json_file"])
    print("Output file:", config["json_file"])
Beispiel #6
0
    def __load_from_tsv(self, tsv_file, type, params):
        """Loads instances from a TSV file.

        The first header column is skipped when validating the header; every
        row is looked up by its "id" field. Rows whose ID is already stored
        update the existing instance, other rows create a new one.

        :param tsv_file: name of the TSV file
        :param type: type of the data: "properties", "features" or "target";
            any other value leaves the instances without added params
        :param params: list of columns mapped to properties or features
        :raises Exception: if the header columns after the first one are not
            exactly ``params`` (this includes a file without a header)
        """
        # The csv module needs a text-mode file opened with newline="";
        # a binary handle makes the reader fail on Python 3.
        with open(tsv_file, "r", newline="") as tsvfile:
            reader = csv.DictReader(tsvfile,
                                    delimiter="\t",
                                    quoting=csv.QUOTE_NONE)

            # fieldnames is None for an empty file; treat it as no header
            header = reader.fieldnames or []

            # Checks all the params are in the TSV file header
            if set(params) != set(header[1:]):
                raise Exception("TSV header does not match params \"" +
                                ",".join(params) + "\" in file:\n\t" +
                                tsv_file)

            # Reads tsv lines
            for line in reader:
                ins_id = line["id"]

                # Generating instance
                if ins_id in self.__instances:  # existing instance
                    ins = self.get_instance(ins_id)
                else:  # new instance
                    ins = Instance(ins_id)
                    self.add_instance(ins)

                # adding params
                for param in params:
                    if type == "properties":
                        ins.add_property(param, line[param])
                    elif type == "features":
                        ins.add_feature(param, line[param])
                    elif type == "target":
                        # with several params the last one wins
                        ins.target = line[param]