Ejemplo n.º 1
0
def main(args):
    """Score the documents of a run with ELR and write the rankings.

    Loads the configuration named by ``args.config`` and, for every query,
    combines the score read from the run file (``p_T_d``) with the score
    returned by ``ScorerELR.score_doc`` (``p_E_d``) as
    ``lambdas[0] * p_T_d + lambdas[1] * p_E_d``. The per-query rankings are
    written via ``trec_format`` to ``config["output_file"]``; every
    (query, document) pair is also collected as an ``Instance`` and dumped to
    ``config["json_file"]``.

    :param args: parsed command-line arguments; only ``args.config`` is read
    """
    config = FileUtils.load_config(args.config)
    elastic_term = ElasticCache(config["text_index"])
    lambdas = config.get("lambdas", [0.9, 0.1])

    # Context managers close the input handles as soon as parsing is done.
    with open(config["query_file"], "r") as query_f:
        queries = json.load(query_f)
    with open(config["mapping_file"], "r") as mapping_f:
        mappings = json.load(mapping_f)
    annots = load_annot(config["annot_file"])
    run = load_run(config["run_file"])

    instances = Instances()
    # The output file is closed even if scoring raises half-way through.
    with open(config["output_file"], "w") as out_file:
        # qid_int is the 0-based position of the query in sorted qid order.
        for qid_int, (qid, query) in enumerate(sorted(queries.items())):
            print("Scoring ", qid, "...")
            results = {}
            query_len = len(elastic_term.analyze_query(query).split())
            scorer = ScorerELR(ElasticCache(config["uri_index"]), annots[qid],
                               query_len, lambdas)
            for doc_id, p_T_d in sorted(run[qid].items()):
                # NOTE(review): the arguments do not depend on doc_id, so this
                # looks hoistable; kept per document because it is not
                # visible here whether score_doc mutates the mapping.
                query_mappings = get_mapping_query(annots[qid], mappings)
                p_E_d = scorer.score_doc(doc_id, query_mappings)
                properties = {
                    'doc_id': doc_id,
                    'query': query,
                    'qid': qid,
                    'qid_int': qid_int
                }
                features = {'p_T_d': p_T_d, 'p_E_d': p_E_d}
                ins = Instance(qid + "_" + doc_id,
                               features=features,
                               properties=properties)
                instances.add_instance(ins)
                results[doc_id] = (lambdas[0] * p_T_d) + (lambdas[1] * p_E_d)

            # Write trec format
            out_file.write(trec_format(results, qid, "elr"))

    print("Output file:", config["output_file"])
    instances.to_json(config["json_file"])
    print("Output file:", config["json_file"])
Ejemplo n.º 2
0
class FeaturesTermStats():
    """Person-nationality co-occurrence frequencies computed from an index.

    Every count is obtained by running a search against the ``content``
    field of the index wrapped by ``ElasticCache`` and counting the returned
    document ids.
    """
    CONTENT_FIELD = "content"
    STOPWORDS = [
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
        "in", "into", "is", "it", "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these", "they", "this", "to",
        "was", "will", "with"
    ]
    # Value passed as ``num`` to every search; presumably caps the number of
    # hits returned, in which case counts saturate here -- TODO confirm.
    _MAX_HITS = 10000

    def __init__(self, index_name=WP_ST_INDEX_ID):
        """Bind the instance to the index called `index_name`."""
        self.__elastic = ElasticCache(index_name)
        self.__stats = None

    def __count_hits(self, body):
        """Return how many document ids the search for `body` yields."""
        hits = self.__elastic.search_complex(body,
                                             self.CONTENT_FIELD,
                                             num=self._MAX_HITS)
        return len(hits.keys())

    def __count_cooccurrences(self, person_id, phrase):
        """Count documents matching `person_id` and containing `phrase`.

        `phrase` is run through the index analyzer first and then matched as
        a phrase.
        """
        analyzed_phrase = self.__elastic.analyze_query(phrase)
        body = {
            "query": {
                "bool": {
                    "must": [{
                        "match": {
                            "content": person_id
                        }
                    }, {
                        "match_phrase": {
                            "content": analyzed_phrase
                        }
                    }]
                }
            }
        }
        return self.__count_hits(body)

    def get_per_nat_tf(self, person_id, nats):
        r"""
        Compute freqPerNat: \frac{|\{s : pe \in s, nt \in s\}|}{|S(pe)|}

        The denominator is the number of hits of a term query for
        `person_id`; each numerator is the number of hits that match
        `person_id` and contain the nationality noun (resp. adjective) as a
        phrase.

        :param person_id: identifier of the person as it occurs in the
            ``content`` field
        :param nats: nationality+adj, e.g. Germany, German
        :return: freqPerNat as a pair (noun frequency, adjective frequency);
            (0.0, 0.0) when no document contains the person
        """
        body = {"query": {"bool": {"must": {"term": {"content": person_id}}}}}
        n_s_pe = self.__count_hits(body)  # number of sentences with person
        if n_s_pe == 0:
            # Both ratios are defined as 0 here, so the two co-occurrence
            # searches would be wasted work.
            return 0.0, 0.0

        n_co_noun = self.__count_cooccurrences(person_id, nats[0])
        n_co_adj = self.__count_cooccurrences(person_id, nats[1])
        return n_co_noun / n_s_pe, n_co_adj / n_s_pe

    def generate_features(self, kb_file, output_file1, output_file2):
        """Generate features of freq-person-nationality.

        Reads tab-separated lines ``person, nationality, noun, adjective``
        from `kb_file` and writes one TSV row per line to each output file.

        :param kb_file: input file, opened via FileUtils.open_file_by_type
        :param output_file1: TSV output for the noun-based frequency
        :param output_file2: TSV output for the adjective-based frequency
        :raises ValueError: if a non-blank line does not have four columns
        """
        # Context managers close both outputs even when a line fails to parse.
        with open(output_file1, "w") as fout1, \
                open(output_file2, "w") as fout2:
            # write tsv header
            header = ["person", "nationality", "freq_person_nationality_noun"]
            fout1.write("\t".join(header) + "\n")
            header = ["person", "nationality", "freq_person_nationality_adj"]
            fout2.write("\t".join(header) + "\n")

            with FileUtils.open_file_by_type(kb_file) as kb_f:
                for line_count, line in enumerate(kb_f, start=1):
                    print(line_count)
                    record = line.strip()
                    if not record:
                        # Tolerate blank lines (e.g. a trailing newline).
                        continue
                    person_id, nat_id, noun, adj = record.split("\t")
                    fpn_noun, fpn_adj = self.get_per_nat_tf(person_id,
                                                            [noun, adj])
                    fout1.write(
                        "\t".join([person_id, nat_id, str(fpn_noun)]) + "\n")
                    fout2.write(
                        "\t".join([person_id, nat_id, str(fpn_adj)]) + "\n")
Ejemplo n.º 3
0
class Retrieval(object):
    """Loads config file, checks params, and sets default values.

    Documents are retrieved in a first pass through Elastic; when a model is
    configured they are then re-scored one by one with a scorer object.

    :param config: retrieval config (JSON config file or a dictionary) of the shape:
    ::
        {
            "index_name": name of the index,
            "first_pass": {
                "num_docs": number of documents in first-pass scoring (default: 1000)
                "field": field used in first pass retrieval (default: Elastic.FIELD_CATCHALL)
                "fields_return": comma-separated list of fields to return for each hit (default: "")
                "model": first-pass model (default: Elastic.BM25)
            },
            "num_docs": number of documents to return (default: 100)
            "start": starting offset for ranked documents (default:0)
            "model": name of retrieval model; accepted values: [lm, mlm, prms] (default: lm)
            "field": field name for LM (default: catchall)
            "fields": list of fields for PRMS (default: [catchall])
            "field_weights": dictionary with fields and corresponding weights for MLM (default: {catchall: 1})
            "smoothing_method": accepted values: [jm, dirichlet] (default: dirichlet)
            "smoothing_param": value of lambda or mu; accepted values: [float or "avg_len"],
                                (jm default: 0.1, dirichlet default: 2000)

            "query_file": name of query file (JSON),
            "output_file": name of output file,
            "run_id": run id for TREC output
        }

    NOTE(review): check_config leaves "model" as None when it is not given
    (first-pass results are then returned as-is), which disagrees with the
    "default: lm" stated above -- confirm which one is intended.
    """
    FIELDED_MODELS = {"mlm", "prms"}
    LM_MODELS = {"lm", "mlm", "prms"}

    def __init__(self, config):
        """Validate `config` (filling in defaults in place) and connect to the index."""
        self.check_config(config)
        self.__config = config
        self.__index_name = config["index_name"]
        self.__first_pass_num_docs = int(config["first_pass"]["num_docs"])
        self.__first_pass_field = config["first_pass"]["field"]
        self.__first_pass_fields_return = config["first_pass"]["fields_return"]
        self.__first_pass_model = config["first_pass"]["model"]
        self.__start = int(config["start"])
        self.__model = config.get("model", None)
        self.__num_docs = int(config["num_docs"])
        self.__query_file = config.get("query_file", None)
        self.__output_file = config.get("output_file", None)
        # Never leave the run id as None: with no second-pass model the run
        # is labelled after the first-pass model, so trec_format always has a
        # label to write.
        self.__run_id = (config.get("run_id") or self.__model
                         or str(self.__first_pass_model))

        self.__elastic = ElasticCache(self.__index_name)

    @staticmethod
    def check_config(config):
        """Checks config parameters and sets default values.

        `config` is modified in place. Any exception raised during the check
        is printed and the process exits with status 1.
        """
        try:
            if config.get("index_name") is None:
                raise Exception("index_name is missing")

            # Checks first pass parameters
            if config.get("first_pass") is None:
                config["first_pass"] = {}
            first_pass = config["first_pass"]
            if first_pass.get("num_docs") is None:
                first_pass["num_docs"] = 1000
            if first_pass.get("field") is None:
                first_pass["field"] = Elastic.FIELD_CATCHALL
            if first_pass.get("fields_return") is None:
                first_pass["fields_return"] = ""
            if first_pass.get("model") is None:
                first_pass["model"] = Elastic.BM25

            if config.get("start") is None:
                config["start"] = 0
            if config.get("num_docs") is None:
                config["num_docs"] = 100

            # Make sure the key exists so it can be indexed directly below.
            if config.get("model") is None:
                config["model"] = None
            if config.get("field") is None:
                config["field"] = Elastic.FIELD_CATCHALL
            if config.get("fields") is None:
                config["fields"] = [Elastic.FIELD_CATCHALL]
            if config.get("field_weights") is None:
                config["field_weights"] = {Elastic.FIELD_CATCHALL: 1}
            if config["model"] in Retrieval.LM_MODELS:
                if config.get("smoothing_method") is None:
                    config["smoothing_method"] = ScorerLM.DIRICHLET
                if config.get("smoothing_param") is None:
                    if config["smoothing_method"] == ScorerLM.DIRICHLET:
                        config["smoothing_param"] = 2000
                    elif config["smoothing_method"] == ScorerLM.JM:
                        config["smoothing_param"] = 0.1
                    else:
                        raise Exception("Smoothing method is not supported.")
        except Exception as e:
            print("Error in config file: ", e)
            sys.exit(1)

    def _first_pass_scoring(self, analyzed_query):
        """Returns first-pass scoring of documents.

        :param analyzed_query: analyzed query
        :return: whatever ElasticCache.search returns; the rest of the class
            treats it as a mapping from doc id to a hit dictionary
        """
        print("\tFirst pass scoring... ", )
        # todo: add support for other similarities
        res1 = self.__elastic.search(analyzed_query, self.__first_pass_field,
                                     num=self.__first_pass_num_docs,
                                     fields_return=self.__first_pass_fields_return)
        return res1

    def _second_pass_scoring(self, res1, scorer):
        """Returns second-pass scoring of documents.

        :param res1: first pass results
        :param scorer: scorer object; ``score_doc(doc_id)`` is called once
            per first-pass document
        :return: dictionary {doc_id: {"score": ..., "fields": ...}} where
            "fields" is copied from the first-pass hit ({} when absent)
        """
        print("\tSecond pass scoring... ", )
        res2 = {
            doc_id: {"score": scorer.score_doc(doc_id),
                     "fields": hit.get("fields", {})}
            for doc_id, hit in res1.items()
        }
        print("done")
        return res2

    def retrieve(self, query, scorer=None):
        """Scores documents for the given query.

        :param query: raw query string; it is analyzed before searching
        :param scorer: optional scorer object; when omitted one is built with
            Scorer.get_scorer from the analyzed query and the config
        :return: first-pass results when no model is configured, second-pass
            results otherwise
        """
        query = self.__elastic.analyze_query(query)

        # 1st pass retrieval
        res1 = self._first_pass_scoring(query)
        if self.__model is None:
            return res1

        # 2nd pass retrieval
        scorer = scorer if scorer else Scorer.get_scorer(self.__elastic, query, self.__config)
        return self._second_pass_scoring(res1, scorer)

    def batch_retrieval(self):
        """Scores queries in a batch and outputs results.

        Reads the JSON query file, scores the queries in sorted query-id
        order and writes their TREC-formatted rankings to the output file.
        """
        with open(self.__query_file) as query_f:
            queries = json.load(query_f)

        # Opening in "w" mode truncates the file; the context manager closes
        # it even when retrieval fails for one of the queries.
        with open(self.__output_file, "w") as out:
            # retrieves documents
            for query_id in sorted(queries):
                print("scoring [" + query_id + "] " + queries[query_id])
                results = self.retrieve(queries[query_id])
                out.write(self.trec_format(results, query_id, self.__num_docs))
        print("Output file:", self.__output_file)

    def trec_format(self, results, query_id, max_rank=100):
        """Outputs results in TREC format.

        :param results: mapping from doc id to either a plain score or a hit
            dictionary holding the score under "score" (the shape produced by
            _second_pass_scoring)
        :param query_id: query id written in the first column
        :param max_rank: maximum number of lines to emit
        :return: string with one "qid Q0 doc_id rank score run_id" line
            (tab-separated) per document, best score first
        """
        def score_of(hit):
            # Hit dictionaries cannot be ordered directly; rank them by their
            # "score" entry.
            return hit["score"] if isinstance(hit, dict) else hit

        run_id = str(self.__run_id)
        ranked = sorted(results.items(), key=lambda item: score_of(item[1]),
                        reverse=True)
        lines = []
        for rank, (doc_id, hit) in enumerate(ranked, start=1):
            if rank > max_rank:
                break
            lines.append("\t".join([query_id, "Q0", doc_id, str(rank),
                                    str(score_of(hit)), run_id]) + "\n")
        return "".join(lines)
Ejemplo n.º 4
0
class Retrieval(object):
    """Two-pass document retrieval driven by a configuration dictionary.

    Configuration keys read by this class (defaults are set by check_config):
    ::
        {
            "index_name": name of the index (required),
            "first_pass": {
                "1st_num_docs": number of first-pass documents (default: 1000)
                "field": first-pass field (default: Elastic.FIELD_CATCHALL)
                "fields_return": fields to return for each hit (default: "")
                "model": first-pass model (default: Elastic.BM25)
            },
            "start": default 0
            "num_docs": number of documents written per query (default: 100)
            "model": "bm25" returns the first-pass results unchanged; for
                     lm, mlm and prms smoothing defaults are filled in
            "smoothing_method": default ScorerLM.DIRICHLET
            "smoothing_param": default 2000 (dirichlet) or 0.1 (jm)
            "fields": a field name (lm), a {field: weight} dictionary (mlm)
                      or a list of fields (prms)
            "query_file": JSON query file used by batch_retrieval,
            "output_file": output file used by batch_retrieval,
            "run_id": label in the last TREC column (default: the model name)
        }
    """
    FIELDED_MODELS = {"mlm", "prms"}
    LM_MODELS = {"lm", "mlm", "prms"}

    def __init__(self, config):
        """Validate `config` (filling in defaults in place) and connect to the index."""
        self.check_config(config)
        self.__config = config
        self.__index_name = config["index_name"]
        self.__first_pass_num_docs = int(config["first_pass"]["1st_num_docs"])
        self.__first_pass_field = config["first_pass"]["field"]
        self.__first_pass_fields_return = config["first_pass"]["fields_return"]
        self.__first_pass_model = config["first_pass"]["model"]
        self.__start = int(config["start"])
        self.__model = config.get("model", None)
        self.__num_docs = int(config["num_docs"])
        self.__query_file = config.get("query_file", None)
        self.__output_file = config.get("output_file", None)
        # Never leave the run id as None, otherwise trec_format has no label
        # to write in the last column.
        self.__run_id = (config.get("run_id") or self.__model
                         or str(self.__first_pass_model))

        self.__elastic = ElasticCache(self.__index_name)

    @staticmethod
    def check_config(config):
        """Checks config parameters and sets default values.

        `config` is modified in place. Any exception raised during the check
        is logged and the process exits with status 1.
        """
        try:
            if config.get("index_name") is None:
                raise Exception("index_name is missing")

            # Checks first pass parameters
            if config.get("first_pass") is None:
                config["first_pass"] = {}
            first_pass = config["first_pass"]
            if first_pass.get("1st_num_docs") is None:
                first_pass["1st_num_docs"] = 1000
            if first_pass.get("field") is None:
                first_pass["field"] = Elastic.FIELD_CATCHALL
            if first_pass.get("fields_return") is None:
                first_pass["fields_return"] = ""
            if first_pass.get("model") is None:
                first_pass["model"] = Elastic.BM25

            if config.get("start") is None:
                config["start"] = 0
            if config.get("num_docs") is None:
                config["num_docs"] = 100

            model = config.get("model")
            if model in Retrieval.LM_MODELS:
                if config.get("smoothing_method") is None:
                    config["smoothing_method"] = ScorerLM.DIRICHLET
                if config.get("smoothing_param") is None:
                    if config["smoothing_method"] == ScorerLM.DIRICHLET:
                        config["smoothing_param"] = 2000
                    elif config["smoothing_method"] == ScorerLM.JM:
                        config["smoothing_param"] = 0.1
                    else:
                        raise Exception("Smoothing method is not supported.")

            # The default shape of "fields" depends on the model: a single
            # name (lm), a weight dictionary (mlm) or a list (prms).
            if config.get("fields") is None:
                if model == "lm":
                    config["fields"] = Elastic.FIELD_CATCHALL
                elif model == "mlm":
                    config["fields"] = {"similar_entity_names": 0.2, "catchall": 0.8}
                elif model == "prms":
                    config["fields"] = [Elastic.FIELD_CATCHALL]
        except Exception as e:
            # Build the message explicitly: passing `e` as an extra argument
            # without a placeholder loses it under %-style log formatting.
            PLOGGER.error("Error in config file: " + str(e))
            sys.exit(1)

    def __get_fields(self):
        """Returns the name of all fields that will be used in the retrieval model.

        :return: list of field names; empty when no fields are configured
        """
        fields = self.__config.get("fields")
        if fields is None:
            return []
        if isinstance(fields, str):
            return [fields]
        # Iterating a {field: weight} dictionary yields its field names.
        return list(fields)

    def _first_pass_scoring(self, analyzed_query):
        """Returns first-pass scoring of documents.

        :param analyzed_query: analyzed query
        :return: whatever ElasticCache.search returns; the rest of the class
            treats it as a mapping from doc id to a hit dictionary
        """
        PLOGGER.debug("\tFirst pass scoring... ", )
        res1 = self.__elastic.search(analyzed_query, self.__first_pass_field,
                                     num=self.__first_pass_num_docs,
                                     fields_return=self.__first_pass_fields_return)
        return res1

    def _second_pass_scoring(self, res1, scorer):
        """Returns second-pass scoring of documents.

        :param res1: first pass results
        :param scorer: scorer object; ``score_doc(doc_id)`` is called once
            per first-pass document
        :return: dictionary {doc_id: {"score": ..., "fields": ...}} where
            "fields" is copied from the first-pass hit ({} when absent)
        """
        PLOGGER.debug("\tSecond pass scoring... ", )
        doc_ids = list(res1.keys())
        # Request the term vectors of all documents per field up front;
        # presumably this warms the ElasticCache before scoring -- TODO confirm.
        for field in self.__get_fields():
            self.__elastic.multi_termvector(doc_ids, field)

        res2 = {
            doc_id: {"score": scorer.score_doc(doc_id),
                     "fields": res1[doc_id].get("fields", {})}
            for doc_id in doc_ids
        }
        PLOGGER.debug("done")
        return res2

    def retrieve(self, query, scorer=None):
        """Scores documents for the given query.

        :param query: raw query string; it is analyzed before searching
        :param scorer: optional scorer object; when omitted one is built with
            Scorer.get_scorer from the analyzed query and the config
        :return: first-pass results when the model is "bm25", second-pass
            results otherwise
        """
        query = self.__elastic.analyze_query(query)

        # 1st pass retrieval
        res1 = self._first_pass_scoring(query)
        if self.__model == "bm25":
            return res1

        # 2nd pass retrieval
        scorer = scorer if scorer else Scorer.get_scorer(self.__elastic, query, self.__config)
        return self._second_pass_scoring(res1, scorer)

    def batch_retrieval(self):
        """Scores queries in a batch and outputs results.

        Reads the JSON query file, scores the queries in sorted query-id
        order and writes their TREC-formatted rankings to the output file.
        """
        with open(self.__query_file) as query_f:
            queries = json.load(query_f)

        # Opening in "w" mode truncates the file; the context manager closes
        # it even when retrieval fails for one of the queries.
        with open(self.__output_file, "w") as out:
            # retrieves documents
            for query_id in sorted(queries):
                PLOGGER.info("scoring [" + query_id + "] " + queries[query_id])
                results = self.retrieve(queries[query_id])
                out.write(self.trec_format(results, query_id, self.__num_docs))
        PLOGGER.info("Output file:" + self.__output_file)

    def trec_format(self, results, query_id, max_rank=100):
        """Outputs results in TREC format.

        :param results: mapping from doc id to a hit dictionary holding the
            score under "score"
        :param query_id: query id written in the first column
        :param max_rank: maximum number of lines to emit
        :return: string with one "qid Q0 doc_id rank score run_id" line
            (tab-separated) per document, best score first
        """
        run_id = str(self.__run_id)
        ranked = sorted(results.items(), key=lambda item: item[1]["score"],
                        reverse=True)
        lines = []
        for rank, (doc_id, hit) in enumerate(ranked, start=1):
            if rank > max_rank:
                break
            lines.append("\t".join([query_id, "Q0", doc_id, str(rank),
                                    str(hit["score"]), run_id]) + "\n")
        return "".join(lines)