Ejemplo n.º 1
0
class FeaturesTermStats():
    CONTENT_FIELD = "content"
    PROF_FIELD = "professions"
    K_VALUES = [10, 50, 100, 200, 500, 1000]
    STOPWORDS = [
        "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
        "your", "yours", "yourself", "yourselves", "he", "him", "his",
        "himself", "she", "her", "hers", "herself", "it", "its", "itself",
        "they", "them", "their", "theirs", "themselves", "what", "which",
        "who", "whom", "this", "that", "these", "those", "am", "is", "are",
        "was", "were", "be", "been", "being", "have", "has", "had", "having",
        "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if",
        "or", "because", "as", "until", "while", "of", "at", "by", "for",
        "with", "about", "against", "between", "into", "through", "during",
        "before", "after", "above", "below", "to", "from", "up", "down", "in",
        "out", "on", "off", "over", "under", "again", "further", "then",
        "once", "here", "there", "when", "where", "why", "how", "all", "any",
        "both", "each", "few", "more", "most", "other", "some", "such", "no",
        "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s",
        "t", "can", "will", "just", "don", "should", "now"
    ]
    MAX_K = max(K_VALUES)

    def __init__(self, index_name=WP_ST_INDEX_ID):
        self.__elastic = ElasticCache(index_name)
        self.__stats = None

    def load_termstats(self, input_file):
        """load term statistics from file"""
        self.__stats = {}
        with FileUtils.open_file_by_type(input_file) as f_in:
            rank = 0
            last_prof = None
            for line in f_in:
                prof, term, tf, df, tfidf = line.strip().split("\t")
                if prof != last_prof:
                    rank = 0
                    last_prof = prof
                rank += 1
                if term in self.STOPWORDS:  # filter stopwords
                    continue
                if term.startswith("fb_"):  # filter entity terms
                    continue
                if prof not in self.__stats:
                    self.__stats[prof] = {}
                self.__stats[prof][term] = {
                    "tf": int(tf),
                    "df": int(df),
                    "tfidf": float(tfidf),
                    "rank": rank
                }

    def get_person_tf(self, person_id):
        """
        Get aggregated TF for a person
        :param person_id: dict with TFs
        :return:
        """
        doc_ids = self.__elastic.search(person_id,
                                        self.CONTENT_FIELD,
                                        num=10000).keys()
        print(person_id, "with", len(doc_ids), "sentences")
        tf_agg = {}
        for doc_id in doc_ids:
            tv = self.__elastic.get_termvector(
                doc_id, self.CONTENT_FIELD)  # , term_stats=True)
            for t, val in tv.items():
                tf_agg[t] = tf_agg.get(t, 0) + val["term_freq"]
        return tf_agg, len(doc_ids)

    def generate_features(self, kb_file, output_file):
        """Generating features related to term statistics"""

        fout = open(output_file, "w")

        # write tsv header
        header = ["person", "profession"]
        for k in self.K_VALUES:
            header.append("sumProfTerms_" + str(k))
            header.append("simCos_" + str(k))
        fout.write("\t".join(header) + "\n")

        with FileUtils.open_file_by_type(kb_file) as kb_f:
            for line in kb_f:
                person_id, prof_id = line.strip().split("\t")
                values = [person_id, prof_id]

                person_tf, num_sent = self.get_person_tf(person_id)

                for k in self.K_VALUES:
                    # we take top-K profession terms

                    # Compute sumProfTerms: \sum_{t \in T_k(pr)}\sum_{s \in S(pe)} tf(t,s) w(t,pr)
                    # where w(t,pe )= TFIDF(t,pr) = \frac{\sum_{s \in S(pr)} tf(t,s)}
                    sum_prof_terms = 0
                    for term, tf in person_tf.items():
                        pt = self.__stats.get(prof_id, {}).get(term, {})
                        if pt.get("rank",
                                  100000) > k:  # skip term if not in top-K
                            continue
                        sum_prof_terms += tf * pt.get("tfidf", 0)
                    values.append(str(sum_prof_terms))

                    # compute simCosK
                    # where K is the top-K terms for the profession
                    vec_pr = []  # construct prof vector
                    vec_pe = []  # construct person vector

                    if prof_id in self.__stats:
                        for term, s in self.__stats[prof_id].items():
                            if s["rank"] <= k:
                                vec_pr.append(s["tfidf"])
                                idf = s["tfidf"] / s[
                                    "tf"]  # we back-generate IDF from profession's TF-IDF
                                vec_pe.append(person_tf.get(term, 0) * idf)
                        cos = cos_sim(vec_pr, vec_pe)
                    else:
                        cos = 0  # in some exceptional cases the profession does not have any sentences
                    values.append(str(cos))

                fout.write("\t".join(values) + "\n")
                print(values)

        fout.close()
Ejemplo n.º 2
0
class Retrieval(object):
    FIELDED_MODELS = {"mlm", "prms"}
    LM_MODELS = {"lm", "mlm", "prms"}

    def __init__(self, config):
        self.check_config(config)
        self.__config = config
        self.__index_name = config["index_name"]
        self.__first_pass_num_docs = int(config["first_pass"]["1st_num_docs"])
        self.__first_pass_field = config["first_pass"]["field"]
        self.__first_pass_fields_return = config["first_pass"]["fields_return"]
        self.__first_pass_model = config["first_pass"]["model"]
        self.__start = int(config["start"])
        self.__model = config.get("model", None)
        self.__num_docs = int(config.get("num_docs", None))
        self.__query_file = config.get("query_file", None)
        self.__output_file = config.get("output_file", None)
        self.__run_id = config.get("run_id", self.__model)

        self.__elastic = ElasticCache(self.__index_name)

    @staticmethod
    def check_config(config):
        """Checks config parameters and sets default values."""
        try:
            if config.get("index_name", None) is None:
                raise Exception("index_name is missing")

            # Checks first pass parameters
            if config.get("first_pass", None) is None:
                config["first_pass"] = {}
            if config["first_pass"].get("1st_num_docs", None) is None:
                config["first_pass"]["1st_num_docs"] = 1000
            if config["first_pass"].get("field", None) is None:
                config["first_pass"]["field"] = Elastic.FIELD_CATCHALL
            if config["first_pass"].get("fields_return", None) is None:
                config["first_pass"]["fields_return"] = ""
            if config["first_pass"].get("model", None) is None:
                config["first_pass"]["model"] = Elastic.BM25

            if config.get("start", None) is None:
                config["start"] = 0
            if config.get("num_docs", None) is None:
                config["num_docs"] = 100

            if config.get("model", None) in Retrieval.LM_MODELS:
                if config.get("smoothing_method", None) is None:
                    config["smoothing_method"] = ScorerLM.DIRICHLET
                if config.get("smoothing_param", None) is None:
                    if config["smoothing_method"] == ScorerLM.DIRICHLET:
                        config["smoothing_param"] = 2000
                    elif config["smoothing_method"] == ScorerLM.JM:
                        config["smoothing_param"] = 0.1
                    else:
                        raise Exception("Smoothing method is not supported.")

            if config.get("model", None) == "lm":
                if config.get("fields", None) is None:
                    config["fields"] = Elastic.FIELD_CATCHALL
            if config.get("model", None) == "mlm":
                if config.get("fields", None) is None:
                    config["fields"] = {"similar_entity_names": 0.2, "catchall": 0.8}
            if config.get("model", None) == "prms":
                if config.get("fields", None) is None:
                    config["fields"] = [Elastic.FIELD_CATCHALL]
        except Exception as e:
            PLOGGER.error("Error in config file: ", e)
            sys.exit(1)

    def __get_fields(self):
        """Returns the name of all fields that will be used in the retrieval model."""
        fields = []
        if type(self.__config["fields"]) == str:
            fields.append(self.__config["fields"])
        elif type(self.__config["fields"]) == dict:
            fields = self.__config["fields"].keys()
        else:
            fields = self.__config["fields"]
        return fields


    def _first_pass_scoring(self, analyzed_query):
        """Returns first-pass scoring of documents.

        :param analyzed_query: analyzed query
        :return: RetrievalResults object
        """
        PLOGGER.debug("\tFirst pass scoring... ", )
        res1 = self.__elastic.search(analyzed_query, self.__first_pass_field, num=self.__first_pass_num_docs,
                                     fields_return=self.__first_pass_fields_return)
        return res1

    def _second_pass_scoring(self, res1, scorer):
        """Returns second-pass scoring of documents.

        :param res1: first pass results
        :param scorer: scorer object
        :return: RetrievalResults object
        """
        PLOGGER.debug("\tSecond pass scoring... ", )
        for field in self.__get_fields():
            self.__elastic.multi_termvector(list(res1.keys()), field)

        res2 = {}
        for doc_id in res1.keys():
            res2[doc_id] = {"score": scorer.score_doc(doc_id), "fields": res1[doc_id].get("fields", {})}
        PLOGGER.debug("done")
        return res2

    def retrieve(self, query, scorer=None):
        """Scores documents for the given query."""
        query = self.__elastic.analyze_query(query)

        # 1st pass retrieval
        res1 = self._first_pass_scoring(query)
        if self.__model == "bm25":
            return res1

        # 2nd pass retrieval
        scorer = scorer if scorer else Scorer.get_scorer(self.__elastic, query, self.__config)
        res2 = self._second_pass_scoring(res1, scorer)
        return res2

    def batch_retrieval(self):
        """Scores queries in a batch and outputs results."""
        queries = json.load(open(self.__query_file))

        # init output file
        open(self.__output_file, "w").write("")
        out = open(self.__output_file, "w")

        # retrieves documents
        for query_id in sorted(queries):
            PLOGGER.info("scoring [" + query_id + "] " + queries[query_id])
            results = self.retrieve(queries[query_id])
            out.write(self.trec_format(results, query_id, self.__num_docs))
        out.close()
        PLOGGER.info("Output file:" + self.__output_file)

    def trec_format(self, results, query_id, max_rank=100):
        """Outputs results in TREC format"""
        out_str = ""
        rank = 1
        for doc_id, score in sorted(results.items(), key=lambda x: x[1]["score"], reverse=True):
            if rank > max_rank:
                break
            out_str += query_id + "\tQ0\t" + doc_id + "\t" + str(rank) + "\t" + str(score["score"]) + "\t" + self.__run_id + "\n"
            rank += 1
        return out_str
Ejemplo n.º 3
0
class Retrieval(object):
    """Loads config file, checks params, and sets default values.

    :param config: retrieval config (JSON config file or a dictionary) of the shape:
    ::
        {
            "index_name": name of the index,
            "first_pass": {
                "num_docs": number of documents in first-pass scoring (default: 1000)
                "field": field used in first pass retrieval (default: Elastic.FIELD_CATCHALL)
                "fields_return": comma-separated list of fields to return for each hit (default: "")
            },
            "num_docs": number of documents to return (default: 100)
            "start": starting offset for ranked documents (default:0)
            "model": name of retrieval model; accepted values: [lm, mlm, prms] (default: lm)
            "field": field name for LM (default: catchall)
            "fields": list of fields for PRMS (default: [catchall])
            "field_weights": dictionary with fields and corresponding weights for MLM (default: {catchall: 1})
            "smoothing_method": accepted values: [jm, dirichlet] (default: dirichlet)
            "smoothing_param": value of lambda or mu; accepted values: [float or "avg_len"],
                                (jm default: 0.1, dirichlet default: 2000)

            "query_file": name of query file (JSON),
            "output_file": name of output file,
            "run_id": run id for TREC output
        }
    """
    FIELDED_MODELS = {"mlm", "prms"}
    LM_MODELS = {"lm", "mlm", "prms"}

    def __init__(self, config):
        self.check_config(config)
        self.__config = config
        self.__index_name = config["index_name"]
        self.__first_pass_num_docs = int(config["first_pass"]["num_docs"])
        self.__first_pass_field = config["first_pass"]["field"]
        self.__first_pass_fields_return = config["first_pass"]["fields_return"]
        self.__first_pass_model = config["first_pass"]["model"]
        self.__start = int(config["start"])
        self.__model = config.get("model", None)
        self.__num_docs = int(config.get("num_docs", None))
        self.__query_file = config.get("query_file", None)
        self.__output_file = config.get("output_file", None)
        self.__run_id = config.get("run_id", self.__model)

        self.__elastic = ElasticCache(self.__index_name)

    @staticmethod
    def check_config(config):
        """Checks config parameters and sets default values."""
        try:
            if config.get("index_name", None) is None:
                raise Exception("index_name is missing")

            # Checks first pass parameters
            if config.get("first_pass", None) is None:
                config["first_pass"] = {}
            if config["first_pass"].get("num_docs", None) is None:
                config["first_pass"]["num_docs"] = 1000
            if config["first_pass"].get("field", None) is None:
                config["first_pass"]["field"] = Elastic.FIELD_CATCHALL
            if config["first_pass"].get("fields_return", None) is None:
                config["first_pass"]["fields_return"] = ""
            if config["first_pass"].get("model", None) is None:
                config["first_pass"]["model"] = Elastic.BM25

            if config.get("start", None) is None:
                config["start"] = 0
            if config.get("num_docs", None) is None:
                config["num_docs"] = 100

            if config.get("model", None) is None:
                config["model"] = None
            if config.get("field", None) is None:
                config["field"] = Elastic.FIELD_CATCHALL
            if config.get("fields", None) is None:
                config["fields"] = [Elastic.FIELD_CATCHALL]
            if config.get("field_weights", None) is None:
                config["field_weights"] = {Elastic.FIELD_CATCHALL: 1}
            if config["model"] in Retrieval.LM_MODELS:
                if config.get("smoothing_method", None) is None:
                    config["smoothing_method"] = ScorerLM.DIRICHLET
                if config.get("smoothing_param", None) is None:
                    if config["smoothing_method"] == ScorerLM.DIRICHLET:
                        config["smoothing_param"] = 2000
                    elif config["smoothing_method"] == ScorerLM.JM:
                        config["smoothing_param"] = 0.1
                    else:
                        raise Exception("Smoothing method is not supported.")
        except Exception as e:
            print("Error in config file: ", e)
            sys.exit(1)

    def _first_pass_scoring(self, analyzed_query):
        """Returns first-pass scoring of documents.

        :param analyzed_query: analyzed query
        :return: RetrievalResults object
        """
        print("\tFirst pass scoring... ", )
        # todo: add support for other similarities
        # body = {"query": {
        #     "bool": {
        #         "should": [
        #             {"match": {
        #                 "catchall": {
        #                     "query": analyzed_query
        #                 }}},
        #             {"match": {
        #                 "names": {
        #                     "query": analyzed_query,
        #                     "boost": 3
        #                 }}}]}}}
        # self.__elastic.update_similarity(self.__first_pass_model, self.__first_pass_model_params)
        res1 = self.__elastic.search(analyzed_query, self.__first_pass_field, num=self.__first_pass_num_docs,
                                     fields_return=self.__first_pass_fields_return)
        # res1 = self.__elastic.search_complex(body=body, num=self.__first_pass_num_docs,
        #                              fields_return=self.__first_pass_fields_return)
        return res1

    def _second_pass_scoring(self, res1, scorer):
        """Returns second-pass scoring of documents.

        :param res1: first pass results
        :param scorer: scorer object
        :return: RetrievalResults object
        """
        print("\tSecond pass scoring... ", )
        res2 = {}
        for doc_id in res1.keys():
            res2[doc_id] = {"score": scorer.score_doc(doc_id), "fields": res1[doc_id].get("fields", {})}
        print("done")
        return res2

    def retrieve(self, query, scorer=None):
        """Scores documents for the given query."""
        query = self.__elastic.analyze_query(query)

        # 1st pass retrieval
        res1 = self._first_pass_scoring(query)
        if self.__model is None:
            return res1

        # 2nd pass retrieval
        scorer = scorer if scorer else Scorer.get_scorer(self.__elastic, query, self.__config)
        res2 = self._second_pass_scoring(res1, scorer)
        return res2

    def batch_retrieval(self):
        """Scores queries in a batch and outputs results."""
        queries = json.load(open(self.__query_file))

        # init output file
        open(self.__output_file, "w").write("")
        out = open(self.__output_file, "w")

        # retrieves documents
        for query_id in sorted(queries):
            print("scoring [" + query_id + "] " + queries[query_id])
            results = self.retrieve(queries[query_id])
            out.write(self.trec_format(results, query_id, self.__num_docs))
        out.close()
        print("Output file:", self.__output_file)

    def trec_format(self, results, query_id, max_rank=100):
        """Outputs results in TREC format"""
        out_str = ""
        rank = 1
        for doc_id, score in sorted(results.items(), key=lambda x: x[1], reverse=True):
            if rank > max_rank:
                break
            out_str += query_id + "\tQ0\t" + doc_id + "\t" + str(rank) + "\t" + str(score) + "\t" + self.__run_id + "\n"
            rank += 1
        return out_str
Ejemplo n.º 4
0
class FeaturesW2VSim(object):
    """Implements our simCosW2V feature, i.e., the cosine similarity between the profession and person vectors, \
    where the profession (resp. person) vector is the centroid of TFIDF-weighted word2vec vectors of top-K profession \
    (resp. person) terms.
    """

    # Formula for feature computation:
    #
    # $cos(\vec{t}^{w2v}_{pe, k}, \vec{t}^{w2v}_{pr, k})$, where for item $\in \{pe, pr\}$:
    # $$\vec{t}^{w2v}_{item, k} = \sum_{t \in T_k(item)} w(t, item) w2v(t)$$
    # (note that using these unnormalized sums in the computation of $cos()$ is equivalent to use the actual centroids).

    CONTENT_FIELD = "content"
    PROF_FIELD = "professions"
    K_VALUES = [10, 50, 100, 200, 500, 1000]
    MAX_K = max(K_VALUES)

    def __init__(self, index_name=WP_ST_INDEX_ID):
        self.__elastic = ElasticCache(index_name)
        self.__stats = None

    def load_termstats(self, input_file):
        self.__stats = {}
        with FileUtils.open_file_by_type(input_file) as f_in:
            rank = 0
            last_prof = None
            for line in f_in:
                prof, term, tf, df, tfidf = line.strip().split("\t")
                if prof != last_prof:
                    rank = 0
                    last_prof = prof
                rank += 1
                if term in STOPWORDS:  # filter stopwords
                    continue
                if term.startswith("fb_"):  # filter entity terms
                    continue
                if prof not in self.__stats:
                    self.__stats[prof] = {}
                self.__stats[prof][term] = {
                    "tf": int(tf),
                    "df": int(df),
                    "tfidf": float(tfidf),
                    "rank": rank
                }

    def get_person_tf(self, person_id):
        """Get aggregated TF for a person.

        :param person_id: dict with TFs.
        :return:
        """
        doc_ids = self.__elastic.search(person_id,
                                        self.CONTENT_FIELD,
                                        num=10000).keys()

        tf_agg = {}
        for doc_id in doc_ids:
            tv = self.__elastic.get_termvector(
                doc_id, self.CONTENT_FIELD)  # , term_stats=True)
            for t, val in tv.items():
                tf_agg[t] = tf_agg.get(t, 0) + val["term_freq"]
        return tf_agg, len(doc_ids)

    def generate_features(self, kb_file, output_file):
        """Core function for generating into output_file the features, with person-item data from kb_file.

        :param kb_file: path to the file with person items (a '.kb'-extension file).
        :param output_file:
        :return:
        """
        feat_w2v_approx = FeaturesW2VSimApprox()

        with open(output_file, "w") as f_out:
            # write tsv header
            header = ["person_id", "prof_id"]
            for k in self.K_VALUES:
                header.append("simCos_w2v_" + str(k))
            f_out.write("\t".join(header) + "\n")

            for line in FileUtils.read_file_as_list(kb_file):
                person_id, prof_id = line.split(
                    "\t")  # strip() done in read_file_as_list()
                values = [person_id, prof_id]

                person_tf, num_sent = self.get_person_tf(person_id)

                for k in self.K_VALUES:
                    # we take top-K profession terms

                    # compute simCosK
                    # where K is the top-K terms for the profession
                    term_weights_pr = {
                    }  # dict from top-K profession terms to their tfidf weights
                    term_weights_pe = {
                    }  # dict from top-K person terms to their tfidf weights

                    if prof_id in self.__stats:
                        for term, s in self.__stats[prof_id].items():
                            if s["rank"] <= k:
                                term_weights_pr[term] = float(s["tfidf"])
                                idf = s["tfidf"] / s[
                                    "tf"]  # we back-generate IDF from profession's TF-IDF
                                term_weights_pe[term] = person_tf.get(term,
                                                                      0) * idf

                        vec_pr = feat_w2v_approx.get_vector(term_weights_pr)
                        vec_pe = feat_w2v_approx.get_vector(term_weights_pe)
                        cos = cos_sim(vec_pr, vec_pe)
                    else:
                        cos = 0  # in some exceptional cases the profession does not have any sentences
                    values.append(str(cos))

                f_out.write("\t".join(values) + "\n")