Example #1
    def __init__(self, config):
        self.check_config(config)
        self.__config = config
        self.__index_name = config["index_name"]
        self.__first_pass_num_docs = int(config["first_pass"]["num_docs"])
        self.__first_pass_field = config["first_pass"]["field"]
        self.__first_pass_fields_return = config["first_pass"]["fields_return"]
        self.__first_pass_model = config["first_pass"]["model"]
        self.__start = int(config["start"])
        self.__model = config.get("model", None)
        self.__num_docs = int(config.get("num_docs", None))
        self.__query_file = config.get("query_file", None)
        self.__output_file = config.get("output_file", None)
        self.__run_id = config.get("run_id", self.__model)

        self.__elastic = ElasticCache(self.__index_name)
Example #2
    def __type_centric(self, query):
        """Type-centric TTI.

        :param query: query string
        :type query: str
        """
        types = dict()
        model = self.__config.get("model", TTI_MODEL_BM25)
        elastic = ElasticCache(
            self.__tc_config.get("index", DEFAULT_TTI_TC_INDEX))

        if model == TTI_MODEL_BM25:
            PLOGGER.info("TTI, TC, BM25")
            self.__tc_config["model"] = "bm25"
            # scorer = Scorer.get_scorer(elastic, query, self.__tc_config)
            types = Retrieval(self.__tc_config).retrieve(query)

        elif model == TTI_MODEL_LM:
            PLOGGER.debug("TTI, TC, LM")
            self.__tc_config["model"] = "lm"  # Needed for 2nd-pass
            self.__tc_config["field"] = "content"  # Needed for 2nd-pass
            self.__tc_config["second_pass"] = {"field": "content"}
            for param in ["smoothing_method", "smoothing_param"]:
                if self.__config.get(param, None) is not None:
                    self.__tc_config["second_pass"][param] = self.__config.get(
                        param)

            scorer = Scorer.get_scorer(elastic, query, self.__tc_config)
            types = Retrieval(self.__tc_config).retrieve(query, scorer)

            PLOGGER.info(types)

        return types
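
For reference, a hypothetical sketch of what the second-pass config assembled in the LM branch above ends up looking like; only the keys visible in the snippet are shown, and the smoothing values are placeholders rather than project defaults.

# Hypothetical shape of __tc_config after the TTI_MODEL_LM branch runs;
# the smoothing values are placeholders copied over only if set in the main config.
tc_config_example = {
    "model": "lm",                        # needed for 2nd-pass
    "field": "content",                   # needed for 2nd-pass
    "second_pass": {
        "field": "content",
        "smoothing_method": "dirichlet",  # placeholder
        "smoothing_param": 2000,          # placeholder
    },
}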
Example #3
    def gen_train_set(gt, query_file, train_set):
        """Trains LTR model for entity linking."""
        entity, elastic, fcache = Entity(), ElasticCache(
            ELASTIC_INDICES[0]), FeatureCache()
        inss = Instances()
        positive_annots = set()

        # Adds groundtruth instances (positive instances)
        PLOGGER.info("Adding groundtruth instances (positive instances) ....")
        for item in sorted(gt):  # qid, query, en_id, mention
            ltr = LTR(Query(item[1], item[0]), entity, elastic, fcache)
            ins = ltr.__gen_raw_ins(item[2], item[3])
            ins.features = ltr.get_features(ins)
            ins.target = 1
            inss.add_instance(ins)
            positive_annots.add((item[0], item[2]))

        # Adds all other instances
        PLOGGER.info("Adding all other instances (negative instances) ...")
        for qid, q in sorted(json.load(open(query_file, "r")).items()):
            PLOGGER.info("Query [" + qid + "]")
            ltr = LTR(Query(q, qid), entity, elastic, fcache)
            q_inss = ltr.get_candidate_inss()
            for ins in q_inss.get_all():
                if (qid, ins.get_property("en_id")) in positive_annots:
                    continue
                ins.target = 0
                inss.add_instance(ins)
        inss.to_json(train_set)
Example #4
def main(args):
    config = FileUtils.load_config(args.config)
    elastic_term = ElasticCache(config["text_index"])
    lambdas = config.get("lambdas", [0.9, 0.1])

    queries = json.load(open(config["query_file"], "r"))
    mappings = json.load(open(config["mapping_file"], "r"))
    annots = load_annot(config["annot_file"])
    run = load_run(config["run_file"])

    instances = Instances()
    # gets the results
    out_file = open(config["output_file"], "w")
    qid_int = 0
    for qid, query in sorted(queries.items()):
        print("Scoring ", qid, "...")
        results, libsvm_str = {}, ""
        query_len = len(elastic_term.analyze_query(query).split())
        scorer = ScorerELR(ElasticCache(config["uri_index"]), annots[qid],
                           query_len, lambdas)
        for doc_id, p_T_d in sorted(run[qid].items()):
            query_mappings = get_mapping_query(annots[qid], mappings)
            p_E_d = scorer.score_doc(doc_id, query_mappings)
            properties = {
                'doc_id': doc_id,
                'query': query,
                'qid': qid,
                'qid_int': qid_int
            }
            features = {'p_T_d': p_T_d, 'p_E_d': p_E_d}
            ins = Instance(qid + "_" + doc_id,
                           features=features,
                           properties=properties)
            instances.add_instance(ins)
            # libsvm_str += ins.to_libsvm(qid_prop="qid_int")
            results[doc_id] = (lambdas[0] * p_T_d) + (lambdas[1] * p_E_d)
        qid_int += 1

        # Write trec format
        out_str = trec_format(results, qid, "elr")
        out_file.write(out_str)

    out_file.close()
    print("Output file:", config["output_file"])
    instances.to_json(config["json_file"])
    print("Output file:", config["json_file"])
Example #5
    def __get_scorer(self, query):
        """Factory method to get entity retrieval method."""
        model = self.__config.get("model", None)
        if model == "elr":
            scorer = ELR(self.__config)
        else:  # from core.retrieval
            elastic = ElasticCache(self.__config["index_name"])
            scorer = Scorer.get_scorer(elastic, query, self.__config)
        return scorer
Example #6
File: er.py Project: zxlzr/nordlys
def main(args):
    config = FileUtils.load_config(args.config)
    er = ER(config, ElasticCache(DBPEDIA_INDEX))

    if args.query:
        res = er.retrieve(args.query)
        pprint(res)
    else:
        er.batch_retrieval()
Example #7
def main(args):
    conf = FileUtils.load_config(args.config)
    el = EL(conf, Entity(), ElasticCache(DBPEDIA_INDEX), FeatureCache())

    if conf.get("gen_model", False):
        LTR.train(conf)
    elif args.query:
        res = el.link(args.query)
        pprint(res)
    else:
        el.batch_linking()
Example #8
def main(args):
    entities = load_entities(args.input, args.th)
    mapper = FieldMapping(ElasticCache("dbpedia_2015_10_uri"), args.n)
    mappings = {}
    i = 0
    for en_id in entities:
        mappings[en_id] = mapper.map(en_id)
        i += 1
        if i % 10 == 0:
            print(i, "entities processed!")

    input_file = args.input[:args.input.rfind(".")]
    out_file = input_file + "_mapping" + ".json"
    json.dump(mappings, open(out_file, "w"), indent=4, sort_keys=True)
    print("Output file:", out_file)
Example #9
    def __init__(self,
                 index_name,
                 retr_model,
                 retr_params,
                 num_docs=None,
                 field="content",
                 run_id="fusion",
                 num_objs=100,
                 assoc_mode=FusionScorer.ASSOC_MODE_BINARY,
                 assoc_file=None):
        """

        :param index_name: name of index
        :param assoc_file: document-object association file
        :param assoc_mode: document-object weight mode, uniform or binary
        :param retr_model: the retrieval model; valid values: "lm", "bm25"
        :param retr_params: config including smoothing method and parameter
        :param num_objs: the number of ranked objects for a query
        :param assoc_mode: the fusion weights, which could be binary or uniform
        :param assoc_file: object-doc association file
        """
        super(LateFusionScorer, self).__init__(index_name,
                                               association_file=assoc_file,
                                               run_id=run_id)
        self.__config = {
            "index_name": self._index_name,
            "first_pass": {
                "num_docs": num_docs,
                "field": field
            },
        }
        self._field = field
        self._num_docs = num_docs
        self._model = retr_model
        self._params = retr_params
        self._assoc_mode = assoc_mode
        self._num = num_objs
        self._elastic = ElasticCache(self._index_name)
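
A minimal instantiation sketch for the constructor above; the index name, run ID, and retrieval parameters are placeholders, not project defaults.

# Hypothetical instantiation of LateFusionScorer; "toy_index" and the parameter
# values below are placeholders.
scorer = LateFusionScorer("toy_index",
                          retr_model="lm",
                          retr_params={"smoothing_method": "dirichlet",
                                       "smoothing_param": 2000},
                          num_docs=100,
                          field="content",
                          run_id="fusion_run",
                          num_objs=50,
                          assoc_mode=FusionScorer.ASSOC_MODE_BINARY,
                          assoc_file=None)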
Example #10
class FeaturesTermStats():
    CONTENT_FIELD = "content"
    PROF_FIELD = "professions"
    K_VALUES = [10, 50, 100, 200, 500, 1000]
    STOPWORDS = [
        "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
        "your", "yours", "yourself", "yourselves", "he", "him", "his",
        "himself", "she", "her", "hers", "herself", "it", "its", "itself",
        "they", "them", "their", "theirs", "themselves", "what", "which",
        "who", "whom", "this", "that", "these", "those", "am", "is", "are",
        "was", "were", "be", "been", "being", "have", "has", "had", "having",
        "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if",
        "or", "because", "as", "until", "while", "of", "at", "by", "for",
        "with", "about", "against", "between", "into", "through", "during",
        "before", "after", "above", "below", "to", "from", "up", "down", "in",
        "out", "on", "off", "over", "under", "again", "further", "then",
        "once", "here", "there", "when", "where", "why", "how", "all", "any",
        "both", "each", "few", "more", "most", "other", "some", "such", "no",
        "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s",
        "t", "can", "will", "just", "don", "should", "now"
    ]
    MAX_K = max(K_VALUES)

    def __init__(self, index_name=WP_ST_INDEX_ID):
        self.__elastic = ElasticCache(index_name)
        self.__stats = None

    def load_termstats(self, input_file):
        """load term statistics from file"""
        self.__stats = {}
        with FileUtils.open_file_by_type(input_file) as f_in:
            rank = 0
            last_prof = None
            for line in f_in:
                prof, term, tf, df, tfidf = line.strip().split("\t")
                if prof != last_prof:
                    rank = 0
                    last_prof = prof
                rank += 1
                if term in self.STOPWORDS:  # filter stopwords
                    continue
                if term.startswith("fb_"):  # filter entity terms
                    continue
                if prof not in self.__stats:
                    self.__stats[prof] = {}
                self.__stats[prof][term] = {
                    "tf": int(tf),
                    "df": int(df),
                    "tfidf": float(tfidf),
                    "rank": rank
                }

    def get_person_tf(self, person_id):
        """
        Get aggregated TF for a person
        :param person_id: dict with TFs
        :return:
        """
        doc_ids = self.__elastic.search(person_id,
                                        self.CONTENT_FIELD,
                                        num=10000).keys()
        print(person_id, "with", len(doc_ids), "sentences")
        tf_agg = {}
        for doc_id in doc_ids:
            tv = self.__elastic.get_termvector(
                doc_id, self.CONTENT_FIELD)  # , term_stats=True)
            for t, val in tv.items():
                tf_agg[t] = tf_agg.get(t, 0) + val["term_freq"]
        return tf_agg, len(doc_ids)

    def generate_features(self, kb_file, output_file):
        """Generating features related to term statistics"""

        fout = open(output_file, "w")

        # write tsv header
        header = ["person", "profession"]
        for k in self.K_VALUES:
            header.append("sumProfTerms_" + str(k))
            header.append("simCos_" + str(k))
        fout.write("\t".join(header) + "\n")

        with FileUtils.open_file_by_type(kb_file) as kb_f:
            for line in kb_f:
                person_id, prof_id = line.strip().split("\t")
                values = [person_id, prof_id]

                person_tf, num_sent = self.get_person_tf(person_id)

                for k in self.K_VALUES:
                    # we take top-K profession terms

                    # Compute sumProfTerms: \sum_{t \in T_k(pr)} \sum_{s \in S(pe)} tf(t,s) w(t,pr),
                    # where w(t,pr) = TFIDF(t,pr) = \frac{\sum_{s \in S(pr)} tf(t,s)}{doc\_len(pr)} \cdot \log\frac{N}{df(t)}
                    # (same tf-idf definition as ProfStats.compute_tf_idf)
                    sum_prof_terms = 0
                    for term, tf in person_tf.items():
                        pt = self.__stats.get(prof_id, {}).get(term, {})
                        if pt.get("rank",
                                  100000) > k:  # skip term if not in top-K
                            continue
                        sum_prof_terms += tf * pt.get("tfidf", 0)
                    values.append(str(sum_prof_terms))

                    # compute simCosK
                    # where K is the top-K terms for the profession
                    vec_pr = []  # construct prof vector
                    vec_pe = []  # construct person vector

                    if prof_id in self.__stats:
                        for term, s in self.__stats[prof_id].items():
                            if s["rank"] <= k:
                                vec_pr.append(s["tfidf"])
                                idf = s["tfidf"] / s[
                                    "tf"]  # we back-generate IDF from profession's TF-IDF
                                vec_pe.append(person_tf.get(term, 0) * idf)
                        cos = cos_sim(vec_pr, vec_pe)
                    else:
                        cos = 0  # in some exceptional cases the profession does not have any sentences
                    values.append(str(cos))

                fout.write("\t".join(values) + "\n")
                print(values)

        fout.close()
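
The simCos feature above relies on a cos_sim helper that is not shown here. Assuming it is a standard cosine similarity over two equal-length vectors, a self-contained sketch:

import math


def cos_sim(vec_a, vec_b):
    """Standard cosine similarity; assumed behavior of the cos_sim used above."""
    dot = sum(a * b for a, b in zip(vec_a, vec_b))
    norm_a = math.sqrt(sum(a * a for a in vec_a))
    norm_b = math.sqrt(sum(b * b for b in vec_b))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)


# Toy profession and person vectors over the same top-K terms
print(cos_sim([0.5, 0.2, 0.1], [3.0, 0.0, 1.0]))  # ~0.92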
Example #11
class Retrieval(object):
    """Loads config file, checks params, and sets default values.

    :param config: retrieval config (JSON config file or a dictionary) of the shape:
    ::
        {
            "index_name": name of the index,
            "first_pass": {
                "num_docs": number of documents in first-pass scoring (default: 1000)
                "field": field used in first pass retrieval (default: Elastic.FIELD_CATCHALL)
                "fields_return": comma-separated list of fields to return for each hit (default: "")
            },
            "num_docs": number of documents to return (default: 100)
            "start": starting offset for ranked documents (default:0)
            "model": name of retrieval model; accepted values: [lm, mlm, prms] (default: lm)
            "field": field name for LM (default: catchall)
            "fields": list of fields for PRMS (default: [catchall])
            "field_weights": dictionary with fields and corresponding weights for MLM (default: {catchall: 1})
            "smoothing_method": accepted values: [jm, dirichlet] (default: dirichlet)
            "smoothing_param": value of lambda or mu; accepted values: [float or "avg_len"],
                                (jm default: 0.1, dirichlet default: 2000)

            "query_file": name of query file (JSON),
            "output_file": name of output file,
            "run_id": run id for TREC output
        }
    """
    FIELDED_MODELS = {"mlm", "prms"}
    LM_MODELS = {"lm", "mlm", "prms"}

    def __init__(self, config):
        self.check_config(config)
        self.__config = config
        self.__index_name = config["index_name"]
        self.__first_pass_num_docs = int(config["first_pass"]["num_docs"])
        self.__first_pass_field = config["first_pass"]["field"]
        self.__first_pass_fields_return = config["first_pass"]["fields_return"]
        self.__first_pass_model = config["first_pass"]["model"]
        self.__start = int(config["start"])
        self.__model = config.get("model", None)
        self.__num_docs = int(config.get("num_docs", None))
        self.__query_file = config.get("query_file", None)
        self.__output_file = config.get("output_file", None)
        self.__run_id = config.get("run_id", self.__model)

        self.__elastic = ElasticCache(self.__index_name)

    @staticmethod
    def check_config(config):
        """Checks config parameters and sets default values."""
        try:
            if config.get("index_name", None) is None:
                raise Exception("index_name is missing")

            # Checks first pass parameters
            if config.get("first_pass", None) is None:
                config["first_pass"] = {}
            if config["first_pass"].get("num_docs", None) is None:
                config["first_pass"]["num_docs"] = 1000
            if config["first_pass"].get("field", None) is None:
                config["first_pass"]["field"] = Elastic.FIELD_CATCHALL
            if config["first_pass"].get("fields_return", None) is None:
                config["first_pass"]["fields_return"] = ""
            if config["first_pass"].get("model", None) is None:
                config["first_pass"]["model"] = Elastic.BM25

            if config.get("start", None) is None:
                config["start"] = 0
            if config.get("num_docs", None) is None:
                config["num_docs"] = 100

            if config.get("model", None) is None:
                config["model"] = None
            if config.get("field", None) is None:
                config["field"] = Elastic.FIELD_CATCHALL
            if config.get("fields", None) is None:
                config["fields"] = [Elastic.FIELD_CATCHALL]
            if config.get("field_weights", None) is None:
                config["field_weights"] = {Elastic.FIELD_CATCHALL: 1}
            if config["model"] in Retrieval.LM_MODELS:
                if config.get("smoothing_method", None) is None:
                    config["smoothing_method"] = ScorerLM.DIRICHLET
                if config.get("smoothing_param", None) is None:
                    if config["smoothing_method"] == ScorerLM.DIRICHLET:
                        config["smoothing_param"] = 2000
                    elif config["smoothing_method"] == ScorerLM.JM:
                        config["smoothing_param"] = 0.1
                    else:
                        raise Exception("Smoothing method is not supported.")
        except Exception as e:
            print("Error in config file: ", e)
            sys.exit(1)

    def _first_pass_scoring(self, analyzed_query):
        """Returns first-pass scoring of documents.

        :param analyzed_query: analyzed query
        :return: RetrievalResults object
        """
        print("\tFirst pass scoring... ", )
        # todo: add support for other similarities
        # body = {"query": {
        #     "bool": {
        #         "should": [
        #             {"match": {
        #                 "catchall": {
        #                     "query": analyzed_query
        #                 }}},
        #             {"match": {
        #                 "names": {
        #                     "query": analyzed_query,
        #                     "boost": 3
        #                 }}}]}}}
        # self.__elastic.update_similarity(self.__first_pass_model, self.__first_pass_model_params)
        res1 = self.__elastic.search(analyzed_query, self.__first_pass_field, num=self.__first_pass_num_docs,
                                     fields_return=self.__first_pass_fields_return)
        # res1 = self.__elastic.search_complex(body=body, num=self.__first_pass_num_docs,
        #                              fields_return=self.__first_pass_fields_return)
        return res1

    def _second_pass_scoring(self, res1, scorer):
        """Returns second-pass scoring of documents.

        :param res1: first pass results
        :param scorer: scorer object
        :return: RetrievalResults object
        """
        print("\tSecond pass scoring... ", )
        res2 = {}
        for doc_id in res1.keys():
            res2[doc_id] = {"score": scorer.score_doc(doc_id), "fields": res1[doc_id].get("fields", {})}
        print("done")
        return res2

    def retrieve(self, query, scorer=None):
        """Scores documents for the given query."""
        query = self.__elastic.analyze_query(query)

        # 1st pass retrieval
        res1 = self._first_pass_scoring(query)
        if self.__model is None:
            return res1

        # 2nd pass retrieval
        scorer = scorer if scorer else Scorer.get_scorer(self.__elastic, query, self.__config)
        res2 = self._second_pass_scoring(res1, scorer)
        return res2

    def batch_retrieval(self):
        """Scores queries in a batch and outputs results."""
        queries = json.load(open(self.__query_file))

        # init output file
        open(self.__output_file, "w").write("")
        out = open(self.__output_file, "w")

        # retrieves documents
        for query_id in sorted(queries):
            print("scoring [" + query_id + "] " + queries[query_id])
            results = self.retrieve(queries[query_id])
            out.write(self.trec_format(results, query_id, self.__num_docs))
        out.close()
        print("Output file:", self.__output_file)

    def trec_format(self, results, query_id, max_rank=100):
        """Outputs results in TREC format"""
        out_str = ""
        rank = 1
        for doc_id, score in sorted(results.items(), key=lambda x: x[1]["score"], reverse=True):
            if rank > max_rank:
                break
            out_str += query_id + "\tQ0\t" + doc_id + "\t" + str(rank) + "\t" + str(score["score"]) + "\t" + self.__run_id + "\n"
            rank += 1
        return out_str
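
A minimal usage sketch matching the config shape documented in the class docstring above; the index name and query are placeholders and assume an existing Elasticsearch index.

# Hypothetical usage of Retrieval; "toy_index" and the query are placeholders.
config = {
    "index_name": "toy_index",
    "first_pass": {"num_docs": 1000, "field": "catchall", "fields_return": ""},
    "model": "lm",
    "field": "catchall",
    "smoothing_method": "dirichlet",
    "smoothing_param": 2000,
    "num_docs": 100,
    "start": 0,
    "run_id": "lm_run",
}
results = Retrieval(config).retrieve("example query")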
Example #12
from flask import Flask, jsonify

from nordlys.core.retrieval.elastic_cache import ElasticCache  # assumed module path
from nordlys.logic.entity.entity import Entity
from nordlys.logic.features.feature_cache import FeatureCache
from nordlys.services.el import EL
from nordlys.services.er import ER
from nordlys.services.tti import TTI

from nordlys.core.utils.logging_utils import RequestHandler
from nordlys.core.utils.Api_handler import API_Handler
import logging, traceback
from time import strftime
from nordlys.config import LOGGING_PATH, PLOGGER, ELASTIC_INDICES, Api_Log_Path

# Variables
DBPEDIA_INDEX = ELASTIC_INDICES[0]
__entity = Entity()
__elastic = ElasticCache(DBPEDIA_INDEX)
__fcache = FeatureCache()
app = Flask(__name__)


def error(str):
    """
    @todo complete error handling

    :param str:
    :return:
    """
    res = {"ERROR": str}
    return jsonify(**res)

Example #13
        mapping_probs = {}
        if denominator > 0:  # if the term is present in the collection
            for f in numerators:
                mapping_probs[f] = numerators[f] / denominator
                if self.SCORER_DEBUG:
                    print("\t\tf= {}\tt= {}\tp(f|t)= {}/{} = {}".format(
                        f, t, numerators[f], sum(numerators.values()), mapping_probs[f]))

        return mapping_probs

    def get_total_field_freq(self):
        """Returns total occurrences of all fields"""
        if self.total_field_freq is None:
            total_field_freq = 0
            for f in self._fields:
                total_field_freq += self._elastic.doc_count(f)
            self.total_field_freq = total_field_freq
        return self.total_field_freq


if __name__ == "__main__":
    query = "gonna friends"
    doc_id = "4"
    es = ElasticCache("toy_index")
    params = {"fields": "content",
              "__fields": {"title": 0.2, "content": 0.8},
              "__fields": ["content", "title"]
              }
    score = ScorerPRMS(es, query, params).score_doc(doc_id)
    print(score)
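
A toy numeric illustration of the field-mapping probability computed above, where p(f|t) is the field's share of the summed numerators (the numbers are made up):

# Toy PRMS field-mapping probabilities: p(f|t) = numerators[f] / denominator.
numerators = {"title": 2.0, "content": 6.0}  # made-up per-field values for a term t
denominator = sum(numerators.values())       # 8.0
mapping_probs = {f: numerators[f] / denominator for f in numerators}
print(mapping_probs)  # {'title': 0.25, 'content': 0.75}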
Example #14
class ProfStats():
    CONTENT_FIELD = "content"
    PROF_FIELD = "professions"
    K = 30000  # keep top-K profession terms

    def __init__(self, index_name=WP_ST_INDEX_ID):
        self.__elastic = ElasticCache(index_name)

    def gen_stats(self, prof, output_file):
        """Writes the stats into the file."""
        print("\tgetting term frequencies ...")
        tf, df = self.get_tf_agg(prof)
        # print("\tgetting document frequencies ... (", len(tf.keys()), "terms)")
        # df2 = self.get_df(tf.keys())
        print("\tcomputing tf-idf ...")
        tf_idf = self.compute_tf_idf(tf, df)

        out_str = ""
        i = 0
        for t, tfidf in sorted(tf_idf.items(),
                               key=lambda x: x[1],
                               reverse=True):
            out_str += prof + "\t" + t + "\t" + str(tf[t]) + "\t" + str(
                df[t]) + "\t" + str(tfidf) + "\n"
            i += 1
            if i == self.K:  # Only print top-k terms
                break
        open(output_file, "a").write(out_str)
        return

    def compute_tf_idf(self, tf, df):
        """Computes tf.idf = (tf/doc_len) * (log n(docs)/df)

        :param tf: dictionary of tf for all terms
        :param df: dictionary of df for all terms
        :return: dictionary of tf.idf scores
        """
        tf_idf = {}
        prof_doc_len = sum(tf.values())
        for t in tf.keys():
            normalized_tf = tf[t] / prof_doc_len
            n_docs = self.__elastic.num_docs()
            idf = math.log(n_docs / df[t])
            tf_idf[t] = normalized_tf * idf
        return tf_idf

    def get_df(self, terms):
        """Returns document frequency for all terms."""
        df = {}
        for t in terms:
            df[t] = self.__elastic.doc_freq(t, field=self.CONTENT_FIELD)
        return df

    def get_tf_agg(self, prof):
        """Given a list of ids to get all their tf_idf in a dictionary."""
        size = 1000
        tf_agg = {}
        df = {}
        # doc_ids = self.__elastic.search(prof, self.PROF_FIELD, num=size).keys()
        doc_ids = self.__elastic.search_scroll(prof,
                                               field=self.PROF_FIELD,
                                               num=size).keys()
        print(len(doc_ids), "sentences")
        for i, doc_id in enumerate(doc_ids):
            tv = self.__elastic.get_termvector(doc_id,
                                               self.CONTENT_FIELD,
                                               term_stats=True)
            for t, val in tv.items():
                tf_agg[t] = tf_agg.get(t, 0) + val["term_freq"]
                if t not in df:
                    df[t] = val["doc_freq"]
        return tf_agg, df
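
A worked toy example of the tf.idf formula implemented in compute_tf_idf above, tf.idf = (tf / doc_len) * log(n_docs / df); all counts are made up.

import math

# Toy numbers mirroring ProfStats.compute_tf_idf.
tf = {"physicist": 40, "quantum": 10}      # made-up aggregated term frequencies
df = {"physicist": 1000, "quantum": 500}   # made-up document frequencies
n_docs = 100000                            # made-up index size
doc_len = sum(tf.values())                 # 50

tf_idf = {t: (tf[t] / doc_len) * math.log(n_docs / df[t]) for t in tf}
print(tf_idf)  # physicist: 0.8 * ln(100) ~ 3.68, quantum: 0.2 * ln(200) ~ 1.06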
Example #15
class FeaturesTermStats():
    CONTENT_FIELD = "content"
    STOPWORDS = [
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
        "in", "into", "is", "it", "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these", "they", "this", "to",
        "was", "will", "with"
    ]

    def __init__(self, index_name=WP_ST_INDEX_ID):
        self.__elastic = ElasticCache(index_name)
        self.__stats = None

    def get_per_nat_tf(self, person_id, nats):
        """
        Compute freqPerNat: \frac{|\{s : pe \in s, nt \in s\}|}{|S(pe)|}
        :param person_id:
        :param nats: nationality+adj, e.g. Germany, German
        :return: freqPerNat
        """

        body = {"query": {"bool": {"must": {"term": {"content": person_id}}}}}

        doc_ids = self.__elastic.search_complex(body,
                                                self.CONTENT_FIELD,
                                                num=10000).keys()
        n_s_pe = len(doc_ids)  # number of sentences containing person
        # print(n_s_pe)
        noun = nats[0]
        noun_query = self.__elastic.analyze_query(noun)

        body = {
            "query": {
                "bool": {
                    "must": [{
                        "match": {
                            "content": person_id
                        }
                    }, {
                        "match_phrase": {
                            "content": noun_query
                        }
                    }]
                }
            }
        }

        doc_ids_noun = self.__elastic.search_complex(body,
                                                     self.CONTENT_FIELD,
                                                     num=10000).keys()
        n_co_noun = len(doc_ids_noun)
        # print("Noun", n_co_noun)
        adj = nats[1]
        adj_query = self.__elastic.analyze_query(adj)

        body = {
            "query": {
                "bool": {
                    "must": [{
                        "match": {
                            "content": person_id
                        }
                    }, {
                        "match_phrase": {
                            "content": adj_query
                        }
                    }]
                }
            }
        }
        doc_ids_adj = self.__elastic.search_complex(body,
                                                    self.CONTENT_FIELD,
                                                    num=10000).keys()
        n_co_adj = len(doc_ids_adj)
        # print("Adj", n_co_adj)

        if n_s_pe == 0:
            return 0.0, 0.0
        else:
            return n_co_noun / n_s_pe, n_co_adj / n_s_pe

    def generate_features(self, kb_file, output_file1, output_file2):
        """Generate features of freq-person-nationality"""

        fout1 = open(output_file1, "w")
        fout2 = open(output_file2, "w")

        # write tsv header
        header = ["person", "nationality", "freq_person_nationality_noun"]
        fout1.write("\t".join(header) + "\n")
        header = ["person", "nationality", "freq_person_nationality_adj"]
        fout2.write("\t".join(header) + "\n")

        with FileUtils.open_file_by_type(kb_file) as kb_f:
            line_count = 1
            for line in kb_f:
                print(line_count)
                line_count += 1
                person_id, nat_id, noun, adj = line.strip().split("\t")
                values_noun = [person_id, nat_id]
                values_adj = [person_id, nat_id]
                nats = [noun, adj]
                fpn_noun, fpn_adj = self.get_per_nat_tf(person_id, nats)
                values_noun.append(str(fpn_noun))
                values_adj.append(str(fpn_adj))
                fout1.write("\t".join(values_noun) + "\n")
                fout2.write("\t".join(values_adj) + "\n")
        fout1.close()
        fout2.close()
Example #16
    def __init__(self, index_name=WP_ST_INDEX_ID):
        self.__elastic = ElasticCache(index_name)
        self.__stats = None
Example #17
class FeaturesW2VSim(object):
    """Implements our simCosW2V feature, i.e., the cosine similarity between the profession and person vectors, \
    where the profession (resp. person) vector is the centroid of TFIDF-weighted word2vec vectors of top-K profession \
    (resp. person) terms.
    """

    # Formula for feature computation:
    #
    # $cos(\vec{t}^{w2v}_{pe, k}, \vec{t}^{w2v}_{pr, k})$, where for item $\in \{pe, pr\}$:
    # $$\vec{t}^{w2v}_{item, k} = \sum_{t \in T_k(item)} w(t, item) w2v(t)$$
    # (note that using these unnormalized sums in the computation of $cos()$ is equivalent to using the actual centroids).

    CONTENT_FIELD = "content"
    PROF_FIELD = "professions"
    K_VALUES = [10, 50, 100, 200, 500, 1000]
    MAX_K = max(K_VALUES)

    def __init__(self, index_name=WP_ST_INDEX_ID):
        self.__elastic = ElasticCache(index_name)
        self.__stats = None

    def load_termstats(self, input_file):
        self.__stats = {}
        with FileUtils.open_file_by_type(input_file) as f_in:
            rank = 0
            last_prof = None
            for line in f_in:
                prof, term, tf, df, tfidf = line.strip().split("\t")
                if prof != last_prof:
                    rank = 0
                    last_prof = prof
                rank += 1
                if term in STOPWORDS:  # filter stopwords
                    continue
                if term.startswith("fb_"):  # filter entity terms
                    continue
                if prof not in self.__stats:
                    self.__stats[prof] = {}
                self.__stats[prof][term] = {
                    "tf": int(tf),
                    "df": int(df),
                    "tfidf": float(tfidf),
                    "rank": rank
                }

    def get_person_tf(self, person_id):
        """Get aggregated TF for a person.

        :param person_id: dict with TFs.
        :return:
        """
        doc_ids = self.__elastic.search(person_id,
                                        self.CONTENT_FIELD,
                                        num=10000).keys()

        tf_agg = {}
        for doc_id in doc_ids:
            tv = self.__elastic.get_termvector(
                doc_id, self.CONTENT_FIELD)  # , term_stats=True)
            for t, val in tv.items():
                tf_agg[t] = tf_agg.get(t, 0) + val["term_freq"]
        return tf_agg, len(doc_ids)

    def generate_features(self, kb_file, output_file):
        """Core function for generating into output_file the features, with person-item data from kb_file.

        :param kb_file: path to the file with person items (a '.kb'-extension file).
        :param output_file:
        :return:
        """
        feat_w2v_approx = FeaturesW2VSimApprox()

        with open(output_file, "w") as f_out:
            # write tsv header
            header = ["person_id", "prof_id"]
            for k in self.K_VALUES:
                header.append("simCos_w2v_" + str(k))
            f_out.write("\t".join(header) + "\n")

            for line in FileUtils.read_file_as_list(kb_file):
                person_id, prof_id = line.split(
                    "\t")  # strip() done in read_file_as_list()
                values = [person_id, prof_id]

                person_tf, num_sent = self.get_person_tf(person_id)

                for k in self.K_VALUES:
                    # we take top-K profession terms

                    # compute simCosK
                    # where K is the top-K terms for the profession
                    term_weights_pr = {
                    }  # dict from top-K profession terms to their tfidf weights
                    term_weights_pe = {
                    }  # dict from top-K person terms to their tfidf weights

                    if prof_id in self.__stats:
                        for term, s in self.__stats[prof_id].items():
                            if s["rank"] <= k:
                                term_weights_pr[term] = float(s["tfidf"])
                                idf = s["tfidf"] / s[
                                    "tf"]  # we back-generate IDF from profession's TF-IDF
                                term_weights_pe[term] = person_tf.get(term,
                                                                      0) * idf

                        vec_pr = feat_w2v_approx.get_vector(term_weights_pr)
                        vec_pe = feat_w2v_approx.get_vector(term_weights_pe)
                        cos = cos_sim(vec_pr, vec_pe)
                    else:
                        cos = 0  # in some exceptional cases the profession does not have any sentences
                    values.append(str(cos))

                f_out.write("\t".join(values) + "\n")
Example #18
class Retrieval(object):
    FIELDED_MODELS = {"mlm", "prms"}
    LM_MODELS = {"lm", "mlm", "prms"}

    def __init__(self, config):
        self.check_config(config)
        self.__config = config
        self.__index_name = config["index_name"]
        self.__first_pass_num_docs = int(config["first_pass"]["1st_num_docs"])
        self.__first_pass_field = config["first_pass"]["field"]
        self.__first_pass_fields_return = config["first_pass"]["fields_return"]
        self.__first_pass_model = config["first_pass"]["model"]
        self.__start = int(config["start"])
        self.__model = config.get("model", None)
        self.__num_docs = int(config.get("num_docs", None))
        self.__query_file = config.get("query_file", None)
        self.__output_file = config.get("output_file", None)
        self.__run_id = config.get("run_id", self.__model)

        self.__elastic = ElasticCache(self.__index_name)

    @staticmethod
    def check_config(config):
        """Checks config parameters and sets default values."""
        try:
            if config.get("index_name", None) is None:
                raise Exception("index_name is missing")

            # Checks first pass parameters
            if config.get("first_pass", None) is None:
                config["first_pass"] = {}
            if config["first_pass"].get("1st_num_docs", None) is None:
                config["first_pass"]["1st_num_docs"] = 1000
            if config["first_pass"].get("field", None) is None:
                config["first_pass"]["field"] = Elastic.FIELD_CATCHALL
            if config["first_pass"].get("fields_return", None) is None:
                config["first_pass"]["fields_return"] = ""
            if config["first_pass"].get("model", None) is None:
                config["first_pass"]["model"] = Elastic.BM25

            if config.get("start", None) is None:
                config["start"] = 0
            if config.get("num_docs", None) is None:
                config["num_docs"] = 100

            if config.get("model", None) in Retrieval.LM_MODELS:
                if config.get("smoothing_method", None) is None:
                    config["smoothing_method"] = ScorerLM.DIRICHLET
                if config.get("smoothing_param", None) is None:
                    if config["smoothing_method"] == ScorerLM.DIRICHLET:
                        config["smoothing_param"] = 2000
                    elif config["smoothing_method"] == ScorerLM.JM:
                        config["smoothing_param"] = 0.1
                    else:
                        raise Exception("Smoothing method is not supported.")

            if config.get("model", None) == "lm":
                if config.get("fields", None) is None:
                    config["fields"] = Elastic.FIELD_CATCHALL
            if config.get("model", None) == "mlm":
                if config.get("fields", None) is None:
                    config["fields"] = {"similar_entity_names": 0.2, "catchall": 0.8}
            if config.get("model", None) == "prms":
                if config.get("fields", None) is None:
                    config["fields"] = [Elastic.FIELD_CATCHALL]
        except Exception as e:
            PLOGGER.error("Error in config file: %s", e)
            sys.exit(1)

    def __get_fields(self):
        """Returns the name of all fields that will be used in the retrieval model."""
        fields = []
        if type(self.__config["fields"]) == str:
            fields.append(self.__config["fields"])
        elif type(self.__config["fields"]) == dict:
            fields = self.__config["fields"].keys()
        else:
            fields = self.__config["fields"]
        return fields


    def _first_pass_scoring(self, analyzed_query):
        """Returns first-pass scoring of documents.

        :param analyzed_query: analyzed query
        :return: RetrievalResults object
        """
        PLOGGER.debug("\tFirst pass scoring... ", )
        res1 = self.__elastic.search(analyzed_query, self.__first_pass_field, num=self.__first_pass_num_docs,
                                     fields_return=self.__first_pass_fields_return)
        return res1

    def _second_pass_scoring(self, res1, scorer):
        """Returns second-pass scoring of documents.

        :param res1: first pass results
        :param scorer: scorer object
        :return: RetrievalResults object
        """
        PLOGGER.debug("\tSecond pass scoring... ", )
        for field in self.__get_fields():
            self.__elastic.multi_termvector(list(res1.keys()), field)

        res2 = {}
        for doc_id in res1.keys():
            res2[doc_id] = {"score": scorer.score_doc(doc_id), "fields": res1[doc_id].get("fields", {})}
        PLOGGER.debug("done")
        return res2

    def retrieve(self, query, scorer=None):
        """Scores documents for the given query."""
        query = self.__elastic.analyze_query(query)

        # 1st pass retrieval
        res1 = self._first_pass_scoring(query)
        if self.__model == "bm25":
            return res1

        # 2nd pass retrieval
        scorer = scorer if scorer else Scorer.get_scorer(self.__elastic, query, self.__config)
        res2 = self._second_pass_scoring(res1, scorer)
        return res2

    def batch_retrieval(self):
        """Scores queries in a batch and outputs results."""
        queries = json.load(open(self.__query_file))

        # init output file
        open(self.__output_file, "w").write("")
        out = open(self.__output_file, "w")

        # retrieves documents
        for query_id in sorted(queries):
            PLOGGER.info("scoring [" + query_id + "] " + queries[query_id])
            results = self.retrieve(queries[query_id])
            out.write(self.trec_format(results, query_id, self.__num_docs))
        out.close()
        PLOGGER.info("Output file:" + self.__output_file)

    def trec_format(self, results, query_id, max_rank=100):
        """Outputs results in TREC format"""
        out_str = ""
        rank = 1
        for doc_id, score in sorted(results.items(), key=lambda x: x[1]["score"], reverse=True):
            if rank > max_rank:
                break
            out_str += query_id + "\tQ0\t" + doc_id + "\t" + str(rank) + "\t" + str(score["score"]) + "\t" + self.__run_id + "\n"
            rank += 1
        return out_str
Example #19
    def __init__(self, query, retrieval_config):
        self.__query = query
        self.__retrieval_config = retrieval_config
        self.__elastic = ElasticCache(TC_INDEX)