class FeaturesTermStats(object):
    """Generates term-statistics features (sumProfTerms and simCos) for person-profession pairs."""

    CONTENT_FIELD = "content"
    PROF_FIELD = "professions"
    K_VALUES = [10, 50, 100, 200, 500, 1000]
    STOPWORDS = [
        "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
        "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself",
        "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which",
        "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be",
        "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an",
        "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by",
        "for", "with", "about", "against", "between", "into", "through", "during", "before",
        "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over",
        "under", "again", "further", "then", "once", "here", "there", "when", "where", "why",
        "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such",
        "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can",
        "will", "just", "don", "should", "now"
    ]
    MAX_K = max(K_VALUES)

    def __init__(self, index_name=WP_ST_INDEX_ID):
        self.__elastic = ElasticCache(index_name)
        self.__stats = None

    def load_termstats(self, input_file):
        """Loads term statistics from file."""
        self.__stats = {}
        with FileUtils.open_file_by_type(input_file) as f_in:
            rank = 0
            last_prof = None
            for line in f_in:
                prof, term, tf, df, tfidf = line.strip().split("\t")
                if prof != last_prof:
                    rank = 0
                    last_prof = prof
                rank += 1  # note: rank is assigned before filtering, so filtered terms still consume rank positions
                if term in self.STOPWORDS:  # filter stopwords
                    continue
                if term.startswith("fb_"):  # filter entity terms
                    continue
                if prof not in self.__stats:
                    self.__stats[prof] = {}
                self.__stats[prof][term] = {
                    "tf": int(tf),
                    "df": int(df),
                    "tfidf": float(tfidf),
                    "rank": rank
                }

    def get_person_tf(self, person_id):
        """Gets aggregated term frequencies for a person.

        :param person_id: person ID
        :return: (dict from terms to aggregated TFs, number of sentences)
        """
        doc_ids = self.__elastic.search(person_id, self.CONTENT_FIELD, num=10000).keys()
        print(person_id, "with", len(doc_ids), "sentences")
        tf_agg = {}
        for doc_id in doc_ids:
            tv = self.__elastic.get_termvector(doc_id, self.CONTENT_FIELD)  # term_stats=True may also be requested
            for t, val in tv.items():
                tf_agg[t] = tf_agg.get(t, 0) + val["term_freq"]
        return tf_agg, len(doc_ids)

    def generate_features(self, kb_file, output_file):
        """Generates features related to term statistics."""
        fout = open(output_file, "w")
        # write tsv header
        header = ["person", "profession"]
        for k in self.K_VALUES:
            header.append("sumProfTerms_" + str(k))
            header.append("simCos_" + str(k))
        fout.write("\t".join(header) + "\n")

        with FileUtils.open_file_by_type(kb_file) as kb_f:
            for line in kb_f:
                person_id, prof_id = line.strip().split("\t")
                values = [person_id, prof_id]
                person_tf, num_sent = self.get_person_tf(person_id)
                for k in self.K_VALUES:  # we take top-K profession terms
                    # Compute sumProfTerms:
                    #   sumProfTerms = \sum_{t \in T_k(pr)} \sum_{s \in S(pe)} tf(t,s) \cdot w(t,pr)
                    # where w(t,pr) = TFIDF(t,pr) = (\sum_{s \in S(pr)} tf(t,s)) \cdot IDF(t),
                    # i.e., the profession's aggregated TF times the term's IDF
                    # (hence IDF can be back-generated as tfidf / tf below).
                    sum_prof_terms = 0
                    for term, tf in person_tf.items():
                        pt = self.__stats.get(prof_id, {}).get(term, {})
                        if pt.get("rank", 100000) > k:  # skip term if not in top-K
                            continue
                        sum_prof_terms += tf * pt.get("tfidf", 0)
                    values.append(str(sum_prof_terms))

                    # Compute simCosK, where K is the number of top-ranked terms for the profession.
                    vec_pr = []  # profession vector
                    vec_pe = []  # person vector
                    if prof_id in self.__stats:
                        for term, s in self.__stats[prof_id].items():
                            if s["rank"] <= k:
                                vec_pr.append(s["tfidf"])
                                idf = s["tfidf"] / s["tf"]  # back-generate IDF from the profession's TF-IDF
                                vec_pe.append(person_tf.get(term, 0) * idf)
                        cos = cos_sim(vec_pr, vec_pe)
                    else:
                        cos = 0  # in some exceptional cases the profession does not have any sentences
                    values.append(str(cos))
                fout.write("\t".join(values) + "\n")
                print(values)
        fout.close()
class Retrieval(object):
    FIELDED_MODELS = {"mlm", "prms"}
    LM_MODELS = {"lm", "mlm", "prms"}

    def __init__(self, config):
        self.check_config(config)
        self.__config = config
        self.__index_name = config["index_name"]
        self.__first_pass_num_docs = int(config["first_pass"]["1st_num_docs"])
        self.__first_pass_field = config["first_pass"]["field"]
        self.__first_pass_fields_return = config["first_pass"]["fields_return"]
        self.__first_pass_model = config["first_pass"]["model"]
        self.__start = int(config["start"])
        self.__model = config.get("model", None)
        self.__num_docs = int(config.get("num_docs", None))
        self.__query_file = config.get("query_file", None)
        self.__output_file = config.get("output_file", None)
        self.__run_id = config.get("run_id", self.__model)
        self.__elastic = ElasticCache(self.__index_name)

    @staticmethod
    def check_config(config):
        """Checks config parameters and sets default values."""
        try:
            if config.get("index_name", None) is None:
                raise Exception("index_name is missing")
            # check first-pass parameters
            if config.get("first_pass", None) is None:
                config["first_pass"] = {}
            if config["first_pass"].get("1st_num_docs", None) is None:
                config["first_pass"]["1st_num_docs"] = 1000
            if config["first_pass"].get("field", None) is None:
                config["first_pass"]["field"] = Elastic.FIELD_CATCHALL
            if config["first_pass"].get("fields_return", None) is None:
                config["first_pass"]["fields_return"] = ""
            if config["first_pass"].get("model", None) is None:
                config["first_pass"]["model"] = Elastic.BM25
            if config.get("start", None) is None:
                config["start"] = 0
            if config.get("num_docs", None) is None:
                config["num_docs"] = 100
            if config.get("model", None) in Retrieval.LM_MODELS:
                if config.get("smoothing_method", None) is None:
                    config["smoothing_method"] = ScorerLM.DIRICHLET
                if config.get("smoothing_param", None) is None:
                    if config["smoothing_method"] == ScorerLM.DIRICHLET:
                        config["smoothing_param"] = 2000
                    elif config["smoothing_method"] == ScorerLM.JM:
                        config["smoothing_param"] = 0.1
                    else:
                        raise Exception("Smoothing method is not supported.")
            if config.get("model", None) == "lm":
                if config.get("fields", None) is None:
                    config["fields"] = Elastic.FIELD_CATCHALL
            if config.get("model", None) == "mlm":
                if config.get("fields", None) is None:
                    config["fields"] = {"similar_entity_names": 0.2, "catchall": 0.8}
            if config.get("model", None) == "prms":
                if config.get("fields", None) is None:
                    config["fields"] = [Elastic.FIELD_CATCHALL]
        except Exception as e:
            PLOGGER.error("Error in config file: " + str(e))
            sys.exit(1)

    def __get_fields(self):
        """Returns the names of all fields used by the retrieval model."""
        fields = []
        if type(self.__config["fields"]) == str:
            fields.append(self.__config["fields"])
        elif type(self.__config["fields"]) == dict:
            fields = list(self.__config["fields"].keys())
        else:
            fields = self.__config["fields"]
        return fields

    def _first_pass_scoring(self, analyzed_query):
        """Returns first-pass scoring of documents.

        :param analyzed_query: analyzed query
        :return: RetrievalResults object
        """
        PLOGGER.debug("\tFirst pass scoring... ")
        res1 = self.__elastic.search(analyzed_query, self.__first_pass_field,
                                     num=self.__first_pass_num_docs,
                                     fields_return=self.__first_pass_fields_return)
        return res1

    def _second_pass_scoring(self, res1, scorer):
        """Returns second-pass scoring of documents.

        :param res1: first-pass results
        :param scorer: scorer object
        :return: RetrievalResults object
        """
        PLOGGER.debug("\tSecond pass scoring... ")
        for field in self.__get_fields():  # bulk-fetch term vectors for all result docs
            self.__elastic.multi_termvector(list(res1.keys()), field)
        res2 = {}
        for doc_id in res1.keys():
            res2[doc_id] = {"score": scorer.score_doc(doc_id),
                            "fields": res1[doc_id].get("fields", {})}
        PLOGGER.debug("done")
        return res2

    def retrieve(self, query, scorer=None):
        """Scores documents for the given query."""
        query = self.__elastic.analyze_query(query)
        # 1st-pass retrieval
        res1 = self._first_pass_scoring(query)
        if self.__model == "bm25":
            return res1
        # 2nd-pass retrieval
        scorer = scorer if scorer else Scorer.get_scorer(self.__elastic, query, self.__config)
        res2 = self._second_pass_scoring(res1, scorer)
        return res2

    def batch_retrieval(self):
        """Scores queries in a batch and outputs results."""
        queries = json.load(open(self.__query_file))
        out = open(self.__output_file, "w")  # init output file
        # retrieve documents for each query
        for query_id in sorted(queries):
            PLOGGER.info("scoring [" + query_id + "] " + queries[query_id])
            results = self.retrieve(queries[query_id])
            out.write(self.trec_format(results, query_id, self.__num_docs))
        out.close()
        PLOGGER.info("Output file: " + self.__output_file)

    def trec_format(self, results, query_id, max_rank=100):
        """Outputs results in TREC format."""
        out_str = ""
        rank = 1
        for doc_id, score in sorted(results.items(), key=lambda x: x[1]["score"], reverse=True):
            if rank > max_rank:
                break
            out_str += query_id + "\tQ0\t" + doc_id + "\t" + str(rank) + "\t" \
                       + str(score["score"]) + "\t" + self.__run_id + "\n"
            rank += 1
        return out_str
class Retrieval(object):
    """Loads the config file, checks params, and sets default values.

    :param config: retrieval config (JSON config file or a dictionary) of the shape:

    ::

        {
            "index_name": name of the index,
            "first_pass": {
                "num_docs": number of documents in first-pass scoring (default: 1000)
                "field": field used in first-pass retrieval (default: Elastic.FIELD_CATCHALL)
                "fields_return": comma-separated list of fields to return for each hit (default: "")
            },
            "num_docs": number of documents to return (default: 100)
            "start": starting offset for ranked documents (default: 0)
            "model": name of retrieval model; accepted values: [lm, mlm, prms] (default: lm)
            "field": field name for LM (default: catchall)
            "fields": list of fields for PRMS (default: [catchall])
            "field_weights": dictionary with fields and corresponding weights for MLM (default: {catchall: 1})
            "smoothing_method": accepted values: [jm, dirichlet] (default: dirichlet)
            "smoothing_param": value of lambda or mu; accepted values: [float or "avg_len"]
                (jm default: 0.1, dirichlet default: 2000)
            "query_file": name of query file (JSON),
            "output_file": name of output file,
            "run_id": run id for TREC output
        }
    """
    FIELDED_MODELS = {"mlm", "prms"}
    LM_MODELS = {"lm", "mlm", "prms"}

    def __init__(self, config):
        self.check_config(config)
        self.__config = config
        self.__index_name = config["index_name"]
        self.__first_pass_num_docs = int(config["first_pass"]["num_docs"])
        self.__first_pass_field = config["first_pass"]["field"]
        self.__first_pass_fields_return = config["first_pass"]["fields_return"]
        self.__first_pass_model = config["first_pass"]["model"]
        self.__start = int(config["start"])
        self.__model = config.get("model", None)
        self.__num_docs = int(config.get("num_docs", None))
        self.__query_file = config.get("query_file", None)
        self.__output_file = config.get("output_file", None)
        self.__run_id = config.get("run_id", self.__model)
        self.__elastic = ElasticCache(self.__index_name)

    @staticmethod
    def check_config(config):
        """Checks config parameters and sets default values."""
        try:
            if config.get("index_name", None) is None:
                raise Exception("index_name is missing")
            # check first-pass parameters
            if config.get("first_pass", None) is None:
                config["first_pass"] = {}
            if config["first_pass"].get("num_docs", None) is None:
                config["first_pass"]["num_docs"] = 1000
            if config["first_pass"].get("field", None) is None:
                config["first_pass"]["field"] = Elastic.FIELD_CATCHALL
            if config["first_pass"].get("fields_return", None) is None:
                config["first_pass"]["fields_return"] = ""
            if config["first_pass"].get("model", None) is None:
                config["first_pass"]["model"] = Elastic.BM25
            if config.get("start", None) is None:
                config["start"] = 0
            if config.get("num_docs", None) is None:
                config["num_docs"] = 100
            if config.get("model", None) is None:
                config["model"] = None  # ensure the key exists for the checks below
            if config.get("field", None) is None:
                config["field"] = Elastic.FIELD_CATCHALL
            if config.get("fields", None) is None:
                config["fields"] = [Elastic.FIELD_CATCHALL]
            if config.get("field_weights", None) is None:
                config["field_weights"] = {Elastic.FIELD_CATCHALL: 1}
            if config["model"] in Retrieval.LM_MODELS:
                if config.get("smoothing_method", None) is None:
                    config["smoothing_method"] = ScorerLM.DIRICHLET
                if config.get("smoothing_param", None) is None:
                    if config["smoothing_method"] == ScorerLM.DIRICHLET:
                        config["smoothing_param"] = 2000
                    elif config["smoothing_method"] == ScorerLM.JM:
                        config["smoothing_param"] = 0.1
                    else:
                        raise Exception("Smoothing method is not supported.")
        except Exception as e:
            print("Error in config file: ", e)
            sys.exit(1)

    def _first_pass_scoring(self, analyzed_query):
        """Returns first-pass scoring of documents.

        :param analyzed_query: analyzed query
        :return: RetrievalResults object
        """
        print("\tFirst pass scoring... ")
        # todo: add support for other similarities
        # body = {"query": {
        #     "bool": {
        #         "should": [
        #             {"match": {
        #                 "catchall": {
        #                     "query": analyzed_query
        #                 }}},
        #             {"match": {
        #                 "names": {
        #                     "query": analyzed_query,
        #                     "boost": 3
        #                 }}}]}}}
        # self.__elastic.update_similarity(self.__first_pass_model, self.__first_pass_model_params)
        res1 = self.__elastic.search(analyzed_query, self.__first_pass_field,
                                     num=self.__first_pass_num_docs,
                                     fields_return=self.__first_pass_fields_return)
        # res1 = self.__elastic.search_complex(body=body, num=self.__first_pass_num_docs,
        #                                      fields_return=self.__first_pass_fields_return)
        return res1

    def _second_pass_scoring(self, res1, scorer):
        """Returns second-pass scoring of documents.

        :param res1: first-pass results
        :param scorer: scorer object
        :return: RetrievalResults object
        """
        print("\tSecond pass scoring... ")
        res2 = {}
        for doc_id in res1.keys():
            res2[doc_id] = {"score": scorer.score_doc(doc_id),
                            "fields": res1[doc_id].get("fields", {})}
        print("done")
        return res2

    def retrieve(self, query, scorer=None):
        """Scores documents for the given query."""
        query = self.__elastic.analyze_query(query)
        # 1st-pass retrieval
        res1 = self._first_pass_scoring(query)
        if self.__model is None:
            return res1
        # 2nd-pass retrieval
        scorer = scorer if scorer else Scorer.get_scorer(self.__elastic, query, self.__config)
        res2 = self._second_pass_scoring(res1, scorer)
        return res2

    def batch_retrieval(self):
        """Scores queries in a batch and outputs results."""
        queries = json.load(open(self.__query_file))
        out = open(self.__output_file, "w")  # init output file
        # retrieve documents for each query
        for query_id in sorted(queries):
            print("scoring [" + query_id + "] " + queries[query_id])
            results = self.retrieve(queries[query_id])
            out.write(self.trec_format(results, query_id, self.__num_docs))
        out.close()
        print("Output file:", self.__output_file)

    def trec_format(self, results, query_id, max_rank=100):
        """Outputs results in TREC format."""
        out_str = ""
        rank = 1
        # sort by the "score" entry of each result dict (sorting the raw dicts would fail in Python 3)
        for doc_id, score in sorted(results.items(), key=lambda x: x[1]["score"], reverse=True):
            if rank > max_rank:
                break
            out_str += query_id + "\tQ0\t" + doc_id + "\t" + str(rank) + "\t" \
                       + str(score["score"]) + "\t" + self.__run_id + "\n"
            rank += 1
        return out_str
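# ----------------------------------------------------------------------------
# Hypothetical batch-retrieval sketch for the class above; all paths are
# placeholder assumptions. batch_retrieval() expects query_file to be a JSON
# object mapping query IDs to query strings, e.g.:
#   {"q1": "barack obama", "q2": "marie curie"}
if __name__ == "__main__":
    example_config = {
        "index_name": "toy_index",                  # assumed to exist
        "model": "lm",
        "smoothing_method": "dirichlet",            # optional; this is the default
        "query_file": "data/queries.json",          # placeholder path
        "output_file": "output/lm_dirichlet.run",   # placeholder path
        "run_id": "lm_dirichlet",
    }
    Retrieval(example_config).batch_retrieval()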
class FeaturesW2VSim(object):
    """Implements our simCosW2V feature, i.e., the cosine similarity between the profession and person
    vectors, where the profession (resp. person) vector is the centroid of TFIDF-weighted word2vec
    vectors of top-K profession (resp. person) terms.
    """
    # Formula for feature computation:
    #
    #   $cos(\vec{t}^{w2v}_{pe, k}, \vec{t}^{w2v}_{pr, k})$, where for item $\in \{pe, pr\}$:
    #   $$\vec{t}^{w2v}_{item, k} = \sum_{t \in T_k(item)} w(t, item) \cdot w2v(t)$$
    #
    # (Note that using these unnormalized sums in the computation of $cos()$ is equivalent to using
    # the actual centroids.)

    CONTENT_FIELD = "content"
    PROF_FIELD = "professions"
    K_VALUES = [10, 50, 100, 200, 500, 1000]
    MAX_K = max(K_VALUES)

    def __init__(self, index_name=WP_ST_INDEX_ID):
        self.__elastic = ElasticCache(index_name)
        self.__stats = None

    def load_termstats(self, input_file):
        """Loads term statistics from file."""
        self.__stats = {}
        with FileUtils.open_file_by_type(input_file) as f_in:
            rank = 0
            last_prof = None
            for line in f_in:
                prof, term, tf, df, tfidf = line.strip().split("\t")
                if prof != last_prof:
                    rank = 0
                    last_prof = prof
                rank += 1  # note: rank is assigned before filtering, so filtered terms still consume rank positions
                if term in STOPWORDS:  # filter stopwords
                    continue
                if term.startswith("fb_"):  # filter entity terms
                    continue
                if prof not in self.__stats:
                    self.__stats[prof] = {}
                self.__stats[prof][term] = {
                    "tf": int(tf),
                    "df": int(df),
                    "tfidf": float(tfidf),
                    "rank": rank
                }

    def get_person_tf(self, person_id):
        """Gets aggregated term frequencies for a person.

        :param person_id: person ID
        :return: (dict from terms to aggregated TFs, number of sentences)
        """
        doc_ids = self.__elastic.search(person_id, self.CONTENT_FIELD, num=10000).keys()
        tf_agg = {}
        for doc_id in doc_ids:
            tv = self.__elastic.get_termvector(doc_id, self.CONTENT_FIELD)  # term_stats=True may also be requested
            for t, val in tv.items():
                tf_agg[t] = tf_agg.get(t, 0) + val["term_freq"]
        return tf_agg, len(doc_ids)

    def generate_features(self, kb_file, output_file):
        """Generates the features into output_file, using person-profession pairs from kb_file.

        :param kb_file: path to the file with person items (a '.kb'-extension file)
        :param output_file: path to the output TSV file
        :return:
        """
        feat_w2v_approx = FeaturesW2VSimApprox()
        with open(output_file, "w") as f_out:
            # write tsv header
            header = ["person_id", "prof_id"]
            for k in self.K_VALUES:
                header.append("simCos_w2v_" + str(k))
            f_out.write("\t".join(header) + "\n")

            for line in FileUtils.read_file_as_list(kb_file):
                person_id, prof_id = line.split("\t")  # strip() done in read_file_as_list()
                values = [person_id, prof_id]
                person_tf, num_sent = self.get_person_tf(person_id)
                for k in self.K_VALUES:  # we take top-K profession terms
                    # Compute simCosK, where K is the number of top-ranked terms for the profession.
                    term_weights_pr = {}  # dict from top-K profession terms to their tfidf weights
                    term_weights_pe = {}  # dict from top-K person terms to their tfidf weights
                    if prof_id in self.__stats:
                        for term, s in self.__stats[prof_id].items():
                            if s["rank"] <= k:
                                term_weights_pr[term] = float(s["tfidf"])
                                idf = s["tfidf"] / s["tf"]  # back-generate IDF from the profession's TF-IDF
                                term_weights_pe[term] = person_tf.get(term, 0) * idf
                        vec_pr = feat_w2v_approx.get_vector(term_weights_pr)
                        vec_pe = feat_w2v_approx.get_vector(term_weights_pe)
                        cos = cos_sim(vec_pr, vec_pe)
                    else:
                        cos = 0  # in some exceptional cases the profession does not have any sentences
                    values.append(str(cos))
                f_out.write("\t".join(values) + "\n")
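# ----------------------------------------------------------------------------
# Illustrative sketch of the FeaturesW2VSimApprox.get_vector() contract the
# class above depends on: an (unnormalized) TFIDF-weighted sum of word2vec
# vectors, which equals the centroid up to a constant factor that cancels in
# the cosine. The gensim dependency and the model path are assumptions; the
# repo's actual implementation may differ.
import numpy as np
from gensim.models import KeyedVectors


class FeaturesW2VSimApprox(object):
    def __init__(self, w2v_path="GoogleNews-vectors-negative300.bin"):  # placeholder path
        self.__w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)

    def get_vector(self, term_weights):
        """Returns the weighted sum of embeddings for a {term: weight} dict;
        out-of-vocabulary terms are skipped."""
        vec = np.zeros(self.__w2v.vector_size)
        for term, weight in term_weights.items():
            if term in self.__w2v:
                vec += weight * self.__w2v[term]
        return vec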