def __init__(self, config):
    """Validates the configuration and caches its values on the instance.

    :param config: retrieval configuration dictionary. It is first passed to
        ``check_config`` and then read key by key, so every key accessed with
        ``[]`` below must be present once ``check_config`` returns.
    """
    self.check_config(config)
    self.__config = config
    self.__index_name = config["index_name"]

    # First-pass settings are grouped in their own sub-dictionary.
    first_pass = config["first_pass"]
    self.__first_pass_num_docs = int(first_pass["num_docs"])
    self.__first_pass_field = first_pass["field"]
    self.__first_pass_fields_return = first_pass["fields_return"]
    self.__first_pass_model = first_pass["model"]

    # Top-level retrieval settings.
    self.__start = int(config["start"])
    self.__model = config.get("model")
    self.__num_docs = int(config.get("num_docs"))
    self.__query_file = config.get("query_file")
    self.__output_file = config.get("output_file")
    # The run id falls back to the model name when not given explicitly.
    self.__run_id = config.get("run_id", self.__model)

    self.__elastic = ElasticCache(self.__index_name)
def __type_centric(self, query):
    """Type-centric TTI.

    Depending on the configured model, the query is scored either with the
    first-pass BM25 retrieval only, or with an LM scorer in a second pass.
    For any other model value an empty dictionary is logged and returned.

    :param query: query string
    :type query: str
    :return: the result of ``Retrieval.retrieve`` for the chosen model
    """
    selected_model = self.__config.get("model", TTI_MODEL_BM25)
    elastic = ElasticCache(
        self.__tc_config.get("index", DEFAULT_TTI_TC_INDEX))
    types = dict()

    if selected_model == TTI_MODEL_BM25:
        PLOGGER.info("TTI, TC, BM25")
        self.__tc_config["model"] = "bm25"
        types = Retrieval(self.__tc_config).retrieve(query)
    elif selected_model == TTI_MODEL_LM:
        PLOGGER.debug("TTI, TC, LM")
        # Both settings are needed for the 2nd pass.
        self.__tc_config["model"] = "lm"
        self.__tc_config["field"] = "content"
        second_pass = {"field": "content"}
        self.__tc_config["second_pass"] = second_pass
        # Smoothing settings are copied over only when explicitly configured.
        for param in ["smoothing_method", "smoothing_param"]:
            value = self.__config.get(param, None)
            if value is not None:
                second_pass[param] = value
        scorer = Scorer.get_scorer(elastic, query, self.__tc_config)
        types = Retrieval(self.__tc_config).retrieve(query, scorer)

    PLOGGER.info(types)
    return types
def gen_train_set(gt, query_file, train_set):
    """Generates the LTR training instances for entity linking.

    Ground-truth annotations become positive instances (target 1); every
    other candidate instance generated for a query becomes a negative
    instance (target 0). The collected instances are written with
    ``Instances.to_json``.

    :param gt: iterable of ground-truth items, indexed as
        (qid, query, en_id, mention)
    :param query_file: path to a JSON file mapping query ids to queries
    :param train_set: output location handed to ``Instances.to_json``
    """
    entity = Entity()
    elastic = ElasticCache(ELASTIC_INDICES[0])
    fcache = FeatureCache()
    inss = Instances()
    positive_annots = set()

    # Adds groundtruth instances (positive instances)
    PLOGGER.info("Adding groundtruth instances (positive instances) ....")
    for item in sorted(gt):  # qid, query, en_id, mention
        ltr = LTR(Query(item[1], item[0]), entity, elastic, fcache)
        # NOTE(review): the double-underscore name only resolves if this
        # function is defined inside the LTR class body -- confirm.
        ins = ltr.__gen_raw_ins(item[2], item[3])
        ins.features = ltr.get_features(ins)
        ins.target = 1
        inss.add_instance(ins)
        positive_annots.add((item[0], item[2]))

    # Adds all other instances
    PLOGGER.info("Adding all other instances (negative instances) ...")
    # Read the queries through a context manager so the handle is closed.
    with open(query_file, "r") as f_queries:
        queries = json.load(f_queries)
    for qid, q in sorted(queries.items()):
        PLOGGER.info("Query [" + qid + "]")
        ltr = LTR(Query(q, qid), entity, elastic, fcache)
        q_inss = ltr.get_candidate_inss()
        for ins in q_inss.get_all():
            # Candidates that are ground-truth annotations were added above.
            if (qid, ins.get_property("en_id")) in positive_annots:
                continue
            ins.target = 0
            inss.add_instance(ins)

    inss.to_json(train_set)
def main(args):
    """Scores a run with the ELR model and writes TREC and JSON outputs.

    For every query, the term-based score ``p_T_d`` taken from the run file
    is interpolated with the entity-based score ``p_E_d`` computed by
    ``ScorerELR``, using the configured ``lambdas``. The ranking is written
    in TREC format to ``output_file`` and the per-document features are
    dumped to ``json_file``.

    :param args: parsed command-line arguments; only ``args.config`` is read
    """
    config = FileUtils.load_config(args.config)
    elastic_term = ElasticCache(config["text_index"])
    lambdas = config.get("lambdas", [0.9, 0.1])

    # Input files are read through context managers so handles are released.
    with open(config["query_file"], "r") as f_queries:
        queries = json.load(f_queries)
    with open(config["mapping_file"], "r") as f_mappings:
        mappings = json.load(f_mappings)
    annots = load_annot(config["annot_file"])
    run = load_run(config["run_file"])

    instances = Instances()
    # gets the results; the context manager closes the file even on errors
    with open(config["output_file"], "w") as out_file:
        # qid_int is a running integer id of the query in sorted order.
        for qid_int, (qid, query) in enumerate(sorted(queries.items())):
            print("Scoring ", qid, "...")
            results = {}
            query_len = len(elastic_term.analyze_query(query).split())
            scorer = ScorerELR(ElasticCache(config["uri_index"]), annots[qid],
                               query_len, lambdas)
            for doc_id, p_T_d in sorted(run[qid].items()):
                query_mappings = get_mapping_query(annots[qid], mappings)
                p_E_d = scorer.score_doc(doc_id, query_mappings)
                properties = {
                    'doc_id': doc_id,
                    'query': query,
                    'qid': qid,
                    'qid_int': qid_int
                }
                features = {'p_T_d': p_T_d, 'p_E_d': p_E_d}
                ins = Instance(qid + "_" + doc_id, features=features,
                               properties=properties)
                instances.add_instance(ins)
                # Linear interpolation of the term- and entity-based scores.
                results[doc_id] = (lambdas[0] * p_T_d) + (lambdas[1] * p_E_d)

            # Write trec format
            out_file.write(trec_format(results, qid, "elr"))

    print("Output file:", config["output_file"])
    instances.to_json(config["json_file"])
    print("Output file:", config["json_file"])
def __get_scorer(self, query):
    """Factory method to get entity retrieval method.

    :param query: query handed to the generic scorer factory
    :return: an ``ELR`` object when the configured model is "elr", otherwise
        whatever ``Scorer.get_scorer`` returns for the configured index
    """
    if self.__config.get("model", None) == "elr":
        return ELR(self.__config)

    # Every other model is served by the scorers from core.retrieval.
    index_cache = ElasticCache(self.__config["index_name"])
    return Scorer.get_scorer(index_cache, query, self.__config)
def main(args):
    """Runs entity retrieval for a single query or for a batch of queries.

    :param args: parsed command-line arguments; ``args.config`` names the
        config file and ``args.query``, when truthy, is the query to run
    """
    er = ER(FileUtils.load_config(args.config), ElasticCache(DBPEDIA_INDEX))

    # Without a query on the command line, fall back to batch mode.
    if not args.query:
        er.batch_retrieval()
        return

    pprint(er.retrieve(args.query))
def main(args):
    """Trains the LTR model, links a single query, or links a batch.

    The mode is chosen in this order: model generation when the config has a
    truthy ``gen_model``, single-query linking when ``args.query`` is truthy,
    batch linking otherwise.

    :param args: parsed command-line arguments (``config`` and ``query``)
    """
    conf = FileUtils.load_config(args.config)
    el = EL(conf, Entity(), ElasticCache(DBPEDIA_INDEX), FeatureCache())

    if conf.get("gen_model", False):
        LTR.train(conf)
        return

    if args.query:
        pprint(el.link(args.query))
        return

    el.batch_linking()
def main(args):
    """Maps every loaded entity to fields and dumps the mappings as JSON.

    The output file is named after the input file: its extension is replaced
    by ``_mapping.json``.

    :param args: parsed command-line arguments; ``input`` (entity file),
        ``th`` (passed to ``load_entities``) and ``n`` (passed to
        ``FieldMapping``) are read
    """
    import os  # local import: only needed to derive the output file name

    entities = load_entities(args.input, args.th)
    mapper = FieldMapping(ElasticCache("dbpedia_2015_10_uri"), args.n)

    mappings = {}
    for i, en_id in enumerate(entities, 1):
        mappings[en_id] = mapper.map(en_id)
        if i % 10 == 0:
            print(i, "entities processed!")

    # splitext leaves the name intact when there is no extension and ignores
    # dots in directory names; slicing at rfind(".") got both cases wrong.
    out_file = os.path.splitext(args.input)[0] + "_mapping" + ".json"
    # The context manager guarantees the JSON is flushed and the file closed.
    with open(out_file, "w") as f_out:
        json.dump(mappings, f_out, indent=4, sort_keys=True)
    print("Output file:", out_file)
def __init__(self, index_name, retr_model, retr_params, num_docs=None,
             field="content", run_id="fusion", num_objs=100,
             assoc_mode=FusionScorer.ASSOC_MODE_BINARY, assoc_file=None):
    """Sets up a late-fusion scorer on top of the base fusion scorer.

    :param index_name: name of index
    :param retr_model: the retrieval model; valid values: "lm", "bm25"
    :param retr_params: config including smoothing method and parameter
    :param num_docs: number of documents stored as the first-pass setting
    :param field: field stored as the first-pass setting
    :param run_id: run id forwarded to the parent constructor
    :param num_objs: the number of ranked objects for a query
    :param assoc_mode: document-object weight mode, uniform or binary
    :param assoc_file: document-object association file
    """
    super(LateFusionScorer, self).__init__(index_name,
                                           association_file=assoc_file,
                                           run_id=run_id)

    # Plain copies of the constructor arguments.
    self._model = retr_model
    self._params = retr_params
    self._assoc_mode = assoc_mode
    self._num = num_objs
    self._field = field
    self._num_docs = num_docs

    # self._index_name is read back from the instance after the parent
    # constructor ran (presumably set there -- verify against FusionScorer).
    first_pass = {"num_docs": num_docs, "field": field}
    self.__config = {
        "index_name": self._index_name,
        "first_pass": first_pass,
    }
    self._elastic = ElasticCache(self._index_name)
class FeaturesTermStats():
    """Generates person-profession features from profession term statistics.

    Term statistics (tf, df, tf-idf and rank per profession term) are loaded
    with ``load_termstats``; ``generate_features`` then writes, for each
    person-profession pair, the ``sumProfTerms_k`` and ``simCos_k`` features
    for every k in ``K_VALUES``.
    """

    CONTENT_FIELD = "content"
    PROF_FIELD = "professions"
    K_VALUES = [10, 50, 100, 200, 500, 1000]
    STOPWORDS = [
        "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
        "your", "yours", "yourself", "yourselves", "he", "him", "his",
        "himself", "she", "her", "hers", "herself", "it", "its", "itself",
        "they", "them", "their", "theirs", "themselves", "what", "which",
        "who", "whom", "this", "that", "these", "those", "am", "is", "are",
        "was", "were", "be", "been", "being", "have", "has", "had", "having",
        "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if",
        "or", "because", "as", "until", "while", "of", "at", "by", "for",
        "with", "about", "against", "between", "into", "through", "during",
        "before", "after", "above", "below", "to", "from", "up", "down", "in",
        "out", "on", "off", "over", "under", "again", "further", "then",
        "once", "here", "there", "when", "where", "why", "how", "all", "any",
        "both", "each", "few", "more", "most", "other", "some", "such", "no",
        "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s",
        "t", "can", "will", "just", "don", "should", "now"
    ]
    MAX_K = max(K_VALUES)

    def __init__(self, index_name=WP_ST_INDEX_ID):
        """
        :param index_name: name of the index wrapped by ``ElasticCache``
        """
        self.__elastic = ElasticCache(index_name)
        self.__stats = None  # filled by load_termstats()

    def load_termstats(self, input_file):
        """Loads term statistics from a tab-separated file.

        Each line holds: profession, term, tf, df, tfidf. The rank of a term
        is its 1-based line position within its profession, counted before
        stopwords and entity terms are filtered out.

        :param input_file: path handed to ``FileUtils.open_file_by_type``
        """
        self.__stats = {}
        # Build the lookup set once; the list scan was repeated per line.
        stopwords = frozenset(self.STOPWORDS)
        with FileUtils.open_file_by_type(input_file) as f_in:
            rank = 0
            last_prof = None
            for line in f_in:
                prof, term, tf, df, tfidf = line.strip().split("\t")
                if prof != last_prof:
                    # A new profession starts: restart the rank counter.
                    rank = 0
                    last_prof = prof
                rank += 1
                if term in stopwords:  # filter stopwords
                    continue
                if term.startswith("fb_"):  # filter entity terms
                    continue
                self.__stats.setdefault(prof, {})[term] = {
                    "tf": int(tf),
                    "df": int(df),
                    "tfidf": float(tfidf),
                    "rank": rank
                }

    def get_person_tf(self, person_id):
        """Gets the aggregated term frequencies for a person.

        :param person_id: person identifier, searched in the content field
        :return: tuple (dict mapping each term to its summed term frequency
            over the retrieved documents, number of retrieved documents);
            at most 10000 documents are requested from the index
        """
        doc_ids = self.__elastic.search(person_id, self.CONTENT_FIELD,
                                        num=10000).keys()
        print(person_id, "with", len(doc_ids), "sentences")
        tf_agg = {}
        for doc_id in doc_ids:
            tv = self.__elastic.get_termvector(doc_id, self.CONTENT_FIELD)
            for t, val in tv.items():
                tf_agg[t] = tf_agg.get(t, 0) + val["term_freq"]
        return tf_agg, len(doc_ids)

    def generate_features(self, kb_file, output_file):
        """Generates the features related to term statistics.

        ``load_termstats`` must have been called before.

        :param kb_file: file with one tab-separated (person, profession)
            pair per line
        :param output_file: path of the TSV file to write
        """
        # The context manager closes the output even if a lookup fails.
        with open(output_file, "w") as fout:
            # write tsv header
            header = ["person", "profession"]
            for k in self.K_VALUES:
                header.append("sumProfTerms_" + str(k))
                header.append("simCos_" + str(k))
            fout.write("\t".join(header) + "\n")

            with FileUtils.open_file_by_type(kb_file) as kb_f:
                for line in kb_f:
                    person_id, prof_id = line.strip().split("\t")
                    values = [person_id, prof_id]
                    person_tf, num_sent = self.get_person_tf(person_id)
                    # Statistics of this profession; empty if it has none.
                    has_prof = prof_id in self.__stats
                    prof_stats = self.__stats.get(prof_id, {})

                    for k in self.K_VALUES:  # we take top-K profession terms
                        # Compute sumProfTerms:
                        # \sum_{t \in T_k(pr)}\sum_{s \in S(pe)} tf(t,s) w(t,pr)
                        # where w(t,pr) is the TF-IDF of t for the profession
                        sum_prof_terms = 0
                        for term, tf in person_tf.items():
                            pt = prof_stats.get(term, {})
                            # skip term if not in top-K
                            if pt.get("rank", 100000) > k:
                                continue
                            sum_prof_terms += tf * pt.get("tfidf", 0)
                        values.append(str(sum_prof_terms))

                        # compute simCosK
                        # where K is the top-K terms for the profession
                        if has_prof:
                            vec_pr = []  # profession vector
                            vec_pe = []  # person vector
                            for term, s in prof_stats.items():
                                if s["rank"] <= k:
                                    vec_pr.append(s["tfidf"])
                                    # we back-generate IDF from the
                                    # profession's TF-IDF
                                    idf = s["tfidf"] / s["tf"]
                                    vec_pe.append(person_tf.get(term, 0) * idf)
                            cos = cos_sim(vec_pr, vec_pe)
                        else:
                            # in some exceptional cases the profession does
                            # not have any sentences
                            cos = 0
                        values.append(str(cos))

                    fout.write("\t".join(values) + "\n")
                    print(values)
class Retrieval(object):
    """Loads config file, checks params, and sets default values.

    :param config: retrieval config (JSON config file or a dictionary) of the shape:
    ::
        {
            "index_name": name of the index,
            "first_pass": {
                "num_docs": number of documents in first-pass scoring (default: 1000)
                "field": field used in first pass retrieval (default: Elastic.FIELD_CATCHALL)
                "fields_return": comma-separated list of fields to return for each hit (default: "")
                "model": first-pass model (default: Elastic.BM25)
            },
            "num_docs": number of documents to return (default: 100)
            "start": starting offset for ranked documents (default: 0)
            "model": name of retrieval model; accepted values: [lm, mlm, prms]
                     (default: None, i.e. only the first pass is performed)
            "field": field name for LM (default: catchall)
            "fields": list of fields for PRMS (default: [catchall])
            "field_weights": dictionary with fields and corresponding weights for MLM (default: {catchall: 1})
            "smoothing_method": accepted values: [jm, dirichlet] (default: dirichlet)
            "smoothing_param": value of lambda or mu; accepted values: [float or "avg_len"],
                               (jm default: 0.1, dirichlet default: 2000)
            "query_file": name of query file (JSON),
            "output_file": name of output file,
            "run_id": run id for TREC output
        }
    """
    FIELDED_MODELS = {"mlm", "prms"}
    LM_MODELS = {"lm", "mlm", "prms"}

    def __init__(self, config):
        """Validates ``config`` (filling defaults in place) and stores it."""
        self.check_config(config)
        self.__config = config
        self.__index_name = config["index_name"]
        self.__first_pass_num_docs = int(config["first_pass"]["num_docs"])
        self.__first_pass_field = config["first_pass"]["field"]
        self.__first_pass_fields_return = config["first_pass"]["fields_return"]
        self.__first_pass_model = config["first_pass"]["model"]
        self.__start = int(config["start"])
        self.__model = config.get("model", None)
        self.__num_docs = int(config.get("num_docs", None))
        self.__query_file = config.get("query_file", None)
        self.__output_file = config.get("output_file", None)
        self.__run_id = config.get("run_id", self.__model)
        self.__elastic = ElasticCache(self.__index_name)

    @staticmethod
    def check_config(config):
        """Checks config parameters and sets default values.

        The dictionary is modified in place. On an invalid configuration an
        error is printed and the process exits with status 1.

        :param config: retrieval configuration dictionary
        """
        try:
            if config.get("index_name", None) is None:
                raise Exception("index_name is missing")

            # Checks first pass parameters
            if config.get("first_pass", None) is None:
                config["first_pass"] = {}
            first_pass = config["first_pass"]
            if first_pass.get("num_docs", None) is None:
                first_pass["num_docs"] = 1000
            if first_pass.get("field", None) is None:
                first_pass["field"] = Elastic.FIELD_CATCHALL
            if first_pass.get("fields_return", None) is None:
                first_pass["fields_return"] = ""
            if first_pass.get("model", None) is None:
                first_pass["model"] = Elastic.BM25

            if config.get("start", None) is None:
                config["start"] = 0
            if config.get("num_docs", None) is None:
                config["num_docs"] = 100

            # Makes sure the key exists so that it can be indexed below.
            config.setdefault("model", None)
            if config.get("field", None) is None:
                config["field"] = Elastic.FIELD_CATCHALL
            if config.get("fields", None) is None:
                config["fields"] = [Elastic.FIELD_CATCHALL]
            if config.get("field_weights", None) is None:
                config["field_weights"] = {Elastic.FIELD_CATCHALL: 1}

            # Smoothing settings only apply to the language-model scorers.
            if config["model"] in Retrieval.LM_MODELS:
                if config.get("smoothing_method", None) is None:
                    config["smoothing_method"] = ScorerLM.DIRICHLET
                if config.get("smoothing_param", None) is None:
                    if config["smoothing_method"] == ScorerLM.DIRICHLET:
                        config["smoothing_param"] = 2000
                    elif config["smoothing_method"] == ScorerLM.JM:
                        config["smoothing_param"] = 0.1
                    else:
                        raise Exception("Smoothing method is not supported.")
        except Exception as e:
            print("Error in config file: ", e)
            sys.exit(1)

    def _first_pass_scoring(self, analyzed_query):
        """Returns first-pass scoring of documents.

        :param analyzed_query: analyzed query
        :return: RetrievalResults object
        """
        print("\tFirst pass scoring... ", )
        # todo: add support for other similarities
        res1 = self.__elastic.search(analyzed_query, self.__first_pass_field,
                                     num=self.__first_pass_num_docs,
                                     fields_return=self.__first_pass_fields_return)
        return res1

    def _second_pass_scoring(self, res1, scorer):
        """Returns second-pass scoring of documents.

        :param res1: first pass results
        :param scorer: scorer object
        :return: RetrievalResults object
        """
        print("\tSecond pass scoring... ", )
        res2 = {}
        for doc_id in res1.keys():
            # The returned fields are carried over from the first pass.
            res2[doc_id] = {"score": scorer.score_doc(doc_id),
                            "fields": res1[doc_id].get("fields", {})}
        print("done")
        return res2

    def retrieve(self, query, scorer=None):
        """Scores documents for the given query.

        :param query: raw query string; it is analyzed before retrieval
        :param scorer: optional scorer for the second pass; when omitted one
            is built with ``Scorer.get_scorer``
        :return: first-pass results if no model is configured, second-pass
            results otherwise
        """
        query = self.__elastic.analyze_query(query)

        # 1st pass retrieval
        res1 = self._first_pass_scoring(query)
        if self.__model is None:
            return res1

        # 2nd pass retrieval
        scorer = scorer if scorer else Scorer.get_scorer(self.__elastic, query,
                                                         self.__config)
        res2 = self._second_pass_scoring(res1, scorer)
        return res2

    def batch_retrieval(self):
        """Scores queries in a batch and outputs results."""
        with open(self.__query_file) as f_queries:
            queries = json.load(f_queries)

        # Opening in "w" mode truncates the output file; the context manager
        # closes it even if retrieval fails for some query.
        with open(self.__output_file, "w") as out:
            # retrieves documents
            for query_id in sorted(queries):
                print("scoring [" + query_id + "] " + queries[query_id])
                results = self.retrieve(queries[query_id])
                out.write(self.trec_format(results, query_id, self.__num_docs))

        print("Output file:", self.__output_file)

    def trec_format(self, results, query_id, max_rank=100):
        """Outputs results in TREC format.

        :param results: dictionary mapping doc ids either to result
            dictionaries with a "score" entry (as produced by ``retrieve``)
            or to plain numeric scores
        :param query_id: query id written in the first column
        :param max_rank: maximum number of lines to output
        :return: string with one TREC line per document, best score first
        """
        def score_of(entry):
            # Results of retrieve() are dictionaries; sorting the dictionaries
            # themselves is a TypeError, so the score is pulled out.
            return entry["score"] if isinstance(entry, dict) else entry

        ranked = sorted(results.items(), key=lambda x: score_of(x[1]),
                        reverse=True)
        lines = []
        for rank, (doc_id, entry) in enumerate(ranked[:max(max_rank, 0)], 1):
            lines.append(query_id + "\tQ0\t" + doc_id + "\t" + str(rank) +
                         "\t" + str(score_of(entry)) + "\t" + self.__run_id +
                         "\n")
        return "".join(lines)
from nordlys.logic.entity.entity import Entity
from nordlys.logic.features.feature_cache import FeatureCache
from nordlys.services.el import EL
from nordlys.services.er import ER
from nordlys.services.tti import TTI
from nordlys.core.utils.logging_utils import RequestHandler
from nordlys.core.utils.Api_handler import API_Handler
import logging, traceback
from time import strftime
from nordlys.config import LOGGING_PATH, PLOGGER, ELASTIC_INDICES, Api_Log_Path

# Variables
# The objects below are created once at import time.
DBPEDIA_INDEX = ELASTIC_INDICES[0]
__entity = Entity()
__elastic = ElasticCache(DBPEDIA_INDEX)
__fcache = FeatureCache()
app = Flask(__name__)

def error(str):
    """Builds the response for an error message.

    TODO: complete error handling.

    :param str: the error message (note: the name shadows the builtin ``str``)
    :return: result of ``jsonify`` called with the message under the
        "ERROR" key
    """
    res = {"ERROR": str}
    return jsonify(**res)
# NOTE(review): the statements down to ``return mapping_probs`` are the tail
# of a method whose ``def`` line lies outside this excerpt.
mapping_probs = {}
if denominator > 0:  # if the term is present in the collection
    for f in numerators:
        mapping_probs[f] = numerators[f] / denominator
        if self.SCORER_DEBUG:
            # NOTE(review): the denominator printed here is the sum of the
            # numerators, not the ``denominator`` used above -- confirm.
            print("\t\tf= {}\tt= {}\tp(f|t)= {}/{} = {}".format(
                f, t, numerators[f], sum(numerators.values()), mapping_probs[f]))
return mapping_probs

def get_total_field_freq(self):
    """Returns total occurrences of all fields.

    The value is the sum of ``self._elastic.doc_count(f)`` over all fields in
    ``self._fields``. It is computed on the first call and cached in
    ``self.total_field_freq``.
    """
    if self.total_field_freq is None:
        total_field_freq = 0
        for f in self._fields:
            total_field_freq += self._elastic.doc_count(f)
        self.total_field_freq = total_field_freq
    return self.total_field_freq

if __name__ == "__main__":
    # Small manual check against a toy index.
    query = "gonna friends"
    doc_id = "4"
    es = ElasticCache("toy_index")
    # The "__fields" key appears twice in the literal, so only the last value
    # (the list) is kept under that key.
    params = {"fields": "content",
              "__fields": {"title": 0.2, "content": 0.8},
              "__fields": ["content", "title"]
              }
    score = ScorerPRMS(es, query, params).score_doc(doc_id)
    print(score)
class ProfStats():
    """Computes tf-idf statistics of the terms associated with a profession."""

    CONTENT_FIELD = "content"
    PROF_FIELD = "professions"
    K = 30000  # keep top-K profession terms

    def __init__(self, index_name=WP_ST_INDEX_ID):
        """
        :param index_name: name of the index wrapped by ``ElasticCache``
        """
        self.__elastic = ElasticCache(index_name)

    def gen_stats(self, prof, output_file):
        """Writes the stats into the file.

        One tab-separated line (profession, term, tf, df, tf-idf) is appended
        for each of the top-K terms, ordered by decreasing tf-idf.

        :param prof: profession searched in the professions field
        :param output_file: path of the file the lines are appended to
        """
        print("\tgetting term frequencies ...")
        tf, df = self.get_tf_agg(prof)
        print("\tcomputing tf-idf ...")
        tf_idf = self.compute_tf_idf(tf, df)

        ranked = sorted(tf_idf.items(), key=lambda x: x[1], reverse=True)
        lines = [
            prof + "\t" + t + "\t" + str(tf[t]) + "\t" + str(df[t]) + "\t" +
            str(tfidf) + "\n"
            for t, tfidf in ranked[:self.K]  # Only print top-k terms
        ]
        # The context manager flushes and closes the appended file.
        with open(output_file, "a") as f_out:
            f_out.write("".join(lines))

    def compute_tf_idf(self, tf, df):
        """Computes tf.idf = (tf/doc_len) * (log n(docs)/df)

        :param tf: dictionary of tf for all terms
        :param df: dictionary of df for all terms
        :return: dictionary of tf.idf scores
        """
        if not tf:
            return {}
        prof_doc_len = sum(tf.values())
        # The collection size is the same for every term: query it once.
        n_docs = self.__elastic.num_docs()
        tf_idf = {}
        for t in tf:
            normalized_tf = tf[t] / prof_doc_len
            idf = math.log(n_docs / df[t])
            tf_idf[t] = normalized_tf * idf
        return tf_idf

    def get_df(self, terms):
        """Returns document frequency for all terms.

        :param terms: iterable of terms
        :return: dictionary mapping each term to its document frequency in
            the content field
        """
        df = {}
        for t in terms:
            df[t] = self.__elastic.doc_freq(t, field=self.CONTENT_FIELD)
        return df

    def get_tf_agg(self, prof):
        """Aggregates term and document frequencies for a profession.

        :param prof: profession searched in the professions field
        :return: tuple (tf, df): ``tf`` maps each term to its term frequency
            summed over the retrieved documents; ``df`` maps each term to the
            document frequency reported by the first term vector containing it
        """
        size = 1000
        tf_agg = {}
        df = {}
        doc_ids = self.__elastic.search_scroll(prof, field=self.PROF_FIELD,
                                               num=size).keys()
        print(len(doc_ids), "sentences")
        for doc_id in doc_ids:
            tv = self.__elastic.get_termvector(doc_id, self.CONTENT_FIELD,
                                               term_stats=True)
            for t, val in tv.items():
                tf_agg[t] = tf_agg.get(t, 0) + val["term_freq"]
                if t not in df:
                    df[t] = val["doc_freq"]
        return tf_agg, df
class FeaturesTermStats():
    """Generates the person-nationality co-occurrence frequency features."""

    CONTENT_FIELD = "content"
    STOPWORDS = [
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
        "in", "into", "is", "it", "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these", "they", "this",
        "to", "was", "will", "with"
    ]

    def __init__(self, index_name=WP_ST_INDEX_ID):
        """
        :param index_name: name of the index wrapped by ``ElasticCache``
        """
        self.__elastic = ElasticCache(index_name)
        self.__stats = None

    def __count_hits(self, body):
        """Returns the number of documents retrieved for a query body."""
        # At most 10000 documents are requested, which caps the count.
        return len(self.__elastic.search_complex(body, self.CONTENT_FIELD,
                                                 num=10000).keys())

    def __count_cooccurrences(self, person_id, phrase):
        """Counts documents matching the person and containing the phrase."""
        phrase_query = self.__elastic.analyze_query(phrase)
        body = {
            "query": {
                "bool": {
                    "must": [{
                        "match": {
                            "content": person_id
                        }
                    }, {
                        "match_phrase": {
                            "content": phrase_query
                        }
                    }]
                }
            }
        }
        return self.__count_hits(body)

    def get_per_nat_tf(self, person_id, nats):
        r"""Computes freqPerNat: \frac{|\{s : pe \in s, nt \in s\}|}{|S(pe)|}

        :param person_id: person identifier
        :param nats: nationality noun and adjective, e.g. [Germany, German]
        :return: tuple (freqPerNat for the noun, freqPerNat for the adjective);
            both are 0.0 when no sentence contains the person
        """
        body = {"query": {"bool": {"must": {"term": {"content": person_id}}}}}
        n_s_pe = self.__count_hits(body)  # number of sentences with person
        if n_s_pe == 0:
            # Both ratios are defined as 0.0 here, so the two co-occurrence
            # searches are not needed.
            return 0.0, 0.0

        n_co_noun = self.__count_cooccurrences(person_id, nats[0])
        n_co_adj = self.__count_cooccurrences(person_id, nats[1])
        return n_co_noun / n_s_pe, n_co_adj / n_s_pe

    def generate_features(self, kb_file, output_file1, output_file2):
        """Generates features of freq-person-nationality.

        :param kb_file: file with tab-separated lines of the form
            (person, nationality, noun, adjective)
        :param output_file1: TSV output for the noun-based feature
        :param output_file2: TSV output for the adjective-based feature
        """
        # Context managers close both outputs even if a query fails.
        with open(output_file1, "w") as fout1, \
                open(output_file2, "w") as fout2:
            # write tsv header
            header = ["person", "nationality", "freq_person_nationality_noun"]
            fout1.write("\t".join(header) + "\n")
            header = ["person", "nationality", "freq_person_nationality_adj"]
            fout2.write("\t".join(header) + "\n")

            with FileUtils.open_file_by_type(kb_file) as kb_f:
                for line_count, line in enumerate(kb_f, 1):
                    print(line_count)  # progress indicator
                    person_id, nat_id, noun, adj = line.strip().split("\t")
                    fpn_noun, fpn_adj = self.get_per_nat_tf(person_id,
                                                            [noun, adj])
                    values_noun = [person_id, nat_id, str(fpn_noun)]
                    values_adj = [person_id, nat_id, str(fpn_adj)]
                    fout1.write("\t".join(values_noun) + "\n")
                    fout2.write("\t".join(values_adj) + "\n")
def __init__(self, index_name=WP_ST_INDEX_ID):
    """Creates the index wrapper and starts with no statistics loaded.

    :param index_name: name of the index wrapped by ``ElasticCache``
    """
    self.__stats = None
    self.__elastic = ElasticCache(index_name)
class FeaturesW2VSim(object):
    """Implements our simCosW2V feature.

    The feature is the cosine similarity between the profession and person
    vectors, where the profession (resp. person) vector is the centroid of
    TFIDF-weighted word2vec vectors of top-K profession (resp. person) terms.
    """

    # Formula for feature computation:
    #
    # $cos(\vec{t}^{w2v}_{pe, k}, \vec{t}^{w2v}_{pr, k})$, where for item $\in \{pe, pr\}$:
    # $$\vec{t}^{w2v}_{item, k} = \sum_{t \in T_k(item)} w(t, item) w2v(t)$$
    # (note that using these unnormalized sums in the computation of $cos()$
    # is equivalent to use the actual centroids).

    CONTENT_FIELD = "content"
    PROF_FIELD = "professions"
    K_VALUES = [10, 50, 100, 200, 500, 1000]
    MAX_K = max(K_VALUES)

    def __init__(self, index_name=WP_ST_INDEX_ID):
        """
        :param index_name: name of the index wrapped by ``ElasticCache``
        """
        self.__elastic = ElasticCache(index_name)
        self.__stats = None

    def load_termstats(self, input_file):
        """Loads per-profession term statistics from a tab-separated file.

        Every line holds: profession, term, tf, df, tfidf. Ranks are 1-based
        line positions within a profession and are counted before stopwords
        and entity terms are dropped.

        :param input_file: path handed to ``FileUtils.open_file_by_type``
        """
        self.__stats = {}
        with FileUtils.open_file_by_type(input_file) as f_in:
            position = 0
            current_prof = None
            for raw in f_in:
                prof, term, tf, df, tfidf = raw.strip().split("\t")
                if prof != current_prof:
                    position, current_prof = 0, prof
                position += 1
                # filter stopwords and entity terms
                if term in STOPWORDS or term.startswith("fb_"):
                    continue
                entry = {
                    "tf": int(tf),
                    "df": int(df),
                    "tfidf": float(tfidf),
                    "rank": position
                }
                self.__stats.setdefault(prof, {})[term] = entry

    def get_person_tf(self, person_id):
        """Get aggregated TF for a person.

        :param person_id: person identifier, searched in the content field
        :return: tuple (term -> summed term frequency, number of documents)
        """
        hits = self.__elastic.search(person_id, self.CONTENT_FIELD,
                                     num=10000).keys()
        totals = {}
        for doc_id in hits:
            term_vector = self.__elastic.get_termvector(doc_id,
                                                        self.CONTENT_FIELD)
            for term, info in term_vector.items():
                totals[term] = totals.get(term, 0) + info["term_freq"]
        return totals, len(hits)

    def generate_features(self, kb_file, output_file):
        """Core function for generating into output_file the features, with
        person-item data from kb_file.

        :param kb_file: path to the file with person items (a '.kb'-extension
            file).
        :param output_file: path of the TSV file to write
        :return:
        """
        w2v = FeaturesW2VSimApprox()
        with open(output_file, "w") as f_out:
            # write tsv header
            columns = ["person_id", "prof_id"]
            columns.extend("simCos_w2v_" + str(k) for k in self.K_VALUES)
            f_out.write("\t".join(columns) + "\n")

            for line in FileUtils.read_file_as_list(kb_file):
                # strip() done in read_file_as_list()
                person_id, prof_id = line.split("\t")
                row = [person_id, prof_id]
                person_tf, _ = self.get_person_tf(person_id)

                for k in self.K_VALUES:  # we take top-K profession terms
                    if prof_id not in self.__stats:
                        # in some exceptional cases the profession does not
                        # have any sentences
                        row.append(str(0))
                        continue

                    # compute simCosK, where K is the top-K terms for the
                    # profession
                    weights_pr = {}  # top-K profession terms -> tfidf weights
                    weights_pe = {}  # top-K person terms -> tfidf weights
                    for term, s in self.__stats[prof_id].items():
                        if s["rank"] > k:
                            continue
                        weights_pr[term] = float(s["tfidf"])
                        # we back-generate IDF from profession's TF-IDF
                        idf = s["tfidf"] / s["tf"]
                        weights_pe[term] = person_tf.get(term, 0) * idf
                    vec_pr = w2v.get_vector(weights_pr)
                    vec_pe = w2v.get_vector(weights_pe)
                    row.append(str(cos_sim(vec_pr, vec_pe)))

                f_out.write("\t".join(row) + "\n")
class Retrieval(object):
    """Two-stage retrieval: first-pass search followed by optional re-scoring.

    The configuration dictionary is validated and completed with defaults by
    ``check_config``. With model "bm25" only the first pass is performed;
    otherwise the first-pass documents are re-scored by a ``Scorer``.
    """

    FIELDED_MODELS = {"mlm", "prms"}
    LM_MODELS = {"lm", "mlm", "prms"}

    def __init__(self, config):
        """Validates ``config`` (filling defaults in place) and stores it.

        :param config: retrieval configuration dictionary
        """
        self.check_config(config)
        self.__config = config
        self.__index_name = config["index_name"]
        self.__first_pass_num_docs = int(config["first_pass"]["1st_num_docs"])
        self.__first_pass_field = config["first_pass"]["field"]
        self.__first_pass_fields_return = config["first_pass"]["fields_return"]
        self.__first_pass_model = config["first_pass"]["model"]
        self.__start = int(config["start"])
        self.__model = config.get("model", None)
        self.__num_docs = int(config.get("num_docs", None))
        self.__query_file = config.get("query_file", None)
        self.__output_file = config.get("output_file", None)
        self.__run_id = config.get("run_id", self.__model)
        self.__elastic = ElasticCache(self.__index_name)

    @staticmethod
    def check_config(config):
        """Checks config parameters and sets default values.

        The dictionary is modified in place. On an invalid configuration the
        error is logged and the process exits with status 1.

        :param config: retrieval configuration dictionary
        """
        try:
            if config.get("index_name", None) is None:
                raise Exception("index_name is missing")

            # Checks first pass parameters
            if config.get("first_pass", None) is None:
                config["first_pass"] = {}
            first_pass = config["first_pass"]
            if first_pass.get("1st_num_docs", None) is None:
                first_pass["1st_num_docs"] = 1000
            if first_pass.get("field", None) is None:
                first_pass["field"] = Elastic.FIELD_CATCHALL
            if first_pass.get("fields_return", None) is None:
                first_pass["fields_return"] = ""
            if first_pass.get("model", None) is None:
                first_pass["model"] = Elastic.BM25

            if config.get("start", None) is None:
                config["start"] = 0
            if config.get("num_docs", None) is None:
                config["num_docs"] = 100

            model = config.get("model", None)
            # Smoothing settings only apply to the language-model scorers.
            if model in Retrieval.LM_MODELS:
                if config.get("smoothing_method", None) is None:
                    config["smoothing_method"] = ScorerLM.DIRICHLET
                if config.get("smoothing_param", None) is None:
                    if config["smoothing_method"] == ScorerLM.DIRICHLET:
                        config["smoothing_param"] = 2000
                    elif config["smoothing_method"] == ScorerLM.JM:
                        config["smoothing_param"] = 0.1
                    else:
                        raise Exception("Smoothing method is not supported.")

            # The default shape of "fields" depends on the model: a single
            # field for lm, field weights for mlm, a field list for prms.
            if config.get("fields", None) is None:
                if model == "lm":
                    config["fields"] = Elastic.FIELD_CATCHALL
                elif model == "mlm":
                    config["fields"] = {"similar_entity_names": 0.2,
                                        "catchall": 0.8}
                elif model == "prms":
                    config["fields"] = [Elastic.FIELD_CATCHALL]
        except Exception as e:
            # A "%s" placeholder is required: passing the exception as an
            # extra argument without one makes the logging call itself fail.
            PLOGGER.error("Error in config file: %s", e)
            sys.exit(1)

    def __get_fields(self):
        """Returns the name of all fields that will be used in the retrieval model."""
        fields = self.__config["fields"]
        if isinstance(fields, str):
            return [fields]
        if isinstance(fields, dict):
            return list(fields.keys())
        return fields

    def _first_pass_scoring(self, analyzed_query):
        """Returns first-pass scoring of documents.

        :param analyzed_query: analyzed query
        :return: RetrievalResults object
        """
        PLOGGER.debug("\tFirst pass scoring... ", )
        res1 = self.__elastic.search(analyzed_query, self.__first_pass_field,
                                     num=self.__first_pass_num_docs,
                                     fields_return=self.__first_pass_fields_return)
        return res1

    def _second_pass_scoring(self, res1, scorer):
        """Returns second-pass scoring of documents.

        :param res1: first pass results
        :param scorer: scorer object
        :return: RetrievalResults object
        """
        PLOGGER.debug("\tSecond pass scoring... ", )
        # Term vectors of all first-pass documents are requested per field
        # before the documents are scored one by one.
        for field in self.__get_fields():
            self.__elastic.multi_termvector(list(res1.keys()), field)
        res2 = {}
        for doc_id in res1.keys():
            res2[doc_id] = {"score": scorer.score_doc(doc_id),
                            "fields": res1[doc_id].get("fields", {})}
        PLOGGER.debug("done")
        return res2

    def retrieve(self, query, scorer=None):
        """Scores documents for the given query.

        :param query: raw query string; it is analyzed before retrieval
        :param scorer: optional scorer for the second pass; when omitted one
            is built with ``Scorer.get_scorer``
        :return: first-pass results for model "bm25", second-pass results
            otherwise
        """
        query = self.__elastic.analyze_query(query)

        # 1st pass retrieval
        res1 = self._first_pass_scoring(query)
        if self.__model == "bm25":
            return res1

        # 2nd pass retrieval
        scorer = scorer if scorer else Scorer.get_scorer(self.__elastic, query,
                                                         self.__config)
        res2 = self._second_pass_scoring(res1, scorer)
        return res2

    def batch_retrieval(self):
        """Scores queries in a batch and outputs results."""
        with open(self.__query_file) as f_queries:
            queries = json.load(f_queries)

        # Opening in "w" mode truncates the output file; the context manager
        # closes it even if retrieval fails for some query.
        with open(self.__output_file, "w") as out:
            # retrieves documents
            for query_id in sorted(queries):
                PLOGGER.info("scoring [" + query_id + "] " + queries[query_id])
                results = self.retrieve(queries[query_id])
                out.write(self.trec_format(results, query_id, self.__num_docs))

        PLOGGER.info("Output file:" + self.__output_file)

    def trec_format(self, results, query_id, max_rank=100):
        """Outputs results in TREC format.

        :param results: dictionary mapping doc ids to result dictionaries
            holding a "score" entry
        :param query_id: query id written in the first column
        :param max_rank: maximum number of lines to output
        :return: string with one TREC line per document, best score first
        """
        ranked = sorted(results.items(), key=lambda x: x[1]["score"],
                        reverse=True)
        lines = []
        for rank, (doc_id, score) in enumerate(ranked[:max(max_rank, 0)], 1):
            lines.append(query_id + "\tQ0\t" + doc_id + "\t" + str(rank) +
                         "\t" + str(score["score"]) + "\t" + self.__run_id +
                         "\n")
        return "".join(lines)
def __init__(self, query, retrieval_config):
    """Stores the query and retrieval config and opens the type index.

    :param query: the query, stored as given
    :param retrieval_config: retrieval configuration, stored as given
    """
    # NOTE(review): "__elasttic" looks like a misspelling of "__elastic"; the
    # name is kept because the methods reading it are not visible here.
    self.__elasttic = ElasticCache(TC_INDEX)
    self.__retrieval_config = retrieval_config
    self.__query = query