def main():
    pp = pprint.PrettyPrinter()

    data_dir = 'data/easyCLEF08'
    txt_file = os.path.join(data_dir, 'easyCLEF08_text.txt')
    query_file = os.path.join(data_dir, 'easyCLEF08_query.txt')
    relevants_file = os.path.join(data_dir, 'easyCLEF08_gt.txt')

    index = Index(directory=data_dir, txt_file=txt_file, create_index=False)
    okapi = Okapi(index, k1=1.80, b=0.65)
    diversity = DiversityClustering(DBSCANClustering, index)

    q = QueryParser(relevants_file)
    q.q.initFile(query_file)

    # while True:
    for i in range(1):
        query = q.nextQuery()
        if query is None:
            break
        if len(query.relevants) == 0:
            print('No relevant docs')
            continue
        docs_scores = okapi.getRanking(query.getText())
        ordered_pred = diversity.order_pred(query, docs_scores)
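

if __name__ == '__main__':
    # Not part of the original snippet: a standard entry point so the script can
    # be run directly.
    main()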
def compare_models(names, models, measures=['PrecisionRecall', 'AveragePrecision']):
    rel_filename = 'cacm/cacm.rel'
    query_filename = 'cacm/cacm.qry'
    query_parser = QueryParser(rel_filename)
    query_parser.q.initFile(query_filename)
    train_queries, _ = query_parser.split_query_dataset()

    pp = pprint.PrettyPrinter()
    models_scores = {}
    queries_results = {name: [] for name in names}

    for query in train_queries:
        query_txt = query.getText()
        for name, model in zip(names, models):
            scores = model.getRanking(query_txt)
            results = IRList(query, scores)
            queries_results[name].append(results)

    for model_name in names:
        print('==== {} ===='.format(model_name))
        model_score = EvalIRModel(measures).eval_model(queries_results[model_name])
        models_scores[model_name] = model_score

    pp.pprint(models_scores)
    return models_scores
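

# Usage sketch (illustrative, not part of the original code): compare the Okapi and
# language models used elsewhere in this project on the CACM training queries; the
# hyper-parameters simply mirror the values used in the other scripts here.
index = Index()
compare_models(['Okapi', 'LanguageModel'],
               [Okapi(index, k1=1.80, b=0.65), LanguageModel(index, lissage=.8)])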
def write_data(self):
    filename = 'data/train.dat'
    with open(filename, 'w') as file:
        query_parser = QueryParser('cacm/cacm.rel')
        query_parser.q.initFile('cacm/cacm.qry')
        train_queries, _ = query_parser.split_query_dataset()

        index = self.index
        featurers_count = len(self.featurers_list.featurers)
        docs_ids = list(index.docs.keys())  # list() so random.choice can index it
        N = len(docs_ids)
        random.seed(SEED)

        for query in train_queries:
            queryId = query.getId()
            relevants = [str(d[1]) for d in query.relevants]
            n_relevants = len(relevants)

            for i, pertinent_doc in enumerate(relevants):
                # Sample a non-relevant document for this query.
                non_pertinent_doc = str(random.choice(docs_ids))
                while non_pertinent_doc in relevants:
                    non_pertinent_doc = str(random.choice(docs_ids))

                pertinent_doc_features = self.featurers_list.get_features(
                    pertinent_doc, query)
                non_pertinent_doc_features = self.featurers_list.get_features(
                    non_pertinent_doc, query)

                # Relevant document: rank 1.
                line1 = '{} qid:{}'.format(1, queryId)
                for f_idx, f in enumerate(pertinent_doc_features):
                    line1 += ' %d:%.7f' % (f_idx, f)
                line1 += '\n'
                file.write(line1)

                # Non-relevant document: ranked below the relevant one.
                rank = i + 2
                line2 = '{} qid:{}'.format(rank, queryId)
                for f_idx, f in enumerate(non_pertinent_doc_features):
                    line2 += ' %d:%.7f' % (f_idx, f)
                line2 += '\n'
                file.write(line2)
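

# Sketch (illustrative standalone helper, not part of the original code): read back
# one line of the 'data/train.dat' file written by write_data above. The assumed
# format is exactly the one produced there: "<rank> qid:<queryId> 0:<f0> 1:<f1> ...".
def parse_train_line(line):
    tokens = line.split()
    rank = int(tokens[0])
    query_id = tokens[1].split(':')[1]
    features = [float(tok.split(':', 1)[1]) for tok in tokens[2:]]
    return rank, query_id, features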
def train_linear_model(self, alpha=10e-1, lambda_=10e-5, t_max=1000, decay=10e-7):
    rel_filename = 'cacm/cacm.rel'
    query_filename = 'cacm/cacm.qry'
    query_parser = QueryParser(rel_filename)
    query_parser.q.initFile(query_filename)
    train_queries, _ = query_parser.split_query_dataset()

    # Cycle over the training queries until roughly t_max iterations.
    iterator = itertools.chain.from_iterable(
        itertools.repeat(train_queries, t_max // len(train_queries)))

    index = self.index
    featurers_count = len(self.featurers_list.featurers)
    docs_ids = list(index.docs.keys())  # list() so random.choice can index it
    theta = np.ones(featurers_count)
    N = len(docs_ids)
    base_model = LanguageModel(index)  # only used by the commented-out sampling below

    def get_f(theta, features):
        return np.dot(theta, features)

    random.seed(SEED)
    main_losses = []
    losses = []
    for i, query in enumerate(iterator):
        # Learning-rate decay.
        alpha *= (1. / (1. + decay * i))

        # Keep document ids as strings so the membership test below matches.
        relevants = [str(d[1]) for d in query.relevants]

        # Sample one non-relevant document for the pairwise update.
        non_pertinent_doc = str(random.choice(docs_ids))
        while non_pertinent_doc in relevants:
            non_pertinent_doc = str(random.choice(docs_ids))
        non_pertinent_docs = [non_pertinent_doc]
        # base_model_preds = [d for d, rank in base_model.getRanking(query.getText())[:20]]
        # non_pertinent_docs = [str(d) for d in base_model_preds if int(d) not in relevants]
        # non_pertinent_docs = random.sample(non_pertinent_docs, 1)

        for non_pertinent_doc in non_pertinent_docs:
            pertinent_doc = str(random.choice(relevants))
            pertinent_doc_features = np.array(
                self.featurers_list.get_features(pertinent_doc, query))
            non_pertinent_doc_features = np.array(
                self.featurers_list.get_features(non_pertinent_doc, query))

            f_pertinent_doc = get_f(theta, pertinent_doc_features)
            f_non_pertinent_doc = get_f(theta, non_pertinent_doc_features)

            # Pairwise hinge loss: the relevant document should outscore the
            # non-relevant one by a margin of 1.
            loss = 1 - f_pertinent_doc + f_non_pertinent_doc
            losses.append(loss > 0)
            if loss > 0:
                theta += alpha * (pertinent_doc_features - non_pertinent_doc_features)
            # L2 regularization shrinkage.
            theta *= (1 - lambda_ * np.linalg.norm(theta, 2))

        if i % 100 == 0:
            print(i)
            print('regul', (1 - lambda_ * np.linalg.norm(theta, 2)))
            print('lr', alpha)
            print(theta)
            t_loss = np.mean(losses)
            print('LOSS = ', t_loss)
            main_losses.append(t_loss)
            losses = []

    print(main_losses)
    plt.plot(list(range(len(main_losses))), main_losses)
    plt.title('Average loss')
    plt.savefig('plot/meta_model_loss_alpha_=0_5_decay_8')
    return theta
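

# Minimal standalone sketch (synthetic data, not part of the original code) of the
# pairwise hinge-loss update used in train_linear_model above: whenever the relevant
# document does not outscore the non-relevant one by a margin of 1, theta moves
# towards the feature difference and is then shrunk (L2 regularization).
def _hinge_update_demo(alpha=0.02, lambda_=0.001, steps=100, n_features=4):
    rng = np.random.RandomState(0)
    theta = np.ones(n_features)
    for _ in range(steps):
        relevant_features = rng.rand(n_features) + 0.5   # synthetic "relevant" doc
        non_relevant_features = rng.rand(n_features)     # synthetic "non-relevant" doc
        loss = 1 - theta.dot(relevant_features) + theta.dot(non_relevant_features)
        if loss > 0:
            theta += alpha * (relevant_features - non_relevant_features)
        theta *= (1 - lambda_ * np.linalg.norm(theta, 2))
    return theta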
index = Index()
featurers_list = FeaturersList(index)
model = LinearMetaModel(index, featurers_list)
theta = model.train_linear_model(alpha=.02, t_max=800, lambda_=.001, decay=10e-8)
# theta = [2.46331609, 8.24653025, 6.11832922, 0.59504227, -2.89611445, 4.59988454, 1.91155859, 1.62453584]

random.seed(0)
rel_filename = 'cacm/cacm.rel'
query_filename = 'cacm/cacm.qry'
query_parser = QueryParser(rel_filename)
query_parser.q.initFile(query_filename)
_, test_queries = query_parser.split_query_dataset()

print('EVAL')
queries = []
for query in test_queries:
    scores = model.getRanking(query, theta)
    queries.append(IRList(query, scores))

scores = EvalIRModel().eval_model(queries)
print(scores)

plt.figure(figsize=(10, 8))
y = scores['PrecisionRecall']['mean']
x = list(range(len(y)))
plt.plot(x, y)
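
# Not part of the original snippet: label and save the averaged precision/recall
# curve (the output path is illustrative).
plt.xlabel('Recall level')
plt.ylabel('Mean precision')
plt.title('Precision/Recall of the linear meta-model on the test queries')
plt.savefig('plot/meta_model_precision_recall.png')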
# Vectoriel(index, SimpleWeighter(index), normalized=True),
# Vectoriel(index, ThirdWeighter(index), normalized=True),
# Vectoriel(index, FourthWeighter(index), normalized=True),
# ]
# models = [Okapi(index, k1=1.80, b=0.65), LanguageModel(index, lissage=.8)]
# model = LanguageModel(index)
# base_model = Okapi(index)
# model1 = PageRank(index, seeds=5, k=2)
models = [HITS(index, seeds=5, k=2), HITS(index, seeds=5, k=5)]

for model in models:
    q = QueryParser(rel_filename)
    q.q.initFile(query_filename)
    queries = []
    while True:
        # for i in range(4):
        query = q.nextQuery()
        if query is None:
            break
        if len(query.relevants) == 0:
            print('No relevant docs')
            continue
        scores = model.getRanking(query.getText())
        queries.append(IRList(query, scores))
    print('___________')
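
    # Sketch (not part of the original loop): evaluate the rankings collected for
    # this model, mirroring how EvalIRModel is used in the other scripts of this
    # project.
    scores = EvalIRModel(['PrecisionRecall', 'AveragePrecision']).eval_model(queries)
    print(scores)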
# diversity_models = [DiversityClustering(WithoutClustering, index, N=100),
#                     DiversityClustering(RandomClustering, index, N=100),
#                     DiversityClustering(KmeanClustering, index, N=100),
#                     DiversityClustering(DBSCANClustering, index, N=100)]
# names = ['Without', 'Random', 'Kmeans', 'DBSCAN']
diversity_models = [DiversityClustering(KmeanClustering, index, N=100)]
names = ['Kmeans']

models_scores = {}
for diversity, name in zip(diversity_models, names):
    print('Evaluation of %s clustering' % (name))
    q = QueryParser(relevants_file)
    q.q.initFile(query_file)
    okapi = Okapi(index, k1=1.80, b=0.65)  # build the base ranker once per run
    predicted_docs = []
    while True:
        # for i in range(3):
        query = q.nextQuery()
        if query is None:
            break
        if len(query.relevants) == 0:
            print('No relevant docs')
            continue
        docs_scores = okapi.getRanking(query.getText())
        ordered_pred = diversity.order_pred(query, docs_scores, cluster_order='relevance')
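
        # Sketch (not part of the original loop), assuming order_pred returns
        # (doc, score) pairs in the same format as getRanking:
        predicted_docs.append(IRList(query, ordered_pred))

    # Sketch continued: score the re-ordered rankings for this clustering strategy
    # with the same evaluation used elsewhere in this project.
    models_scores[name] = EvalIRModel().eval_model(predicted_docs)

print(models_scores)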