def __init__(self, queries, ole_input_dir, ole_result_files,
             cognos_input_dir, cognos_result_files, ole_lr_dir):
    """Collect paired "ole" and "cognos" results for each query that has an ole file.

    Every entry of ``queries`` is indexed with the keys "q" (topic) and "l"
    (location). A file stem is built from the topic with its " OR "
    separators removed and the location with all whitespace removed. The
    entry is kept only when ``<stem>_0.csv`` appears in ``ole_result_files``;
    in that case the entry is stored in ``self.queries_data`` and the parsed
    ole / cognos files are appended to ``self.results["ole"]`` and
    ``self.results["cognos"]`` at the same position.

    ``ole_input_dir`` and ``cognos_input_dir`` are concatenated directly with
    the file name (no separator is inserted). ``cognos_result_files`` is
    accepted but never consulted here. ``ole_lr_dir`` is handed to
    ``get_results_by_ranking_model`` after all queries have been processed.
    """
    self.queries_data = []
    self.results = {"ole": [], "cognos": []}

    for entry in queries:
        topic = "".join(entry["q"].split(" OR "))
        location = "".join(entry["l"].split())

        ole_name = "_".join([topic, location, str(0)]) + ".csv"
        if ole_name not in ole_result_files:
            # No ole output for this query/location pair: skip it entirely.
            continue

        cognos_name = "_".join([topic, location]) + ".csv"
        self.queries_data.append(entry)

        ole_fields = get_fields_from_csv_file(ole_input_dir + ole_name)
        self.results["ole"].append(ole_fields)

        cognos_fields = get_fields_from_csv_file(cognos_input_dir + cognos_name)
        self.results["cognos"].append(cognos_fields)

    # The "ole_lr" results are produced by the ranking model rather than
    # being read directly from files in ole_lr_dir.
    self.get_results_by_ranking_model(ole_lr_dir)
def format_data(self, input_dir, train_percent=70):
    """Load the "ole" result files under ``input_dir`` and write a train/test split.

    For every combination of ``self.queries`` and ``self.locations`` the
    expected file name is obtained from ``get_filename("ole", topic, location)``
    (topic and location with all whitespace removed). When a file with that
    name exists anywhere below ``input_dir``, the pair is recorded in
    ``self.queries_data`` as ``{"q": query, "l": location}`` and the parsed
    file is appended to ``self.results["ole"]`` at the same position.

    Each new topic is also registered in ``Rank.topic_hash`` with the next
    value of ``Rank.topic_ctr``.

    The collected items are shuffled and split: the first
    ``train_percent`` percent (rounded down) become ``self.train_data``, the
    rest ``self.test_data``. Both lists hold the objects returned by
    ``Rank.get_data_obj(query, result, index)`` and are written to
    "train.dat" and "test.dat" through ``write_to_file``.

    Args:
        input_dir: Root directory that is walked recursively for result files.
        train_percent: Percentage (0-100) of the items placed in the
            training set; integer or float.
    """
    self.queries_data = []
    self.results = {"ole": []}

    # Map each bare file name to the first path under which os.walk reports
    # it. This matches the "first occurrence" semantics of a list.index()
    # lookup while avoiding a linear scan per query/location pair.
    path_by_name = {}
    for root, _, file_names in os.walk(input_dir):
        for file_name in file_names:
            path_by_name.setdefault(file_name, os.path.join(root, file_name))

    for query in self.queries:
        tname = "".join(query.split())
        if tname not in Rank.topic_hash:
            Rank.topic_hash[tname] = Rank.topic_ctr
            Rank.topic_ctr += 1
        for location in self.locations:
            lname = "".join(location.split())
            fname = get_filename("ole", tname, lname)
            # TODO: include epsilon data as well.
            path = path_by_name.get(fname)
            if path is None:
                continue
            self.queries_data.append({"q": query, "l": location})
            self.results["ole"].append(get_fields_from_csv_file(path))

    self.data = []

    # shuffle (presumably random.shuffle - confirm against the file's
    # imports) permutes in place, so it needs a real list: a Python 3 range
    # object is immutable and would raise TypeError.
    indexes = list(range(len(self.queries_data)))
    shuffle(indexes)

    # Floor division plus int() keeps the split point a valid slice index on
    # Python 3 and for float percentages; "/" would produce a float there.
    l_train = int(len(indexes) * train_percent // 100)
    train_indexes = indexes[:l_train]
    test_indexes = indexes[l_train:]

    ole_results = self.results["ole"]
    self.train_data = [
        Rank.get_data_obj(self.queries_data[i], ole_results[i], i)
        for i in train_indexes
    ]
    self.test_data = [
        Rank.get_data_obj(self.queries_data[i], ole_results[i], i)
        for i in test_indexes
    ]

    write_to_file("train.dat", self.train_data)
    write_to_file("test.dat", self.test_data)