Ejemplo n.º 1
0
 def __init__(self, queries, ole_input_dir, ole_result_files,
              cognos_input_dir, cognos_result_files, ole_lr_dir):
   """Load the OLE and Cognos result files that belong to each query.

   For every query a base name is built from its "q" value (with each
   " OR " separator removed) and its "l" value (with all whitespace
   removed), joined by an underscore. A query is kept only when
   ``<base>_0.csv`` is listed in ``ole_result_files``; the matching Cognos
   file is then read from ``<base>.csv``.

   Args:
     queries: iterable of mappings providing at least the keys "q" and "l".
     ole_input_dir: prefix prepended verbatim to the OLE file name (no path
       separator is inserted here).
     ole_result_files: collection of OLE file names used for the membership
       test.
     cognos_input_dir: prefix prepended verbatim to the Cognos file name.
     cognos_result_files: accepted but not read by this method.
     ole_lr_dir: forwarded unchanged to ``get_results_by_ranking_model``.
   """
   self.queries_data = []
   self.results = {"ole": [], "cognos": []}
   for query in queries:
     topic_part = query["q"].replace(" OR ", "")
     location_part = "".join(query["l"].split())
     base_name = topic_part + "_" + location_part
     ole_name = base_name + "_0.csv"
     if ole_name not in ole_result_files:
       # No OLE result for this query: skip it so that queries_data and
       # both result lists stay index-aligned.
       continue
     cognos_name = base_name + ".csv"
     self.queries_data.append(query)
     ole_fields = get_fields_from_csv_file(ole_input_dir + ole_name)
     self.results["ole"].append(ole_fields)
     cognos_fields = get_fields_from_csv_file(cognos_input_dir + cognos_name)
     self.results["cognos"].append(cognos_fields)
   # The "ole_lr" results are not loaded in the loop above; ole_lr_dir is
   # handed to the ranking-model step instead.
   self.get_results_by_ranking_model(ole_lr_dir)
Ejemplo n.º 2
0
 def format_data(self, input_dir, train_percent=70):
   """Collect OLE result files and write a shuffled train/test split.

   Walks ``input_dir`` recursively. For every (query, location) pair drawn
   from ``self.queries`` and ``self.locations`` whose file name, as produced
   by ``get_filename("ole", ...)``, occurs somewhere in that tree, the pair
   is appended to ``self.queries_data`` and the parsed file to
   ``self.results["ole"]``. Each whitespace-stripped query name not yet in
   ``Rank.topic_hash`` is assigned the current ``Rank.topic_ctr``, which is
   then incremented.

   The collected entries are shuffled and split; the resulting data objects
   are stored in ``self.train_data`` / ``self.test_data`` and written to
   "train.dat" and "test.dat" via ``write_to_file``.

   Args:
     input_dir: root directory searched for result files.
     train_percent: percentage of the entries that goes to the training
       set; the resulting count is rounded down and clamped to the number
       of available entries.
   """
   self.queries_data = []
   self.results = {"ole": []}

   # Map every file name to the first path it was found under. This keeps
   # the "first match wins" behaviour of a list scan while making each
   # lookup O(1).
   path_by_name = {}
   for root, _, names in os.walk(input_dir):
     for name in names:
       path_by_name.setdefault(name, os.path.join(root, name))

   for query in self.queries:
     tname = "".join(query.split())
     if tname not in Rank.topic_hash:
       Rank.topic_hash[tname] = Rank.topic_ctr
       Rank.topic_ctr += 1
     for location in self.locations:
       lname = "".join(location.split())
       fname = get_filename("ole", tname, lname)
       # TODO: include epsilon data also later
       path = path_by_name.get(fname)
       if path is None:
         continue
       self.queries_data.append({"q": query, "l": location})
       self.results["ole"].append(get_fields_from_csv_file(path))

   self.data = []
   # shuffle() needs a mutable sequence; a bare range() is immutable on
   # Python 3, so materialise it first.
   indexes = list(range(len(self.queries_data)))
   shuffle(indexes)
   total = len(indexes)
   # Floor division plus int() keeps the slice bound an integer on Python 3
   # and for float percentages; clamping avoids negative-index slices.
   n_train = int(total * train_percent // 100)
   n_train = max(0, min(total, n_train))

   def build(selected):
     # Turn the chosen indexes into data objects, preserving their order.
     return [Rank.get_data_obj(self.queries_data[k], self.results["ole"][k], k)
             for k in selected]

   self.train_data = build(indexes[:n_train])
   self.test_data = build(indexes[n_train:])
   write_to_file("train.dat", self.train_data)
   write_to_file("test.dat", self.test_data)