def __init__(self, train, file, data_id=None, data_dict=None, sample_rankings_per_q=1, shuffle=True, hot=False,
             eval_dict=None, buffer=True, given_scaler=None):
    assert data_id is not None or data_dict is not None
    if data_dict is None: data_dict = self.get_default_data_dict(data_id=data_id)

    self.train = train
    # keep the sampling factor on self: it is referenced again when sampling rankings below
    self.sample_rankings_per_q = sample_rankings_per_q

    if data_dict['data_id'] in MSLETOR or data_dict['data_id'] in MSLRWEB \
            or data_dict['data_id'] in YAHOO_LTR or data_dict['data_id'] in YAHOO_LTR_5Fold \
            or data_dict['data_id'] in ISTELLA_LTR \
            or data_dict['data_id'] == 'IRGAN_MQ2008_Semi': # supported datasets
        self.check_load_setting(data_dict, eval_dict)

        perquery_file = get_buffer_file_name(data_id=data_id, file=file, data_dict=data_dict)

        # derive the name of the buffered torch file from the per-query buffer file
        if sample_rankings_per_q > 1:
            if hot:
                torch_perquery_file = perquery_file.replace('.np', '_'.join(['SP', str(sample_rankings_per_q), 'Hot', '.torch']))
            else:
                torch_perquery_file = perquery_file.replace('.np', '_'.join(['SP', str(sample_rankings_per_q), '.torch']))
        else:
            if hot:
                torch_perquery_file = perquery_file.replace('.np', '_Hot.torch')
            else:
                torch_perquery_file = perquery_file.replace('.np', '.torch')

        if eval_dict is not None:
            mask_label, mask_ratio, mask_type = eval_dict['mask_label'], eval_dict['mask_ratio'], eval_dict['mask_type']
            print(eval_dict)
            if mask_label:
                mask_label_str = '_'.join([mask_type, 'Ratio', '{:,g}'.format(mask_ratio)])
                torch_perquery_file = torch_perquery_file.replace('.torch', '_' + mask_label_str + '.torch')
        else:
            mask_label = False

        if os.path.exists(torch_perquery_file):
            print('loading buffered file ...')
            self.list_torch_Qs = pickle_load(torch_perquery_file)
        else:
            self.list_torch_Qs = []

            scale_data = data_dict['scale_data']
            scaler_id = data_dict['scaler_id'] if 'scaler_id' in data_dict else None
            list_Qs = iter_queries(in_file=file, data_dict=data_dict, scale_data=scale_data, scaler_id=scaler_id,
                                   perquery_file=perquery_file, buffer=buffer)

            for ind in range(len(list_Qs)):
                qid, doc_reprs, doc_labels = list_Qs[ind]

                if sample_rankings_per_q > 1:
                    assert mask_label is not True # not supported, since it is rarely used
                    list_ranking = []
                    list_labels = []
                    for _ in range(self.sample_rankings_per_q):
                        des_inds = np_arg_shuffle_ties(doc_labels, descending=True) # sampling by shuffling ties
                        list_ranking.append(doc_reprs[des_inds])
                        list_labels.append(doc_labels[des_inds])

                    batch_rankings = np.stack(list_ranking, axis=0)
                    batch_std_labels = np.stack(list_labels, axis=0)

                    torch_batch_rankings = torch.from_numpy(batch_rankings).type(torch.FloatTensor)
                    torch_batch_std_labels = torch.from_numpy(batch_std_labels).type(torch.FloatTensor)
                else:
                    torch_batch_rankings = torch.from_numpy(doc_reprs).type(torch.FloatTensor)
                    torch_batch_rankings = torch.unsqueeze(torch_batch_rankings, dim=0) # a consistent batch dimension of size 1
                    torch_batch_std_labels = torch.from_numpy(doc_labels).type(torch.FloatTensor)
                    torch_batch_std_labels = torch.unsqueeze(torch_batch_std_labels, dim=0)

                if mask_label: # masking
                    if mask_type == 'rand_mask_rele':
                        torch_batch_rankings, torch_batch_std_labels = random_mask_rele_labels(
                            batch_ranking=torch_batch_rankings, batch_label=torch_batch_std_labels,
                            mask_ratio=mask_ratio, mask_value=0, presort=data_dict['presort'])
                    elif mask_type == 'rand_mask_all':
                        masked_res = random_mask_all_labels(
                            batch_ranking=torch_batch_rankings, batch_label=torch_batch_std_labels,
                            mask_ratio=mask_ratio, mask_value=0, presort=data_dict['presort'])
                        if masked_res is not None:
                            torch_batch_rankings, torch_batch_std_labels = masked_res
                        else: # skip the query if masking yields no valid result
                            continue
                    else:
                        raise NotImplementedError

                if hot:
                    assert mask_label is not True # not supported, since it is rarely used
                    max_rele_level = data_dict['max_rele_level']
                    assert max_rele_level is not None

                    torch_batch_std_hot_labels = get_one_hot_reprs(torch_batch_std_labels)
                    batch_cnts = batch_count(batch_std_labels=torch_batch_std_labels, max_rele_grade=max_rele_level,
                                             descending=True)

                    self.list_torch_Qs.append((qid, torch_batch_rankings, torch_batch_std_labels,
                                               torch_batch_std_hot_labels, batch_cnts))
                else:
                    self.list_torch_Qs.append((qid, torch_batch_rankings, torch_batch_std_labels))
            #print('Num of q:', len(self.list_torch_Qs))

            if buffer: # buffer the converted per-query tensors for fast reloading
                parent_dir = Path(torch_perquery_file).parent
                if not os.path.exists(parent_dir): os.makedirs(parent_dir)
                pickle_save(self.list_torch_Qs, torch_perquery_file)
    else:
        raise NotImplementedError

    self.hot = hot
    self.shuffle = shuffle
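
# A minimal usage sketch of the constructor above. Hedged assumptions: the
# __init__ belongs to PT-Ranking's LTRDataset class, 'MQ2008_Super' is a valid
# MSLETOR data_id, and the file path is a placeholder; the data_dict keys
# simply mirror the keys read by the constructor and by iter_queries.
def _ltr_dataset_usage_sketch():
    data_dict = dict(data_id='MQ2008_Super', has_comment=True, presort=True, min_docs=1, min_rele=1,
                     scale_data=False, scaler_id=None, unknown_as_zero=False, binary_rele=False)
    train_data = LTRDataset(train=True, file='/path/to/MQ2008/Fold1/train.txt', data_id='MQ2008_Super',
                            data_dict=data_dict, sample_rankings_per_q=1, hot=False, buffer=True)
    # without hot labels, each buffered item is (qid, batch_rankings, batch_std_labels),
    # where both tensors carry a leading batch dimension of size 1
    qid, batch_rankings, batch_std_labels = train_data.list_torch_Qs[0]
    print(qid, batch_rankings.size(), batch_std_labels.size())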
def __init__(self, split_type, list_as_file, data_id=None, data_dict=None, fold_dir=None, presort=True, alpha=0.5,
             dictQueryRepresentation=None, dictDocumentRepresentation=None, dictQueryPermutaion=None,
             dictQueryDocumentSubtopics=None, buffer=True, add_noise=False, std_delta=1.0):
    self.presort = presort
    self.add_noise = add_noise
    ''' split-specific settings '''
    self.split_type = split_type
    self.data_id = data_dict['data_id']
    assert presort is True # since it is time-consuming to generate the ideal diversified ranking dynamically

    if data_dict['data_id'] in TREC_DIV: # supported datasets
        torch_buffer_file = fold_dir.replace('folder', 'Bufferedfolder') + split_type.name
        if self.presort:
            torch_buffer_file = '_'.join([torch_buffer_file, 'presort', '{:,g}'.format(alpha)])
        if self.add_noise:
            torch_buffer_file = '_'.join([torch_buffer_file, 'gaussian', '{:,g}'.format(std_delta)])
        torch_buffer_file += '.torch'

        if os.path.exists(torch_buffer_file):
            print('loading buffered file ...')
            self.list_torch_Qs = pickle_load(torch_buffer_file)
        else:
            self.list_torch_Qs = []
            for qid in list_as_file:
                np_q_repr = dictQueryRepresentation[str(qid)] # [1, 100]
                alphaDCG = dictQueryPermutaion[str(qid)]['alphaDCG']
                q_doc_subtopics = dictQueryDocumentSubtopics[str(qid)]
                perm_docs = dictQueryPermutaion[str(qid)]['permutation']
                if self.presort:
                    # print('json-alphaDCG', alphaDCG) # TODO the meaning of json-alphaDCG needs to be confirmed
                    ''' the following comparison shows that the provided permutation of docs is the ideal ranking '''
                    #print('personal-computation for json', alpha_DCG_at_k(sorted_docs=perm_docs, q_doc_subtopics=q_doc_subtopics, k=4, alpha=0.5))
                    perm_docs = get_div_ideal_ranking(pool_docs=perm_docs, q_doc_subtopics=q_doc_subtopics, alpha=alpha)
                    #print('personal-computation for ideal', alpha_DCG_at_k(sorted_docs=perm_docs, q_doc_subtopics=q_doc_subtopics, k=4, alpha=0.5))
                    #print('===')

                list_doc_reprs = []
                for doc in perm_docs:
                    list_doc_reprs.append(dictDocumentRepresentation[doc]) # [1, 100]
                np_doc_reprs = np.vstack(list_doc_reprs) # [permutation_size, 100]

                q_repr = torch.from_numpy(np_q_repr).type(torch.FloatTensor)
                doc_reprs = torch.from_numpy(np_doc_reprs).type(torch.FloatTensor)

                if self.add_noise: # add gaussian noise
                    q_noise = torch.normal(mean=torch.zeros_like(q_repr), std=std_delta)
                    doc_noise = torch.normal(mean=torch.zeros_like(doc_reprs), std=std_delta)
                    q_repr = torch.add(q_repr, q_noise)
                    doc_reprs = torch.add(doc_reprs, doc_noise)

                np_rele_mat = to_matrix(perm_docs=perm_docs, q_doc_subtopics=q_doc_subtopics)
                q_doc_rele_mat = torch.from_numpy(np_rele_mat).type(torch.FloatTensor)

                self.list_torch_Qs.append((qid, q_repr, perm_docs, doc_reprs, alphaDCG, q_doc_subtopics, q_doc_rele_mat))
            #print('Num of q:', len(self.list_torch_Qs))

            if buffer:
                parent_dir = Path(torch_buffer_file).parent
                if not os.path.exists(parent_dir): os.makedirs(parent_dir)
                pickle_save(self.list_torch_Qs, torch_buffer_file)
    else:
        raise NotImplementedError
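
# For illustration of the q_doc_rele_mat built above: a hypothetical stand-in
# for to_matrix (the real helper may differ in orientation and dtype), building
# a binary doc-by-subtopic coverage matrix where entry (i, j) is 1 iff the i-th
# document of perm_docs covers the j-th subtopic.
def to_matrix_sketch(perm_docs, q_doc_subtopics):
    subtopics = sorted({t for ts in q_doc_subtopics.values() for t in ts})
    col = {t: j for j, t in enumerate(subtopics)}
    mat = np.zeros((len(perm_docs), len(subtopics)), dtype=np.float32)
    for i, doc in enumerate(perm_docs):
        for t in q_doc_subtopics.get(doc, []): # docs without subtopic judgments yield all-zero rows
            mat[i, col[t]] = 1.0
    return mat

# e.g., to_matrix_sketch(['d1', 'd2'], {'d1': ['1', '3'], 'd2': ['3']})
# returns [[1., 1.], [0., 1.]]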
def iter_queries(in_file, data_dict=None, scale_data=None, scaler_id=None, perquery_file=None, buffer=True):
    '''
    Transforms the rows of a LETOR-format file into a list of queries, where a query is a unit of all the documents
    and labels associated with the same query id. Each query is represented by a (qid, feature_mat, std_label_vec) tuple.
    :param in_file: the input data file
    :param data_dict: the data-loading settings, e.g., presort, min_docs, min_rele, has_comment
    :param scale_data: perform query-level scaling, say normalization
    :param scaler_id: MinMaxScaler | RobustScaler
    :param perquery_file: the buffer file of per-query objects
    :param buffer: whether to buffer the per-query objects for fast reloading
    :return: a list of (qid, feature_mat, std_label_vec) tuples
    '''
    if perquery_file is not None and os.path.exists(perquery_file): # perquery_file may be None when buffer is False
        return pickle_load(perquery_file)

    if scale_data: scaler = get_scaler(scaler_id=scaler_id)
    presort, min_docs, min_rele = data_dict['presort'], data_dict['min_docs'], data_dict['min_rele']
    unknown_as_zero, binary_rele, has_comment = data_dict['unknown_as_zero'], data_dict['binary_rele'], data_dict['has_comment']

    clip_query = False
    if min_rele is not None and min_rele > 0: clip_query = True
    if min_docs is not None and min_docs > 0: clip_query = True

    list_Qs = []
    with open(in_file, encoding='iso-8859-1') as file_obj:
        dict_data = dict()
        if has_comment:
            all_features_mat, all_labels_vec, qids, docids = parse_letor(file_obj.readlines(), has_comment=True)

            for i in range(len(qids)):
                f_vec = all_features_mat[i, :]
                std_s = all_labels_vec[i]
                qid = qids[i]
                docid = docids[i]
                if qid in dict_data:
                    dict_data[qid].append((std_s, docid, f_vec))
                else:
                    dict_data[qid] = [(std_s, docid, f_vec)]

            del all_features_mat
            # unique qids in their sequential order of appearance
            seen = set()
            seen_add = seen.add
            qids_unique = [x for x in qids if not (x in seen or seen_add(x))]

            for qid in qids_unique:
                tmp = list(zip(*dict_data[qid]))
                list_labels_per_q = tmp[0]
                if data_dict['data_id'] in MSLETOR_LIST:
                    ''' convert the original rank-position into grade-labels '''
                    ranking_size = len(list_labels_per_q)
                    list_labels_per_q = [ranking_size - r for r in list_labels_per_q]

                #list_docids_per_q = tmp[1]
                list_features_per_q = tmp[2]
                feature_mat = np.vstack(list_features_per_q)

                if scale_data:
                    if data_dict['data_id'] in ISTELLA_LTR:
                        # clip the possible extremely large features, e.g., 1.79769313486e+308
                        feature_mat = scaler.fit_transform(np.clip(feature_mat, a_min=None, a_max=ISTELLA_MAX))
                    else:
                        feature_mat = scaler.fit_transform(feature_mat)

                Q = clip_query_data(qid=qid, feature_mat=feature_mat, std_label_vec=np.array(list_labels_per_q),
                                    binary_rele=binary_rele, unknown_as_zero=unknown_as_zero, clip_query=clip_query,
                                    min_docs=min_docs, min_rele=min_rele, presort=presort)
                if Q is not None:
                    list_Qs.append(Q)
        else:
            all_features_mat, all_labels_vec, qids = parse_letor(file_obj.readlines(), has_comment=False)

            for i in range(len(qids)):
                f_vec = all_features_mat[i, :]
                std_s = all_labels_vec[i]
                qid = qids[i]
                if qid in dict_data:
                    dict_data[qid].append((std_s, f_vec))
                else:
                    dict_data[qid] = [(std_s, f_vec)]

            del all_features_mat
            # unique qids in their sequential order of appearance
            seen = set()
            seen_add = seen.add
            qids_unique = [x for x in qids if not (x in seen or seen_add(x))]

            for qid in qids_unique:
                tmp = list(zip(*dict_data[qid]))
                list_labels_per_q = tmp[0]
                if data_dict['data_id'] in MSLETOR_LIST:
                    ''' convert the original rank-position into grade-labels '''
                    ranking_size = len(list_labels_per_q)
                    list_labels_per_q = [ranking_size - r for r in list_labels_per_q]

                list_features_per_q = tmp[1]
                feature_mat = np.vstack(list_features_per_q)

                if scale_data: # guard added: scaler is only defined when scale_data is True
                    if data_dict['data_id'] in ISTELLA_LTR:
                        # clip the possible extremely large features, e.g., 1.79769313486e+308
                        feature_mat = scaler.fit_transform(np.clip(feature_mat, a_min=None, a_max=ISTELLA_MAX))
                    else:
                        feature_mat = scaler.fit_transform(feature_mat)

                Q = clip_query_data(qid=qid, feature_mat=feature_mat, std_label_vec=np.array(list_labels_per_q),
                                    binary_rele=binary_rele, unknown_as_zero=unknown_as_zero, clip_query=clip_query,
                                    min_docs=min_docs, min_rele=min_rele, presort=presort)
                if Q is not None:
                    list_Qs.append(Q)

    if buffer:
        assert perquery_file is not None
        parent_dir = Path(perquery_file).parent
        if not os.path.exists(parent_dir): os.makedirs(parent_dir)
        pickle_save(list_Qs, file=perquery_file)

    return list_Qs
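
# A usage sketch of iter_queries (hypothetical paths; the data_dict keys mirror
# exactly those the function reads, and 'MinMaxScaler' follows the
# MinMaxScaler | RobustScaler choice documented in the docstring).
def _iter_queries_usage_sketch():
    data_dict = dict(data_id='MQ2008_Super', presort=True, min_docs=1, min_rele=1,
                     unknown_as_zero=False, binary_rele=False, has_comment=True)
    list_Qs = iter_queries(in_file='/path/to/MQ2008/Fold1/train.txt', data_dict=data_dict,
                           scale_data=True, scaler_id='MinMaxScaler',
                           perquery_file='/path/to/buffer/train_PerQ.np', buffer=True)
    qid, feature_mat, std_label_vec = list_Qs[0] # [num_docs, num_features], [num_docs]
    print(qid, feature_mat.shape, std_label_vec.shape)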