def get_data(self):
    """Build the batch iterable for ``self.data_source``.

    The handling depends on the type of ``self.data_source``:

    * ``str`` -- passed to ``read_jsonline_lazy`` when
      ``self.args.lazy_loading`` is truthy, otherwise loaded with
      ``read_jsonline`` and shuffled; either way the result is handed to
      ``get_chunk`` together with ``self.batch_size``.
    * ``list`` -- shuffled **in place** and handed to ``get_chunk``.
    * ``dict`` -- must provide ``golden_filename``, ``search_filename``,
      ``topk`` and ``searched_id_list``.  Query/document records are built
      per search result and chunked per search item.

    Returns:
        Whatever ``get_chunk`` returns for the ``str``/``list`` cases, or an
        ``itertools.chain`` over the per-item chunks for the ``dict`` case.

    Raises:
        ValueError: if ``self.data_source`` is not a ``str``, ``list`` or
            ``dict``.
    """
    source = self.data_source

    if isinstance(source, str):
        if self.args.lazy_loading:
            return get_chunk(read_jsonline_lazy(source), self.batch_size)
        records = read_jsonline(source)
        # NOTE(review): a single shuffle is already a uniform permutation;
        # three passes are kept so results under a fixed seed do not change.
        for _ in range(3):
            random.shuffle(records)
        return get_chunk(records, self.batch_size)

    if isinstance(source, list):
        # Shuffles the list held by the instance itself, not a copy.
        for _ in range(3):
            random.shuffle(source)
        return get_chunk(source, self.batch_size)

    if not isinstance(source, dict):
        raise ValueError('data type error')

    # Index the golden records by description id before reading the
    # remaining settings, so the golden file is consumed first.
    desc_id2item = {
        golden['description_id']: golden
        for golden in read_jsonline_lazy(source['golden_filename'])
    }
    search_filename = source['search_filename']
    topk = source['topk']
    searched_ids = set(source['searched_id_list'])

    def build_batch(search_item):
        """Turn one search result into chunks of query/document records."""
        desc_id = search_item['description_id']
        if desc_id in searched_ids:
            # Already-searched ids contribute one empty batch.
            return [[]]

        query_text = desc_id2item[desc_id][self.args.query_field]
        candidates = search_item['docs'][:topk]

        if self.args.rerank_model_name == 'pairwise':
            # Every ordered pair of distinct positions among the top-k docs.
            qd_pairs = [
                {
                    'description_id': desc_id,
                    'query': query_text,
                    'first_doc_id': first_id,
                    'second_doc_id': second_id,
                }
                for pos, first_id in enumerate(candidates)
                for second_id in candidates[:pos] + candidates[pos + 1:]
            ]
        else:
            qd_pairs = [
                {
                    'description_id': desc_id,
                    'query': query_text,
                    'doc_id': doc_id,
                }
                for doc_id in candidates
            ]
        return get_chunk(qd_pairs, self.batch_size)

    per_item_chunks = map(build_batch, read_jsonline_lazy(search_filename))
    return chain.from_iterable(per_item_chunks)
def split(self, *, train_ratio=0.9):
    """Shuffle the records of ``self.src_filename`` and write a train/test split.

    The records are read with ``read_jsonline``, shuffled, and cut at
    ``int(len(items) * train_ratio)``.  The leading part is written to
    ``DATA_DIR + 'train.jsonl'`` and the remainder to
    ``DATA_DIR + 'test.jsonl'``.

    Args:
        train_ratio: Fraction of the records that goes to the training
            file.  Defaults to ``0.9``, the value that used to be
            hard-coded, so existing ``split()`` calls behave as before.

    Raises:
        ValueError: if ``train_ratio`` is outside ``[0, 1]``.
    """
    # Validate before any file is read so a bad ratio has no side effects.
    if not 0 <= train_ratio <= 1:
        raise ValueError(
            'train_ratio must be between 0 and 1, got %r' % (train_ratio,)
        )

    items = read_jsonline(self.src_filename)
    # NOTE(review): one shuffle is already a uniform permutation; three
    # passes are kept so a fixed seed still produces the same split.
    for _ in range(3):
        random.shuffle(items)

    training_count = int(len(items) * train_ratio)
    write_jsonline(DATA_DIR + 'train.jsonl', items[:training_count])
    write_jsonline(DATA_DIR + 'test.jsonl', items[training_count:])
def __load_data(self, input_data):
    """Resolve ``input_data`` into a data source object.

    Args:
        input_data: Either a ``list``, which is deep-copied, or a ``str``
            ending in ``.json`` (handed to ``read_json``) or ``.jsonl``
            (handed to ``read_jsonline``).

    Returns:
        An independent deep copy of the list, or whatever the matching
        reader returns for the given path.

    Raises:
        ValueError: if a string is given that ends in neither ``.json`` nor
            ``.jsonl``.
        TypeError: if ``input_data`` is neither ``str`` nor ``list``.
    """
    if isinstance(input_data, list):
        # The returned object shares nothing with the caller's list.
        return copy.deepcopy(input_data)

    if not isinstance(input_data, str):
        raise TypeError('input data type error. only accept str (path) and list.')

    if input_data.endswith('.json'):
        return read_json(input_data)
    if input_data.endswith('.jsonl'):
        return read_jsonline(input_data)
    raise ValueError('input file type is not supported, only support .json and .jsonl')
def load_data(self, chunk_size):
    """Load ``self.data_source`` and hand it to ``get_chunk``.

    Args:
        chunk_size: Forwarded unchanged as the second argument of
            ``get_chunk``.

    Returns:
        The result of ``get_chunk(records, chunk_size)``.

    Raises:
        TypeError: if ``self.data_source`` is neither ``str`` nor ``list``.
    """
    source = self.data_source
    if not isinstance(source, (str, list)):
        raise TypeError('input filename type is error')

    if isinstance(source, list):
        # In-memory lists are iterated as given; they are never shuffled here.
        records = iter(source)
    elif self.lazy_loading:
        records = read_jsonline_lazy(source)
    else:
        records = read_jsonline(source)
        # Shuffling applies only to this eagerly loaded path.
        if self.loader.shuffle:
            # NOTE(review): one shuffle is already a uniform permutation;
            # three passes are kept so seeded runs stay reproducible.
            for _ in range(3):
                random.shuffle(records)

    return get_chunk(records, chunk_size)