Example #1
0
    def get_data(self):
        """Return an iterable of batches (size ``self.batch_size``) from ``self.data_source``.

        Three source kinds are supported:
          * ``str``  -- path to a jsonl file; streamed when ``self.args.lazy_loading``,
                        otherwise fully loaded and shuffled.
          * ``list`` -- in-memory items, shuffled in place.
          * ``dict`` -- a rerank spec with keys ``golden_filename``, ``search_filename``,
                        ``topk`` and ``searched_id_list``; batches of query/doc pairs
                        are built per search item.

        Raises:
            ValueError: if ``self.data_source`` is none of the above types.
        """
        if isinstance(self.data_source, str):
            if self.args.lazy_loading:
                data = get_chunk(read_jsonline_lazy(self.data_source),
                                 self.batch_size)
            else:
                total_data = read_jsonline(self.data_source)
                # One shuffle is already a uniform permutation; the original
                # called random.shuffle three times redundantly.
                random.shuffle(total_data)
                data = get_chunk(total_data, self.batch_size)
        elif isinstance(self.data_source, list):
            # NOTE: shuffles the caller-supplied list in place (original behavior).
            random.shuffle(self.data_source)
            data = get_chunk(self.data_source, self.batch_size)
        elif isinstance(self.data_source, dict):
            # Map description_id -> golden item for query-text lookup.
            golden_filename = self.data_source['golden_filename']
            desc_id2item = {}
            for item in read_jsonline_lazy(golden_filename):
                desc_id2item[item['description_id']] = item
            search_filename = self.data_source['search_filename']
            topk = self.data_source['topk']
            searched_ids = set(self.data_source['searched_id_list'])

            def build_batch(search_item):
                """Build query/doc pair batches for one search item."""
                qd_pairs = []
                desc_id = search_item['description_id']
                if desc_id in searched_ids:
                    # NOTE(review): yields one empty batch (not zero batches) for
                    # already-searched ids -- downstream appears to rely on this;
                    # preserved as-is.
                    return [[]]

                query_text = desc_id2item[desc_id][self.args.query_field]
                if self.args.rerank_model_name == 'pairwise':
                    # Pairwise reranking: every ordered pair of distinct docs
                    # among the top-k candidates.
                    docs = search_item['docs'][:topk]
                    for i, doc_id in enumerate(docs):
                        for p_doc_id in docs[:i] + docs[i + 1:]:
                            raw_item = {
                                'description_id': desc_id,
                                'query': query_text,
                                'first_doc_id': doc_id,
                                'second_doc_id': p_doc_id
                            }
                            qd_pairs.append(raw_item)
                else:
                    # Pointwise reranking: one record per top-k doc.
                    for doc_id in search_item['docs'][:topk]:
                        raw_item = {
                            'description_id': desc_id,
                            'query': query_text,
                            'doc_id': doc_id
                        }
                        qd_pairs.append(raw_item)

                return get_chunk(qd_pairs, self.batch_size)

            # Flatten the per-search-item batch lists into one batch stream.
            data = map(build_batch, read_jsonline_lazy(search_filename))
            data = chain.from_iterable(data)
        else:
            raise ValueError('data type error')
        return data
Example #2
0
 def split(self):
     """Shuffle the source jsonl and write a 90/10 train/test split.

     Reads ``self.src_filename``, shuffles the items, and writes the first
     90% to ``DATA_DIR + 'train.jsonl'`` and the rest to ``DATA_DIR + 'test.jsonl'``.
     """
     items = read_jsonline(self.src_filename)
     # A single shuffle already yields a uniform permutation; the original
     # shuffled three times redundantly.
     random.shuffle(items)
     training_count = int(len(items) * 0.9)
     training_items = items[:training_count]
     test_items = items[training_count:]
     write_jsonline(DATA_DIR + 'train.jsonl', training_items)
     write_jsonline(DATA_DIR + 'test.jsonl', test_items)
Example #3
0
 def __load_data(self, input_data):
     """Normalize *input_data* into a list of records.

     A ``str`` is treated as a path to a ``.json`` or ``.jsonl`` file and
     read accordingly; a ``list`` is deep-copied so the caller's data is
     never mutated.

     Raises:
         ValueError: for a string path with an unsupported extension.
         TypeError: for any input that is neither ``str`` nor ``list``.
     """
     if isinstance(input_data, list):
         return copy.deepcopy(input_data)
     if not isinstance(input_data, str):
         raise TypeError('input data type error. only accept str (path) and  list.')
     if input_data.endswith('.json'):
         return read_json(input_data)
     if input_data.endswith('.jsonl'):
         return read_jsonline(input_data)
     raise ValueError('input file type is not supported, only support .json and .jsonl')
Example #4
0
 def load_data(self, chunk_size):
     """Return batches of *chunk_size* items drawn from ``self.data_source``.

     A ``str`` source is read as a jsonl path (streamed when
     ``self.lazy_loading`` is set, otherwise loaded fully and optionally
     shuffled per ``self.loader.shuffle``); a ``list`` source is iterated
     directly.

     Raises:
         TypeError: if ``self.data_source`` is neither ``str`` nor ``list``.
     """
     if isinstance(self.data_source, str):
         if self.lazy_loading:
             data = read_jsonline_lazy(self.data_source)
         else:
             data = read_jsonline(self.data_source)
             if self.loader.shuffle:
                 # One shuffle already gives a uniform permutation; the
                 # original called random.shuffle three times redundantly.
                 random.shuffle(data)
     elif isinstance(self.data_source, list):
         data = iter(self.data_source)
     else:
         raise TypeError('input filename type is error')
     return get_chunk(data, chunk_size)