Example #1
    def test_generator(data_generator):

        for ids, query, docs in test_gen(data_generator):

            docs_ids = []
            docs_array = []
            docs_mask_array = []
            query_array = []
            query_ids = []

            for i in range(len(ids)):

                for doc in docs[i]:
                    # pad the document tokens, using a cache where possible
                    maybe_padding(doc)
                    docs_array.append(doc["tokens"])
                    docs_mask_array.append(doc["sentences_mask"])
                    docs_ids.append(doc["id"])

                # pad the query once and repeat it for every document of this query
                query_tokens = pad_tokens([query[i]], cfg["max_q_terms"])[0]
                query_tokens = [query_tokens] * len(docs[i])
                query_array.append(query_tokens)

                # repeat the query id once per document as well
                query_ids.append([ids[i]] * len(docs[i]))

            # yield one batch: flattened query ids, the model inputs
            # (query tokens, doc tokens, doc sentence masks) and doc ids; labels are None
            yield flat_list(query_ids), [
                np.array(flat_list(query_array)),
                np.array(docs_array),
                np.array(docs_mask_array)
            ], docs_ids, None
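
The snippet above depends on numpy (imported as np) and on project helpers that are not shown: test_gen, maybe_padding, pad_tokens, flat_list and the cfg dictionary. As a minimal sketch of what flat_list and pad_tokens are assumed to do, based only on how they are called here (flatten one level of nesting, and pad or truncate token lists to a fixed length), something like the following would be compatible; the real project may implement them differently.

    import numpy as np

    def flat_list(list_of_lists):
        # flatten one level of nesting: [[a, b], [c]] -> [a, b, c]
        return [item for sub in list_of_lists for item in sub]

    def pad_tokens(token_lists, max_len, pad_id=0):
        # pad (or truncate) every token list to exactly max_len entries
        # pad_id=0 is an assumption; the project may use a different padding value
        return [
            (list(tokens) + [pad_id] * (max_len - len(tokens)))[:max_len]
            for tokens in token_lists
        ]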
Example #2
    def _generate(self, **kwargs):

        for i in range(len(self.folds_query_list)):
            # fold i is held out as the test fold
            test_query = self.folds_query_list[i]
            test_goldstandard_trec_file = self.folds_goldstandard_trec_file[i]
            test_query_docs = self.folds_query_docs[i]

            # the remaining folds are merged into the training data
            train_query = flat_list(self.folds_query_list[:i] +
                                    self.folds_query_list[i + 1:])
            train_goldstandard = merge_dicts(self.folds_goldstandard[:i] +
                                             self.folds_goldstandard[i + 1:])
            train_query_docs = merge_dicts(self.folds_query_docs[:i] +
                                           self.folds_query_docs[i + 1:])

            train_collection = TrainCollection(train_query, train_goldstandard,
                                               train_query_docs)

            test_collection = TestCollection(test_query,
                                             test_goldstandard_trec_file,
                                             test_query_docs,
                                             self.trec_script_eval_path,
                                             train_collection.skipped_queries)

            yield train_collection, test_collection
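
This first _generate implements leave-one-fold-out cross-validation: on iteration i the i-th fold becomes the test collection and every other fold is concatenated into the training collection. Besides flat_list it relies on a merge_dicts helper that is not shown; a minimal version consistent with how it is called here (combining a list of per-fold dicts into one dict) could look like the sketch below, although the project's own implementation may differ.

    def merge_dicts(list_of_dicts):
        # combine a list of dicts into a single dict
        # (duplicate keys, not expected across folds, would be overwritten)
        merged = {}
        for d in list_of_dicts:
            merged.update(d)
        return merged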
    def _generate(self, **kwargs):

        query_ids = []
        queries = []
        query_docs = []
        i = 0

        for query_data in self.query_list:

            if query_data["id"] in self.skipped_queries:
                continue
            if query_data["id"] not in self.query_docs:
                print("[WARNING] -", query_data["id"], "does not have docs, so it will be skipped")
                continue

            while True:  # emulates a do-while loop

                left_space = self.b_size - len(flat_list(query_docs))
                if len(self.query_docs[query_data["id"]][i:]) <= left_space:
                    # all the remaining documents of this query fit in the batch
                    # (<= also covers an exact fit, so the query is not re-added with an empty slice)
                    query_docs.append(self.query_docs[query_data["id"]][i:])
                    i = 0
                else:
                    # the documents do not fit, take only as many as the batch can hold
                    query_docs.append(self.query_docs[query_data["id"]][i:i + left_space])
                    i += left_space

                query_ids.append(query_data["id"])
                queries.append(query_data["query"])

                # output according to the batch size
                if len(flat_list(query_docs)) >= self.b_size:
                    yield query_ids, queries, query_docs
                    # reset the batch buffers
                    query_ids = []
                    queries = []
                    query_docs = []

                if i == 0:
                    # all documents of this query were consumed, move to the next query
                    break

        # note: a final partial batch (fewer than b_size documents) is not yielded
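
The second _generate packs query/document pairs into batches of self.b_size documents: a query whose documents do not fit in the current batch is split, with the remainder carried over to the next batch, and queries that are skipped or have no documents are left out. The consumption sketch below is only illustrative; collection stands for an instance of the surrounding class, which presumably exposes this generator through some public method.

    # illustrative only: `collection` is assumed to be an instance of the class above
    for query_ids, queries, query_docs in collection._generate():
        # query_ids[k], queries[k] and query_docs[k] describe the k-th query slice;
        # the total number of documents in a yielded batch never exceeds b_size
        assert len(query_ids) == len(queries) == len(query_docs)
        assert len(flat_list(query_docs)) <= collection.b_size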