Example #1
from random import shuffle
from typing import Tuple

import numpy as np

# Logger, MongoDBConnect and Config are project-internal helpers; their imports are not shown here

def load_ids(
        col_details: Tuple[str, str, str],
        data_split: Tuple[int, int] = (60, 40),
        sort_by: dict = None,
        limit: int = None,
        shuffle_data: bool = False,
        shuffle_steps: int = 1,
        mongodb_filter: dict = None):
    """
    Load MongoDB document ids from a collection and split them into a training and a validation data set
    :param col_details: MongoDB collection details with a tuple of 3 string entries
                        [client name (from config), database name, collection name]
    :param data_split: Tuple of percentage of training and test data e.g. (60, 40) for 60% training and 40% test data
    :param sort_by: MongoDB sort expression. e.g. { created_at: -1 }
    :param limit: maximum number of ids that should be fetched
    :param shuffle_data: determines whether the data set is shuffled before it is split into training and validation data
    :param shuffle_steps: step size for the shuffling (e.g. for time series you want shuffle_steps to be
                          BATCH_SIZE + (TIME_STEPS - 1))
    :param mongodb_filter: filter that is applied to the find query when fetching the ids
    :return: lists of training and validation document ids
    """
    Logger.logger.info("Loading Document IDs from MongoDB")
    mongo_con = MongoDBConnect()
    mongo_con.add_connections_from_config(Config.get_config_parser())
    collection = mongo_con.get_collection(*col_details)

    if sort_by is None:
        sort_by = {"_id": 1}
    if mongodb_filter is None:
        mongodb_filter = {}

    # only the _id field is needed; sort_by is applied as a real sort expression
    db_cursor = collection.find(mongodb_filter, {"_id": 1}).sort(list(sort_by.items()))

    if limit:
        db_cursor.limit(limit)
    tmp_docs = []
    for doc in db_cursor:
        tmp_docs.append(doc["_id"])

    if shuffle_data:
        if shuffle_steps == 1:
            shuffle(tmp_docs)
        else:
            # for the reshape, len(tmp_docs) must be a multiple of shuffle_steps; cut off the ids that do not fit
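            # e.g. with shuffle_steps=3 the ids [1, 2, 3, 4, 5, 6, 7, 8, 9] are reshaped into
            # [[1, 2, 3], [4, 5, 6], [7, 8, 9]]; np.random.shuffle only permutes the rows,
            # so the order within each group of consecutive time steps is preserved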
            overflow = len(tmp_docs) % shuffle_steps
            tmp_docs = tmp_docs[:len(tmp_docs) - overflow]
            x = np.reshape(tmp_docs, (-1, shuffle_steps))
            np.random.shuffle(x)
            tmp_docs = x.flatten().tolist()

    train_range = int((data_split[0] / 100) * len(tmp_docs))
    train_data = tmp_docs[:train_range]
    val_data = tmp_docs[train_range:]
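    # e.g. data_split=(60, 40) with 1000 ids gives 600 training and 400 validation ids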
    Logger.logger.info("Documents loaded (train|validation): {0} | {1}\n\n".format(
        len(train_data), len(val_data)))

    return train_data, val_data
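
A minimal usage sketch for load_ids, assuming the project's Config is already set up so that MongoDBConnect can resolve the client name; the collection details and the limit below are made-up placeholders:

train_ids, val_ids = load_ids(
    col_details=("client_from_config", "my_database", "my_collection"),  # hypothetical collection details
    data_split=(80, 20),             # 80% training, 20% validation
    sort_by={"created_at": -1},      # newest documents first
    limit=10000,
    shuffle_data=True)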
Example #2
    def __init__(self,
                 col_details: List[Tuple[str, str, str]],
                 doc_ids: List[List[Any]],
                 batch_size: int = 32,
                 processors: List[Any] = None,
                 cache: ICache = None,
                 shuffle_data: bool = True,
                 data_group_size: int = 1,
                 continues_data_selection: bool = True,
                 fix_batch_size: bool = False):
        """
        :param col_details: MongoDB collection details with a tuple of 3 string entries
                            [client name (from config), database name, collection name]
        :param doc_ids: List of doc ids which are used to get the specific data from the MongoDB
        :param batch_size: number of samples per batch
        :param processors: List of data processors
        :param cache: Instance of a cache, e.g. RedisCache; if it is None, no caching is used.
                      Only possible if Redis is locally available (has to be installed)
        :param shuffle_data: flag that determines whether the data set is shuffled after each epoch
        :param data_group_size: number of steps that should be grouped, e.g. for time series. The data will still only
                                move forward one time step. E.g. for data_group_size=3:
                                [t-5, t-4, t-3], [t-4, t-3, t-2], [t-3, t-2, t-1], etc.
                                Data will not be shuffled in case data_group_size > 1
        :param fix_batch_size: if True, the batch size is always exactly batch_size, e.g. if batch_size=64 and only 63
                               data samples are left for the final batch, these 63 data points are dropped. Set this
                               to True in case the batch size of your model is fixed.
        """
        super().__init__(batch_size, processors if processors is not None else [])
        self.doc_ids = doc_ids
        self.cache = cache
        self.shuffle_data = shuffle_data
        self.data_group_size = max(data_group_size, 1)
        self.continues_data_selection = continues_data_selection
        # consecutive windows overlap, so a batch needs (data_group_size - 1) extra documents
        self.docs_per_batch = self.batch_size + (self.data_group_size - 1)
        self.col_details = col_details
        self.fix_batch_size = fix_batch_size
        self.collections = None
        self.mongo_con = MongoDBConnect()
        self.mongo_con.add_connections_from_config(Config.get_config_parser())
        # in case data_group_size > 1, each list of doc ids must be a multiple of docs_per_batch,
        # otherwise the reshape during batching will fail; cut off the ids that do not fit
        if self.data_group_size > 1:
            for i in range(len(self.doc_ids)):
                overflow = len(self.doc_ids[i]) % self.docs_per_batch
                if overflow > 0:
                    self.doc_ids[i] = self.doc_ids[i][:-overflow]
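
Only the constructor is shown above, so the class name does not appear in the snippet; the sketch below assumes it belongs to a Keras-style data generator, called MongoDBGenerator here purely for illustration, and feeds it the id lists returned by load_ids from Example #1 (collection details are again placeholders):

col = ("client_from_config", "my_database", "my_collection")  # hypothetical collection details
train_ids, val_ids = load_ids(col, data_split=(80, 20))

train_gen = MongoDBGenerator(      # hypothetical class name, only __init__ is shown above
    col_details=[col],             # one entry per collection
    doc_ids=[train_ids],           # one id list per collection
    batch_size=32,
    data_group_size=3,             # overlapping windows of 3 consecutive time steps
    fix_batch_size=True)           # drop the last incomplete batch

With data_group_size=3, each batch internally needs batch_size + 2 documents, which is why the constructor trims every id list to a multiple of docs_per_batch.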