def __init__(self):
    """Run the configuration hooks, seed ``random`` and call the setup hooks.

    The steps execute in a fixed order: the ``_init_*`` hooks first, then
    the logger assignment and the seeding of the ``random`` module, then
    the ``_setup_*`` hooks, and finally ``_store_config``. All hooks are
    defined outside this block. Presumably later steps depend on
    attributes set by earlier ones, so keep the order -- TODO confirm.
    """
    # Configuration hooks.
    self._init_execution()
    self._init_preprocessing()
    self._init_data()

    self.logger = get_logger(__file__)

    # `random_seed` is not assigned in this method; presumably it is
    # provided by one of the hooks above or by the class -- TODO confirm.
    random.seed(self.random_seed)

    # Setup hooks, called only after the seed has been applied.
    self._setup_data_manager()
    self._setup_vocab_builder()
    self._setup_preprocessor()
    self._setup_data_loader()

    self._store_config()
def __init__(self):
    """Run the configuration hooks, create the logger and call the setup hooks.

    The steps execute in a fixed order: the ``_init_*`` hooks first, then
    the logger assignment, then the ``_setup_*`` hooks, and finally
    ``_store_config``. All hooks are defined outside this block.
    Presumably later steps depend on attributes set by earlier ones, so
    keep the order -- TODO confirm.
    """
    # Configuration hooks.
    self._init_execution()
    self._init_preprocessing()
    self._init_distances()
    self._init_binning()
    self._init_data()

    self.logger = get_logger(__file__)

    # Setup hooks.
    self._setup_data_managers()
    self._setup_vocabularies()
    self._setup_vocabulary_transformer()
    self._setup_distances_transformer()

    self._store_config()
import glob import random from copy import copy, deepcopy from math import floor, ceil from typing import List, Tuple import numpy as np from code_transformer.preprocessing.pipeline.stage2 import CTStage2MultiLanguageSample from code_transformer.preprocessing.datamanager.base import DataManager, BufferedDataManager from code_transformer.preprocessing.nlp.vocab import Vocabulary, WordCounter from code_transformer.utils.io import save_zipped, load_zipped, save_json, load_json from code_transformer.utils.log import get_logger logger = get_logger(__file__) class CTPreprocessedDataManager(DataManager): """ The main data manager for preprocessed data. It takes care of folder structure, loading and saving dataset slices. """ def __init__(self, data_location: str, language: str, partition="train", shuffle=False, infinite_loading=False, mini_dataset=False, load_single_file=None, sort_by_length=False, chunk_size=None,
""" Utility methods and classes for data loading. The BufferedDataManager allows preloading of dataset slices in a separate thread. """ import collections from abc import ABC, abstractmethod from queue import Queue from threading import Thread, Event import numpy as np from code_transformer.utils.log import get_logger from code_transformer.utils.timing import Timing logger = get_logger(__name__) class RawDataLoader(ABC): @abstractmethod def read(self, batch_size=1, shuffle=False): pass @abstractmethod def __len__(self): pass class DataManager(ABC): @staticmethod def to_batches(generator, batch_size, lazy=False):