예제 #1
0
    def __init__(self):
        """
        Build the instance by running its configuration and setup helpers in a fixed order.

        The `_init_*` helpers are called first, then the logger is created and Python's global `random`
        generator is seeded with `self.random_seed`, then the `_setup_*` helpers are called, and finally
        `_store_config()`. None of these helpers are defined in this excerpt.
        """
        # NOTE(review): presumably these helpers populate configuration attributes on self
        # (`self.random_seed` is read below but never assigned in this method) -- confirm.
        self._init_execution()
        self._init_preprocessing()
        self._init_data()

        self.logger = get_logger(__file__)
        # Seeds the module-level `random` generator, so this affects every user of `random` in the process.
        random.seed(self.random_seed)

        # NOTE(review): the call order looks intentional (later steps likely rely on objects created by
        # earlier ones) -- verify before reordering.
        self._setup_data_manager()
        self._setup_vocab_builder()
        self._setup_preprocessor()
        self._setup_data_loader()
        self._store_config()
예제 #2
0
    def __init__(self):
        """
        Build the instance by running its configuration and setup helpers in a fixed order.

        The `_init_*` helpers are called first, then the logger is created, then the `_setup_*` helpers
        are called, and finally `_store_config()`. None of these helpers are defined in this excerpt.
        """
        # NOTE(review): presumably these helpers populate configuration attributes on self -- confirm
        # against their definitions.
        self._init_execution()
        self._init_preprocessing()
        self._init_distances()
        self._init_binning()
        self._init_data()

        self.logger = get_logger(__file__)
        # NOTE(review): the call order looks intentional (later steps likely rely on objects created by
        # earlier ones) -- verify before reordering.
        self._setup_data_managers()
        self._setup_vocabularies()
        self._setup_vocabulary_transformer()
        self._setup_distances_transformer()

        self._store_config()
예제 #3
0
import glob
import random
from copy import copy, deepcopy
from math import floor, ceil
from typing import List, Tuple

import numpy as np

from code_transformer.preprocessing.pipeline.stage2 import CTStage2MultiLanguageSample
from code_transformer.preprocessing.datamanager.base import DataManager, BufferedDataManager
from code_transformer.preprocessing.nlp.vocab import Vocabulary, WordCounter
from code_transformer.utils.io import save_zipped, load_zipped, save_json, load_json
from code_transformer.utils.log import get_logger

# Module-level logger; this file's path (`__file__`) is what gets passed to the project's `get_logger` helper.
logger = get_logger(__file__)


class CTPreprocessedDataManager(DataManager):
    """
    The main data manager for preprocessed data. It takes care of folder structure, loading and saving dataset slices.
    """
    def __init__(self,
                 data_location: str,
                 language: str,
                 partition="train",
                 shuffle=False,
                 infinite_loading=False,
                 mini_dataset=False,
                 load_single_file=None,
                 sort_by_length=False,
                 chunk_size=None,
예제 #4
0
"""
Utility methods and classes for data loading.
The BufferedDataManager allows preloading of dataset slices in a separate thread.
"""

import collections
from abc import ABC, abstractmethod
from queue import Queue
from threading import Thread, Event

import numpy as np

from code_transformer.utils.log import get_logger
from code_transformer.utils.timing import Timing

# Module-level logger; the module name (`__name__`) is what gets passed to the project's `get_logger` helper.
logger = get_logger(__name__)


class RawDataLoader(ABC):
    """
    Abstract interface for raw data loaders.

    Concrete subclasses have to implement both `read` and `__len__`; the class itself cannot be instantiated.
    """

    @abstractmethod
    def read(self, batch_size=1, shuffle=False):
        """
        Read data from the underlying source. Must be overridden by subclasses.

        :param batch_size: requested batch size (defaults to 1); its exact meaning is up to the subclass
        :param shuffle: shuffle flag (defaults to False); its exact meaning is up to the subclass
        """

    @abstractmethod
    def __len__(self):
        """Size of the loader as reported to `len()`. Must be overridden by subclasses."""


class DataManager(ABC):
    @staticmethod
    def to_batches(generator, batch_size, lazy=False):