Example #1
    def __init__(self,
                 dataset_name='',
                 traindata=None,
                 testdata=None,
                 batch_size=8,
                 sequence_length=64,
                 mode='tuple',
                 **kwargs):
        self.__name__ = dataset_name
        self.sequence_length = sequence_length
        self.uuid = uuid.uuid4().node
        if mode in ['tuple', 'dict']:
            self.mode = mode
        else:
            raise ValueError("Valid mode should be tuple or dict ")

        self.traindata = traindata
        self.testdata = testdata
        self.annotations = {}

        self.scenario = 'train'
        self._class_names = {}

        self._batch_size = batch_size
        # Pick the default language for localized class names: prefer an exact
        # locale match, otherwise fall back to a language-prefix match. Here
        # 'ctx' is the module-level trident context (context._context()).
        self.__default_language__ = 'en-us'
        if len(self._class_names) > 0:
            if ctx.locale in self._class_names:
                self.__default_language__ = ctx.locale
            for k in self._class_names.keys():
                if ctx.locale.split('-')[0] in k:
                    self.__default_language__ = k
                    break

        # Propagate the configured sequence length to every text dataset held
        # by the train/test iterators.
        if isinstance(self.traindata, Iterator):
            for ds in self.traindata.get_datasets():
                if isinstance(ds, TextSequenceDataset):
                    ds.sequence_length = self.sequence_length
        if isinstance(self.testdata, Iterator):
            for ds in self.testdata.get_datasets():
                if isinstance(ds, TextSequenceDataset):
                    ds.sequence_length = self.sequence_length

        self._idx2lab = {}
        self._lab2idx = {}

        self.tot_minibatch = 0
        self.tot_records = 0
        self.tot_epochs = 0
        self._text_transform_funcs = []
        self._label_transform_funcs = []
        self._paired_transform_funcs = []
        self._batch_transform_funcs = []
        # Register this provider with the global trident context so that other
        # components (e.g. losses with auto_balance) can find it. The local
        # name 'cxt' avoids shadowing the module-level 'ctx' read above.
        cxt = context._context()
        cxt.regist_data_provider(self)
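
The locale-matching block in this constructor falls back from an exact locale match to a language-prefix match when localized class names exist. A minimal, self-contained sketch of that same logic (the function name and sample inputs are illustrative, not part of trident):

def resolve_default_language(locale_str, class_names):
    # Mirrors the __default_language__ resolution above: prefer an exact
    # locale match, otherwise fall back to the first key that shares the
    # language prefix (the part before the '-').
    default_language = 'en-us'
    if len(class_names) > 0:
        if locale_str in class_names:
            default_language = locale_str
        for k in class_names.keys():
            if locale_str.split('-')[0] in k:
                default_language = k
                break
    return default_language

print(resolve_default_language('zh-tw', {'zh-cn': {}, 'en-us': {}}))  # -> 'zh-cn'

Note that constructing the provider also registers it with the global trident context (regist_data_provider), which is how losses such as the one in Example #2 later locate the training labels.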
Example #2
    def __init__(self, axis=-1, sample_weight=None, auto_balance=False, from_logits=False, ignore_index=-100, cutoff=None, label_smooth=False, reduction='mean', enable_ohem=False,
                 ohem_ratio=3.5, name=None, **kwargs):
        """

        Args:
            axis (int): the position where the classes is.
            sample_weight (Tensor): means the weights of  classes , it shoud be a 1D tensor and length the same as
            number of classes.
            from_logits (bool): whether the output tensor is normalized as a probability (total equal to 1)
            ignore_index (int or list of int):
            cutoff (None or decimal): the cutoff point of probability for classification, should be None of a number
            less than 1..
            is_target_onehot (bool): Is the target tensor in onehot format?
            label_smooth (bool): Should use label smoothing?
            reduction (string): the method to aggrgate loss. None means no need to aggregate, 'mean' means average loss,
                'sum' means the summation of losses,'batch_mean' means average loss cross the batch axis then
                summation them.

        Attributes:
            need_target_onehot (bool): If True, means the before loss calculation , need to transform target as one-hot format, ex. label-smooth, default is False.
            is_multiselection (bool): If True, means the classification model is multi-selection, so cannot use  any softmax process, use sigmoid and binary_crosss_entropy
            insteaded.
            is_target_onehot (bool):  If True, means we have confirmed (not just declare) the target is transformed as  one-hot format
            reduction(str): The aggregation function for loss, available options are 'sum', 'mean 'and 'batch_mean', default is 'mean'
            axis (None or int): The axis we according with for loss calculation. Default is 1.
            from_logits (bool):If True, means  the sum of all probability will equal 1.
            is_logsoftmax (bool):If True, means model  use SoftMax as last layer or use any equivalent calculation.
            sample_weight(1D tensor):The loss weight for all classes.
            ignore_index(int , list, tuple): The classes we want to ignore in the loss calculation.
            cutoff(float): Means the decision boundary in this classification model, default=0.5.
            num_classes(int):number of  all the classes.
            label_smooth (bool):If True, mean we will apply label-smoothing in loss calculation.

        """
        super(_ClassificationLoss, self).__init__(reduction=reduction, sample_weight=sample_weight, axis=axis, enable_ohem=enable_ohem, ohem_ratio=ohem_ratio, name=name)
        self._set_name_scope()
        self.need_target_onehot = True
        self.is_multiselection = False
        self.is_target_onehot = False
        self.from_logits = from_logits
        self.is_logsoftmax = False
        self.ignore_index = ignore_index
        self.ignore_index_weight = None
        self.auto_balance = auto_balance
        if self.auto_balance:
            self.label_statistics = None
            ctx = context._context()
            if hasattr(ctx._thread_local_info, 'data_providers') and len(ctx._thread_local_info.data_providers) > 0:
                dp = list(ctx._thread_local_info.data_providers.values())[0]
                if dp.traindata.label.__class__.__name__ == 'LabelDataset':
                    # Build inverse-frequency class weights from the training
                    # labels: the most frequent class gets weight 1.0, rarer
                    # classes get proportionally larger weights.
                    unique, counts = np.unique(np.array(dp.traindata.label.items), return_counts=True)
                    reweights = np.clip(counts, 1, np.inf) / np.sum(counts).astype(np.float32)
                    reweights1 = np.max(reweights) / reweights
                    self.label_statistics = reweights1
        if cutoff is not None and not 0 < cutoff < 1:
            raise ValueError('cutoff should be between 0 and 1')
        self.cutoff = cutoff
        self.num_classes = None
        self.label_smooth = label_smooth
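
The auto_balance branch above turns the label counts of the first registered data provider into inverse-frequency class weights. A self-contained numeric sketch of the same computation on toy labels (the data provider lookup is omitted):

import numpy as np

labels = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 2])  # class counts: 6, 3, 1
unique, counts = np.unique(labels, return_counts=True)
# Relative frequency per class, then normalize so the most frequent class
# gets weight 1.0 and rarer classes get larger weights.
reweights = np.clip(counts, 1, np.inf) / np.sum(counts).astype(np.float32)  # [0.6, 0.3, 0.1]
reweights1 = np.max(reweights) / reweights                                  # [1.0, 2.0, 6.0]
print(dict(zip(unique.tolist(), reweights1.tolist())))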
Example #3
import copy
import datetime
import locale
import os
from tqdm import tqdm
from collections import *
from typing import Optional, List, Tuple

from trident.backend.common import *
from trident.backend.pytorch_ops import *
from trident.backend.pytorch_backend import to_tensor, get_device, load, fix_layer, set_device
from trident.data.utils import download_model_from_google_drive, download_file_from_google_drive
from trident.layers.pytorch_layers import *
from trident import context

ctx = context._context()

__all__ = ['Word2Vec', 'ChineseWord2Vec']

_trident_dir = get_trident_dir()
dirname = os.path.join(_trident_dir, 'models')
if not os.path.exists(dirname):
    try:
        os.makedirs(dirname)
    except OSError:
        # Swallow permission-denied errors and potential race conditions
        # in multi-threaded environments.
        pass

# Prepare the folder for the vocabulary download path.
download_path = os.path.join(_trident_dir, 'download', 'vocabs_tw.txt')
make_dir_if_need(download_path)
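
For reference, the same directory preparation can be expressed with the standard library alone; a minimal sketch, assuming make_dir_if_need creates the parent folder of a file-like path and that the trident directory lives under the user's home (both assumptions for illustration):

import os

trident_dir = os.path.expanduser(os.path.join('~', '.trident'))  # assumed location
download_path = os.path.join(trident_dir, 'download', 'vocabs_tw.txt')
# exist_ok=True makes the call idempotent, covering the race condition the
# try/except OSError pattern above guards against; a genuine permission
# error would still raise here.
os.makedirs(os.path.dirname(download_path), exist_ok=True)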