Example #1
    def __init__(self, name, **kwargs):
        super(CharConvEmbeddings, self).__init__()
        self.vsz = kwargs.get('vsz')
        self.dsz = kwargs.get('dsz')
        self.finetune = kwargs.get('finetune', True)
        weights = kwargs.get('weights')
        if weights is None:
            self.embeddings = nn.Embedding(self.vsz, self.dsz, padding_idx=0)
        else:
            self.embeddings = pytorch_embedding(weights)
        char_filtsz = kwargs.get('cfiltsz', [3])
        if is_sequence(char_filtsz[0]):
            char_hsz = [pair[1] for pair in char_filtsz]
            char_filtsz = [pair[0] for pair in char_filtsz]
        else:
            char_hsz = kwargs.get('wsz', 30)

        activation_type = kwargs.get('activation', 'tanh')
        pdrop = kwargs.get('pdrop', 0.5)
        self.char_comp = ParallelConv(self.dsz, char_hsz, char_filtsz,
                                      activation_type, pdrop)
        wchsz = self.char_comp.outsz
        self.linear = pytorch_linear(wchsz, wchsz)
        gating = kwargs.get('gating', 'skip')
        GatingConnection = SkipConnection if gating == 'skip' else Highway
        num_gates = kwargs.get('num_gates', 1)
        self.gating_seq = nn.Sequential(
            OrderedDict([('gate-{}'.format(i), GatingConnection(wchsz))
                         for i in range(num_gates)]))
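
A minimal sketch of the `cfiltsz` convention handled above: when each entry is a (filter size, hidden size) pair, the pairs are split into parallel lists of filter widths and hidden sizes. The values below are illustrative, not taken from any shipped config.

# Illustrative (filter size, hidden size) pairs
cfiltsz = [(1, 32), (2, 64), (3, 128)]
char_hsz = [pair[1] for pair in cfiltsz]
char_filtsz = [pair[0] for pair in cfiltsz]
assert char_filtsz == [1, 2, 3]
assert char_hsz == [32, 64, 128]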
Example #2
    def batch_input(self, tokens):
        """Convert the input into a consistent format.

        :return: Tuple of (List[List[dict[str] -> str]], mxlen, mxwlen)
        """
        mxlen = 0
        mxwlen = 0
        # Input is a list of strings. (assume strings are tokens)
        if isinstance(tokens[0], six.string_types):
            mxlen = len(tokens)
            tokens_seq = []
            for t in tokens:
                mxwlen = max(mxwlen, len(t))
                tokens_seq.append({'text': t})
            tokens_seq = [tokens_seq]
        else:
            # This had better be a sequence, but it could be pre-batched, [[], []]
            # What kind of object is the first element?
            if is_sequence(tokens[0]):
                tokens_seq = []
                # Then what we have is [['The', 'dog',...], ['I', 'cannot']]
                # [[{'text': 'The', 'pos': 'DT'}, ...

                # For each of the utterances, we need to make a dictionary
                if isinstance(tokens[0][0], six.string_types):
                    for utt in tokens:
                        utt_dict_seq = []
                        mxlen = max(mxlen, len(utt))
                        for t in utt:
                            mxwlen = max(mxwlen, len(t))
                            utt_dict_seq += [dict({'text': t})]
                        tokens_seq += [utt_dict_seq]
                # It's already in dict form, so just compute mxlen and mxwlen
                elif isinstance(tokens[0][0], dict):
                    for utt in tokens:
                        mxlen = max(mxlen, len(utt))
                        for t in utt:
                            mxwlen = max(mxwlen, len(t['text']))
                    tokens_seq = tokens
            # If it's a dict, we just wrap it up
            elif isinstance(tokens[0], dict):
                mxlen = len(tokens)
                for t in tokens:
                    mxwlen = max(mxwlen, len(t))
                tokens_seq = [tokens]
            else:
                raise Exception('Unknown input format')

        if len(tokens_seq) == 0:
            return []
        return tokens_seq, mxlen, mxwlen
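
A hand-worked sketch of the normalized form batch_input produces when the input is a single utterance given as List[str]; the values are derived from the code above, not from running the service.

tokens = ['The', 'dog', 'barks']
tokens_seq = [[{'text': t} for t in tokens]]
mxlen = len(tokens)                    # 3 tokens in the utterance
mxwlen = max(len(t) for t in tokens)   # 5 characters in 'barks'
assert tokens_seq == [[{'text': 'The'}, {'text': 'dog'}, {'text': 'barks'}]]
assert (mxlen, mxwlen) == (3, 5)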
Example #3
def format_output(output) -> Union[Dict, List]:
    """Convert the outputs into a consistent format.

    The outputs are dicts. When a function returns a list, it is converted
    into a dict keyed by the stringified index of each element.

    :param output: The output to convert
    :return the formatted output
    """
    if is_sequence(output):
        result = {}
        for i, out in enumerate(listify(output)):
            result[str(i)] = out
        output = result
    return output
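
A self-contained sketch of the convention format_output implements; is_sequence and listify below are simplified stand-ins for the library helpers, not their real implementations.

def is_sequence(x):
    # Stand-in: treat lists and tuples as sequences
    return isinstance(x, (list, tuple))

def listify(x):
    # Stand-in: wrap non-sequences in a single-element list
    return list(x) if is_sequence(x) else [x]

def format_output(output):
    if is_sequence(output):
        output = {str(i): out for i, out in enumerate(listify(output))}
    return output

assert format_output([0.2, 0.8]) == {'0': 0.2, '1': 0.8}
assert format_output({'label': 'positive'}) == {'label': 'positive'}  # dicts pass through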
Example #4
    def _get_filtsz(self):
        # If the first element is a sequence, cfiltsz is a list of (filtsz, nfeats) pairs
        if is_sequence(self.cfiltsz[0]):
            filtsz = [filter_and_size[0] for filter_and_size in self.cfiltsz]
            nfeats = [filter_and_size[1] for filter_and_size in self.cfiltsz]

        # If we get an nfeat factor, we multiply it by each filter size and threshold at max_feat
        elif self.nfeat_factor:
            max_feat = self.max_feat
            filtsz = self.cfiltsz
            nfeats = [min(self.nfeat_factor * fsz, max_feat) for fsz in filtsz]
        # Otherwise it's just a scalar
        else:
            nfeats = self.wsz
            filtsz = self.cfiltsz
        return filtsz, nfeats
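
A worked example of the nfeat_factor branch above: each filter width is multiplied by the factor and capped at max_feat. The numbers are illustrative.

nfeat_factor, max_feat = 50, 200
cfiltsz = [1, 2, 3, 4, 5]
nfeats = [min(nfeat_factor * fsz, max_feat) for fsz in cfiltsz]
assert nfeats == [50, 100, 150, 200, 200]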
Example #5
    def vectorize(self, tokens_batch):
        """Turn the input into that batch dict for prediction.

        :param tokens_batch: `List[List[str]]`: The input text batch.

        :returns: dict[str] -> np.ndarray: The vectorized batch.
        """
        examples = defaultdict(list)
        keys = self.vectorizers.keys()
        for i, tokens in enumerate(tokens_batch):
            if is_sequence(tokens[0]):
                if len(tokens) != 2:
                    raise Exception(
                        "We currently only accept dual inputs for multi-encoder"
                    )
                keys = []
                for k, vectorizer in self.vectorizers.items():
                    vec0, length0 = vectorizer.run(tokens[0], self.vocabs[k])
                    vec1, length1 = vectorizer.run(tokens[1], self.vocabs[k])
                    # It's paired data
                    key0 = f'{k}[0]'
                    key1 = f'{k}[1]'
                    keys.append(key0)
                    keys.append(key1)
                    examples[key0].append(vec0)
                    examples[key1].append(vec1)
                    if length0 is not None:
                        lengths_key = f'{key0}_lengths'
                        examples[lengths_key].append(length0)
                    if length1 is not None:
                        lengths_key = f'{key1}_lengths'
                        examples[lengths_key].append(length1)

            else:
                for k, vectorizer in self.vectorizers.items():
                    vec, length = vectorizer.run(tokens, self.vocabs[k])
                    examples[k].append(vec)
                    if length is not None:
                        lengths_key = f'{k}_lengths'
                        examples[lengths_key].append(length)

        for k in keys:
            examples[k] = np.stack(examples[k])
            lengths_key = f'{k}_lengths'
            if lengths_key in examples:
                examples[lengths_key] = np.stack(examples[lengths_key])
        return examples
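
A small illustration of the key-naming convention used above for paired inputs; the feature name 'word' is hypothetical.

k = 'word'
key0, key1 = f'{k}[0]', f'{k}[1]'
assert (key0, key1) == ('word[0]', 'word[1]')
assert f'{key0}_lengths' == 'word[0]_lengths'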
Example #6
def wire_inputs(inputs: Dict, results: Dict, chore: 'Chore') -> Dict:
    """Replace the reference inputs with the output files.

    References are assumed to be of the form `^X.Y` or just `X`.
    In the former case, the named sub-field of that chore's entry in the
    results Dict is returned; in the latter, the whole Dict associated
    with that chore is returned.

    :param inputs: A dictionary of inputs
    :param results: A dictionary of upstream results
    :param chore: The chore function that will be called.
    :returns: The substituted input dictionary
    """
    for key, values in inputs.items():
        if is_sequence(values):
            new_vs = []
            for value in values:
                if is_reference(value):
                    new_vs.append(
                        extract_outputs(parse_reference(value), results))
                else:
                    new_vs.append(value)
            inputs[key] = new_vs
        else:
            if is_reference(values):
                inputs[key] = extract_outputs(parse_reference(values), results)
    # Get the signature of the function
    sig = inspect.signature(chore)
    # Bind the args we populated with inputs
    bound = sig.bind_partial(**inputs)
    for param in sig.parameters.values():
        # Look at all params and if they haven't been bound (they are not
        # present in the bound args and therefore were not in inputs) default
        # them to `None` in inputs. If this param has a default value we
        # don't need to add it to inputs
        if param.name not in bound.arguments and param.default is param.empty:
            inputs[param.name] = None
    return inputs
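
A self-contained sketch of the two reference forms the docstring describes; the extract helper below is a simplified stand-in, not the library's parse_reference/extract_outputs, and the results values are made up.

def extract(ref, results):
    # '^train.model' -> ('train', 'model'); 'train' -> ('train', '')
    chore, _, field = ref.lstrip('^').partition('.')
    out = results[chore]
    return out[field] if field else out

results = {'train': {'model': 'model.pt', 'f1': 0.91}}
assert extract('^train.model', results) == 'model.pt'   # chore + sub-field
assert extract('train', results) == results['train']    # whole results dict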
Example #7
    def batch_input(self, tokens):
        """Convert the input into a consistent format.

        :return: List[List[dict[str] -> str]]
        """
        # Input is a list of strings. (assume strings are tokens)
        if isinstance(tokens[0], str):
            tokens_batch = []
            for t in tokens:
                tokens_batch.append({'text': t})
            tokens_batch = [tokens_batch]
        else:
            # This had better be a sequence, but it could be pre-batched, [[], []]
            # What kind of object is the first element?
            if is_sequence(tokens[0]):
                tokens_batch = []
                # Then what we have is [['The', 'dog',...], ['I', 'cannot']]
                # [[{'text': 'The', 'pos': 'DT'}, ...

                # For each of the utterances, we need to make a dictionary
                if isinstance(tokens[0][0], str):
                    for utt in tokens:
                        utt_dict_seq = []
                        for t in utt:
                            utt_dict_seq += [dict({'text': t})]
                        tokens_batch += [utt_dict_seq]
                # It's already in List[List[dict]] form, so do nothing
                elif isinstance(tokens[0][0], dict):
                    tokens_batch = tokens
            # If it's a dict, we just wrap it up
            elif isinstance(tokens[0], dict):
                tokens_batch = [tokens]
            else:
                raise Exception('Unknown input format')

        if len(tokens_batch) == 0:
            return []
        return tokens_batch
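
A hand-worked example of the pre-batched path above: a List[List[str]] input becomes a List[List[dict]] with one {'text': ...} dict per token.

batched = [['The', 'dog'], ['I', 'cannot']]
tokens_batch = [[{'text': t} for t in utt] for utt in batched]
assert tokens_batch[0] == [{'text': 'The'}, {'text': 'dog'}]
assert tokens_batch[1] == [{'text': 'I'}, {'text': 'cannot'}]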
Example #8
    def _create_embeddings(self, embeddings_set, vocabs, features):
        """Creates a set of arbitrary sub-graph, DL-framework-specific embeddings by delegating to wired sub-module.

        As part of this process, we take in an index of embeddings by name, a ``dict`` of ``Counter`` objects (keyed by
        feature name), containing the number of times each token has been seen, and a `features` list which is a
        sub-section of the mead config containing the `embeddings` section for each feature.
        This method's job is to either create a sub-graph from a pretrained model, or to create a new randomly
        initialized sub-graph, taking into account the input vocabulary counters.  The embeddings model has control
        over the actual word indices and the sub-graph for the embeddings, both of which are returned from this
        method.  Any feature selection, such as low-count removal, would be performed via the delegated methods.

        :param embeddings_set: The embeddings index passed to mead driver
        :param vocabs: A set of known ``Counter``s for each vocabulary consisting of a token key and count for each
        :param features: The `features` sub-section of the mead config
        :return: Returns a ``tuple`` comprised of a ``dict`` of (`feature name`, `Embedding`) and an updated vocab
        """


        embeddings_map = {}
        out_vocabs = {}


        for feature in features:
            # Get the block from the features section with key `embeddings`
            embeddings_section = feature['embeddings']

            # The name is at the top level for the feature block of mead config
            name = feature['name']

            # Get the label out of the embeddings section in the features block of mead config
            embed_label = embeddings_section.get('label', embeddings_section.get('labels'))

            # Get the type of embedding out of the embeddings section in the features block of mead config
            embed_type = embeddings_section.get('type', 'default')
            is_stacked = is_sequence(embed_label)
            if is_stacked:
                if embed_type != 'default':
                    logger.warning("You have requested a stack of pretrained embeddings but didn't request the 'default' representation")
            # Backwards compat, copy from main block if not present locally
            embeddings_section['unif'] = embeddings_section.get('unif', self.config_params.get('unif', 0.1))

            # Backwards compat, copy from main block if not present locally
            embeddings_section['keep_unused'] = embeddings_section.get('keep_unused',
                                                                       self.config_params.get('keep_unused', False))

            # Overlay any backend parameters

            # Also, if we are in eager mode, we might have to place the embeddings explicitly on the CPU
            embeddings_section['cpu_placement'] = bool(embeddings_section.get('cpu_placement', False))
            if self.backend.params is not None:
                # If we are in eager mode
                train_block = self.config_params['train']
                optimizer_type = train_block.get('optim', 'sgd')
                # If the optimizer cannot handle embeddings on GPU
                if optimizer_type not in ['sgd', 'adam', 'adamw']:
                    embeddings_section['cpu_placement'] = True
                elif optimizer_type == 'sgd' and float(train_block.get('mom', 0.0)) > 0:
                    embeddings_section['cpu_placement'] = True
                for k, v in self.backend.params.items():
                    embeddings_section[k] = v
            if embed_label is not None:
                # Allow local overrides to uniform initializer

                embed_labels = listify(embed_label)

                embed_files = []
                for embed_label in embed_labels:

                    embeddings_global_config_i = embeddings_set[embed_label]
                    if 'type' in embeddings_global_config_i:
                        embed_type_i = embeddings_global_config_i['type']
                        embed_type = embed_type_i
                        if embed_type_i != 'default' and is_stacked:
                            raise Exception("Stacking embeddings only works for 'default' pretrained word embeddings")

                    embed_file = embeddings_global_config_i.get('file')
                    unzip_file = embeddings_global_config_i.get('unzip', True)
                    embed_dsz = embeddings_global_config_i['dsz']
                    embed_sha1 = embeddings_global_config_i.get('sha1')
                    # Should we grab vocab here too?

                    embed_model = embeddings_global_config_i.get('model', {})
                    if 'dsz' not in embed_model and not is_stacked:
                        embed_model['dsz'] = embed_dsz

                    embeddings_section = {**embed_model, **embeddings_section}
                    try:
                        # We aren't necessarily going to get an `embed_file`. For instance, when using the HuggingFace
                        # models in the Hub addon, the `embed_file` should be downloaded by HuggingFace's library,
                        # not by us.  In that case we want it to be None and we don't want to download it
                        if embed_file:
                            embed_file = EmbeddingDownloader(embed_file, embed_dsz, embed_sha1, self.data_download_cache, unzip_file=unzip_file).download()
                            embed_files.append(embed_file)
                        else:
                            embed_files.append(None)
                    except Exception as e:
                        if is_stacked:
                            raise e
                        logger.warning(f"We were not able to download {embed_file}, passing to the addon")
                        embed_files.append(embed_file)
                # If we have stacked embeddings (which only works with the `default` model), we need to pass the list;
                # if not, grab the first item
                embed_file = embed_files if is_stacked else embed_files[0]
                embedding_bundle = baseline.embeddings.load_embeddings(name,
                                                                       embed_file=embed_file,
                                                                       known_vocab=vocabs.get(name),
                                                                       embed_type=embed_type,
                                                                       data_download_cache=self.data_download_cache,
                                                                       **embeddings_section)

                embeddings_map[name] = embedding_bundle['embeddings']
                out_vocabs[name] = embedding_bundle['vocab']
            else:  # if there is no label given, assume we need random initialization vectors
                dsz = embeddings_section.pop('dsz')
                embedding_bundle = baseline.embeddings.load_embeddings(name,
                                                                       dsz=dsz,
                                                                       known_vocab=vocabs[name],
                                                                       embed_type=embed_type,
                                                                       data_download_cache=self.data_download_cache,
                                                                       **embeddings_section)
                embeddings_map[name] = embedding_bundle['embeddings']
                out_vocabs[name] = embedding_bundle['vocab']

        return embeddings_map, out_vocabs
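
An illustrative sketch of the `features` entries and embeddings index this method consumes; the field names follow the code above, but every value here is made up.

features = [
    {   # backed by a pretrained-embedding label from the embeddings index
        'name': 'word',
        'embeddings': {'label': 'glove-example', 'type': 'default'},
    },
    {   # no label given, so it gets randomly initialized vectors of size dsz
        'name': 'char',
        'embeddings': {'dsz': 30},
    },
]
# The embeddings index passed in as `embeddings_set`, keyed by label
embeddings_set = {
    'glove-example': {'file': 'glove.example.100d.txt', 'dsz': 100},
}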
Example #9
    def batch_input(self, tokens):
        """Convert the input into a consistent format.

        :return: Tuple of (List[List[dict[str] -> str]], mxlen, mxwlen)
        """
        mxlen = 0
        mxwlen = 0
        vmxlen, vmxwlen = self.get_vectorizer_lens()
        # Input is a list of strings. (assume strings are tokens)
        if isinstance(tokens[0], six.string_types):
            if vmxlen != -1:
                tokens = tokens[:vmxlen]
            mxlen = len(tokens)
            tokens_seq = []
            for t in tokens:
                mxwlen = max(mxwlen, len(t))
                if vmxwlen != -1:
                    t = t[:vmxwlen]
                tokens_seq.append({'text': t})
            tokens_seq = [tokens_seq]
        else:
            # This had better be a sequence, but it could be pre-batched, [[], []]
            # What kind of object is the first element?
            if is_sequence(tokens[0]):
                tokens_seq = []
                # Then what we have is [['The', 'dog',...], ['I', 'cannot']]
                # [[{'text': 'The', 'pos': 'DT'}, ...

                # For each of the utterances, we need to make a dictionary
                if isinstance(tokens[0][0], six.string_types):
                    for utt in tokens:
                        utt_dict_seq = []
                        if vmxlen != -1:
                            utt = utt[:vmxlen]
                        mxlen = max(mxlen, len(utt))
                        for t in utt:
                            if vmxwlen != -1:
                                t = t[:vmxwlen]
                            mxwlen = max(mxwlen, len(t))
                            utt_dict_seq += [dict({'text': t})]
                        tokens_seq += [utt_dict_seq]
                # It's already in List[List[dict]] form, so just iterate to get mxlen and mxwlen
                elif isinstance(tokens[0][0], dict):
                    for utt_dict_seq in tokens:
                        if vmxlen != -1:
                            utt_dict_seq = utt_dict_seq[:vmxlen]
                        mxlen = max(mxlen, len(utt_dict_seq))
                        for token_dict in utt_dict_seq:
                            text = token_dict['text']
                            if vmxwlen != -1:
                                text = text[:vmxwlen]
                                token_dict['text'] = text
                            mxwlen = max(mxwlen, len(text))
                        tokens_seq += [utt_dict_seq]
            # If it's a dict, we just wrap it up
            elif isinstance(tokens[0], dict):
                if vmxlen != -1:
                    tokens = tokens[:vmxlen]
                mxlen = len(tokens)
                for t in tokens:
                    text = t['text']
                    if vmxwlen != -1:
                        text = text[:vmxwlen]
                        t['text'] = text
                    mxwlen = max(mxwlen, len(text))
                tokens_seq = [tokens]
            else:
                raise Exception('Unknown input format')

        if len(tokens_seq) == 0:
            return []
        return tokens_seq, mxlen, mxwlen
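
A small illustration of the vectorizer-length truncation applied above; the vmxlen/vmxwlen values are illustrative, and -1 means "no limit".

vmxlen, vmxwlen = 2, 3
tokens = ['antidisestablishment', 'dog', 'barks']
tokens = tokens[:vmxlen] if vmxlen != -1 else tokens
tokens = [t[:vmxwlen] if vmxwlen != -1 else t for t in tokens]
assert tokens == ['ant', 'dog']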
Example #10
def pool_chars(x_char,
               Wch,
               ce0,
               char_dsz,
               nfeat_factor=None,
               cfiltsz=[3],
               max_feat=200,
               gating='skip',
               num_gates=1,
               activation='tanh',
               wsz=30):
    """Take in a tensor of characters (B x maxs x maxw) and do character convolution

    :param x_char: TF tensor for input characters, (B x maxs x maxw)
    :param Wch: A character embeddings matrix
    :param ce0: A control dependency for the embeddings that keeps the <PAD> value 0
    :param char_dsz: The character embedding dsz

    :Keyword Arguments:
    * *cfiltsz* -- (``list``) A list of filter sizes, or a list of (filter size, number of filters) tuples
    * *nfeat_factor* -- (``int``) A factor to be multiplied to filter size to decide number of hidden units
    * *max_feat* -- (``int``) The maximum number of hidden units per filter
    * *gating* -- (``str``) `skip` or `highway` supported, yielding a residual connection or a highway connection, respectively
    * *num_gates* -- (``int``) How many gating functions to apply
    * *activation* -- (``str``) A string name of an activation, (e.g. `tanh`)
    * *wsz* -- (``int``) The number of hidden units used when `cfiltsz` is a plain list and no `nfeat_factor` is given
    :return: The character compositional embedding and the number of hidden units as a tuple

    """
    if is_sequence(cfiltsz[0]):
        filtsz = [filter_and_size[0] for filter_and_size in cfiltsz]
        nfeats = [filter_and_size[1] for filter_and_size in cfiltsz]

    elif nfeat_factor:
        filtsz = cfiltsz
        nfeats = [min(nfeat_factor * fsz, max_feat) for fsz in filtsz]
    else:
        filtsz = cfiltsz
        nfeats = wsz
    mxlen = tf.shape(x_char)[1]

    gating_fn = highway_conns if gating.startswith('highway') else skip_conns

    with tf.variable_scope("Chars2Word"):
        with tf.control_dependencies([ce0]):
            mxwlen = tf.shape(x_char)[-1]
            char_bt_x_w = tf.reshape(x_char, [-1, mxwlen])
            cembed = tf.nn.embedding_lookup(Wch,
                                            char_bt_x_w,
                                            name="embeddings")
            cmot, num_filts = char_word_conv_embeddings(
                cembed,
                filtsz,
                char_dsz,
                nfeats,
                activation_fn=tf_activation(activation),
                gating=gating_fn,
                num_gates=num_gates)
            word_char = tf.reshape(cmot, [-1, mxlen, num_filts])

    return word_char, num_filts
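
A hand-worked trace of the reshapes above, using NumPy stand-ins for the TF tensors; the sizes are illustrative.

import numpy as np

B, maxs, maxw, num_filts = 2, 5, 8, 100
x_char = np.zeros((B, maxs, maxw), dtype=np.int32)   # character ids per word
char_bt_x_w = x_char.reshape(-1, maxw)               # words flattened across the batch
assert char_bt_x_w.shape == (B * maxs, maxw)         # (10, 8)
cmot = np.zeros((B * maxs, num_filts))               # stands in for the conv + max-over-time output
word_char = cmot.reshape(-1, maxs, num_filts)        # back to one row of word vectors per utterance
assert word_char.shape == (B, maxs, num_filts)       # (2, 5, 100)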