Ejemplo n.º 1
0
def build_sequence_matrix(sequences, inverse_vocabulary, format, length_limit,
                          padding_symbol, padding='right',
                          lowercase=True):
    """Encode ``sequences`` into a fixed-width, padded index matrix.

    The tokenizer class registered under ``format`` in
    ``tokenizer_registry`` is instantiated and handed, together with
    ``inverse_vocabulary``, to ``_get_sequence_vector`` for every sequence.

    :param sequences: sized iterable of raw sequences to encode.
    :param inverse_vocabulary: mapping indexed by ``padding_symbol`` to get
        the fill value; presumably symbol -> index (confirm against
        ``_get_sequence_vector``).
    :param format: key used to look the tokenizer up in the registry.
    :param length_limit: number of columns of the returned matrix.
    :param padding_symbol: key of the padding entry in ``inverse_vocabulary``.
    :param padding: ``'right'`` writes each vector at the start of its row;
        any other value writes it at the end of the row.
    :param lowercase: forwarded unchanged to ``_get_sequence_vector``.
    :return: array of shape ``(len(sequences), length_limit)``.
    """
    tokenizer = get_from_registry(format, tokenizer_registry)()
    index_dtype = int_type(len(inverse_vocabulary) - 1)

    encoded = [
        _get_sequence_vector(
            text,
            tokenizer,
            index_dtype,
            inverse_vocabulary,
            lowercase=lowercase
        )
        for text in sequences
    ]
    longest = max((len(vec) for vec in encoded), default=0)

    # Purely informational: the width is always length_limit, even when
    # every encoded sequence is shorter than that.
    if longest < length_limit:
        logging.debug('max length of {0}: {1} < limit: {2}'.format(
            format, longest, length_limit
        ))
    width = length_limit

    shape = (len(sequences), width)
    matrix = np.full(shape, inverse_vocabulary[padding_symbol],
                     dtype=index_dtype)
    for row, vec in enumerate(encoded):
        # Vectors longer than the width are truncated to their first items.
        kept = min(vec.shape[0], width)
        start = 0 if padding == 'right' else width - kept
        matrix[row, start:start + kept] = vec[:kept]
    return matrix
Ejemplo n.º 2
0
 def feature_data(column, metadata):
     """Translate every string of ``column`` into its vocabulary index.

     Values are stripped of surrounding whitespace before the lookup in
     ``metadata['str2idx']``; values missing from that mapping get the
     index stored under ``UNKNOWN_SYMBOL``. The mapped column is cast to
     the type returned by ``int_type(metadata['vocab_size'])``.
     """
     def to_index(raw):
         token = raw.strip()
         lookup = metadata['str2idx']
         if token not in lookup:
             return lookup[UNKNOWN_SYMBOL]
         return lookup[token]

     indices = column.map(to_index)
     return indices.astype(int_type(metadata['vocab_size']))
Ejemplo n.º 3
0
def get_sequence_vector(sequence, format, unit_to_id, lowercase=True):
    """Encode one ``sequence`` via ``_get_sequence_vector``.

    The entry registered under ``format`` in ``format_registry`` is passed
    on as-is (it is not called here), along with a dtype obtained from
    ``int_type(len(unit_to_id) - 1)``.

    :param sequence: the raw sequence to encode.
    :param format: key looked up in ``format_registry``.
    :param unit_to_id: sized mapping forwarded to ``_get_sequence_vector``.
    :param lowercase: forwarded unchanged to ``_get_sequence_vector``.
    :return: whatever ``_get_sequence_vector`` returns.
    """
    splitter = get_from_registry(format, format_registry)
    index_dtype = int_type(len(unit_to_id) - 1)
    return _get_sequence_vector(
        sequence, splitter, index_dtype, unit_to_id, lowercase=lowercase
    )
Ejemplo n.º 4
0
def get_sequence_vector(sequence, tokenizer_type, unit_to_id, lowercase=True):
    """Encode one ``sequence`` via ``_get_sequence_vector``.

    The class registered under ``tokenizer_type`` in ``tokenizer_registry``
    is instantiated without arguments and passed on, along with a dtype
    obtained from ``int_type(len(unit_to_id) - 1)``.

    :param sequence: the raw sequence to encode.
    :param tokenizer_type: key looked up in ``tokenizer_registry``.
    :param unit_to_id: sized mapping forwarded to ``_get_sequence_vector``.
    :param lowercase: forwarded unchanged to ``_get_sequence_vector``.
    :return: whatever ``_get_sequence_vector`` returns.
    """
    tokenizer_cls = get_from_registry(tokenizer_type, tokenizer_registry)
    tokenizer = tokenizer_cls()
    index_dtype = int_type(len(unit_to_id) - 1)
    return _get_sequence_vector(
        sequence, tokenizer, index_dtype, unit_to_id, lowercase=lowercase
    )
Ejemplo n.º 5
0
def build_sequence_matrix(
        sequences,
        inverse_vocabulary,
        tokenizer_type,
        length_limit,
        padding_symbol,
        padding='right',
        unknown_symbol=UNKNOWN_SYMBOL,
        lowercase=True,
        tokenizer_vocab_file=None,
        pretrained_model_name_or_path=None,
        processor=PANDAS,
):
    """Encode ``sequences`` into padded index vectors of fixed length.

    The tokenizer class registered under ``tokenizer_type`` is instantiated
    with ``tokenizer_vocab_file`` and ``pretrained_model_name_or_path``;
    each sequence is then encoded by ``_get_sequence_vector`` and padded or
    truncated to exactly ``length_limit`` entries.

    :param sequences: object exposing ``.map`` (presumably a pandas or
        dask series -- confirm against callers).
    :param inverse_vocabulary: mapping indexed by ``padding_symbol`` to get
        the fill value; also forwarded to ``_get_sequence_vector``.
    :param tokenizer_type: key looked up in ``tokenizer_registry``.
    :param length_limit: length of every returned vector.
    :param padding_symbol: key of the padding entry in ``inverse_vocabulary``.
    :param padding: ``'right'`` places the encoded items at the start of
        the vector; any other value places them at the end.
    :param unknown_symbol: forwarded unchanged to ``_get_sequence_vector``.
    :param lowercase: forwarded unchanged to ``_get_sequence_vector``.
    :param tokenizer_vocab_file: forwarded to the tokenizer constructor.
    :param pretrained_model_name_or_path: forwarded to the tokenizer
        constructor.
    :param processor: backend providing ``compute`` and ``map_objects``.
    :return: the result of ``processor.map_objects`` applied to the encoded
        sequences with the padding function.
    """
    tokenizer = get_from_registry(tokenizer_type, tokenizer_registry)(
        vocab_file=tokenizer_vocab_file,
        pretrained_model_name_or_path=pretrained_model_name_or_path,
    )

    format_dtype = int_type(len(inverse_vocabulary) - 1)

    unit_vectors = sequences.map(lambda sequence: _get_sequence_vector(
        sequence,
        tokenizer,
        tokenizer_type,
        format_dtype,
        inverse_vocabulary,
        lowercase=lowercase,
        unknown_symbol=unknown_symbol
    ))

    max_length = processor.compute(unit_vectors.map(len).max())
    if max_length < length_limit:
        # Log tokenizer_type: this function has no `format` parameter, so
        # the bare name `format` would resolve to the Python builtin.
        logging.debug('max length of {0}: {1} < limit: {2}'.format(
            tokenizer_type, max_length, length_limit
        ))
    # The output width is always length_limit, regardless of the data.
    max_length = length_limit

    def pad(vector):
        # Start from a vector made only of padding, then overwrite the part
        # that holds real items (truncated to max_length if necessary).
        sequence = np.full((max_length,),
                           inverse_vocabulary[padding_symbol],
                           dtype=format_dtype)
        limit = min(vector.shape[0], max_length)
        if padding == 'right':
            sequence[:limit] = vector[:limit]
        else:  # if padding == 'left'
            sequence[max_length - limit:] = vector[:limit]
        return sequence

    padded = processor.map_objects(unit_vectors, pad)
    return padded
Ejemplo n.º 6
0
 def feature_data(column, metadata):
     """Translate every value of ``column`` into its vocabulary index.

     Values found in ``metadata['str2idx']`` map to their stored index;
     all others map to the index stored under ``UNKNOWN_SYMBOL``. The
     result is returned as a numpy array whose dtype is
     ``int_type(metadata['vocab_size'])``.
     """
     def to_index(value):
         lookup = metadata['str2idx']
         if value in lookup:
             return lookup[value]
         return lookup[UNKNOWN_SYMBOL]

     indices = column.map(to_index)
     return np.array(indices, dtype=int_type(metadata['vocab_size']))