def build_sequence_matrix(sequences, inverse_vocabulary, format, length_limit, padding_symbol, padding='right', lowercase=True):
    """Tokenize each sequence and pack the resulting id vectors into one matrix.

    Args:
        sequences: iterable of raw sequences to tokenize.
        inverse_vocabulary: mapping from token unit to integer id; its size
            determines the integer dtype of the output matrix.
        format: key into ``tokenizer_registry`` selecting the tokenizer.
        length_limit: minimum number of columns in the output matrix; rows
            shorter than this are padded up to it.
        padding_symbol: key in ``inverse_vocabulary`` whose id fills padding cells.
        padding: 'right' (default) pads/truncates at the end; anything else
            right-aligns tokens (left padding).
        lowercase: forwarded to the tokenization helper.

    Returns:
        A ``(len(sequences), max_length)`` integer numpy matrix of token ids.
    """
    tokenizer = get_from_registry(format, tokenizer_registry)()
    format_dtype = int_type(len(inverse_vocabulary) - 1)

    unit_vectors = [
        _get_sequence_vector(
            sequence,
            tokenizer,
            format_dtype,
            inverse_vocabulary,
            lowercase=lowercase
        )
        for sequence in sequences
    ]
    # Longest tokenized row; 0 when there are no sequences at all.
    max_length = max((len(vector) for vector in unit_vectors), default=0)

    if max_length < length_limit:
        logging.debug('max length of {0}: {1} < limit: {2}'.format(
            format, max_length, length_limit
        ))
        max_length = length_limit

    sequence_matrix = np.full(
        (len(sequences), max_length),
        inverse_vocabulary[padding_symbol],
        dtype=format_dtype
    )
    for row, vector in enumerate(unit_vectors):
        fill = min(vector.shape[0], max_length)
        if padding == 'right':
            sequence_matrix[row, :fill] = vector[:fill]
        else:
            # left padding: right-align the (possibly truncated) tokens
            sequence_matrix[row, max_length - fill:] = vector[:fill]
    return sequence_matrix
def feature_data(column, metadata):
    """Convert a column of category strings to their integer ids.

    Each value is stripped of surrounding whitespace and looked up in
    ``metadata['str2idx']``; values not in the vocabulary map to the id of
    ``UNKNOWN_SYMBOL``. The result is cast to the smallest integer dtype that
    can hold ``metadata['vocab_size']``.

    Args:
        column: mappable column of raw string values (e.g. a pandas Series —
            assumed; TODO confirm against callers).
        metadata: dict with at least 'str2idx' (str -> int) and 'vocab_size'.

    Returns:
        The column mapped to integer ids, with integer dtype.
    """
    str2idx = metadata['str2idx']

    def _to_idx(raw):
        # Strip once (the original evaluated ``raw.strip()`` twice per value).
        key = raw.strip()
        if key in str2idx:
            return str2idx[key]
        # Looked up lazily so a vocabulary without UNKNOWN_SYMBOL only fails
        # when an out-of-vocabulary value is actually encountered.
        return str2idx[UNKNOWN_SYMBOL]

    return column.map(_to_idx).astype(int_type(metadata['vocab_size']))
def get_sequence_vector(sequence, format, unit_to_id, lowercase=True):
    """Tokenize a single sequence into a vector of unit ids.

    Args:
        sequence: the raw sequence to convert.
        format: key into ``format_registry`` selecting the tokenization function.
        unit_to_id: mapping from token unit to integer id; its size determines
            the integer dtype of the returned vector.
        lowercase: forwarded to the tokenization helper.

    Returns:
        Whatever ``_get_sequence_vector`` produces for this sequence.
    """
    tokenize = get_from_registry(format, format_registry)
    dtype = int_type(len(unit_to_id) - 1)
    return _get_sequence_vector(
        sequence, tokenize, dtype, unit_to_id, lowercase=lowercase
    )
def get_sequence_vector(sequence, tokenizer_type, unit_to_id, lowercase=True):
    """Tokenize a single sequence into a vector of unit ids.

    Args:
        sequence: the raw sequence to convert.
        tokenizer_type: key into ``tokenizer_registry``; the registered class
            is instantiated with no arguments.
        unit_to_id: mapping from token unit to integer id; its size determines
            the integer dtype of the returned vector.
        lowercase: forwarded to the tokenization helper.

    Returns:
        Whatever ``_get_sequence_vector`` produces for this sequence.
    """
    tok = get_from_registry(tokenizer_type, tokenizer_registry)()
    dtype = int_type(len(unit_to_id) - 1)
    return _get_sequence_vector(
        sequence, tok, dtype, unit_to_id, lowercase=lowercase
    )
def build_sequence_matrix(
        sequences,
        inverse_vocabulary,
        tokenizer_type,
        length_limit,
        padding_symbol,
        padding='right',
        unknown_symbol=UNKNOWN_SYMBOL,
        lowercase=True,
        tokenizer_vocab_file=None,
        pretrained_model_name_or_path=None,
        processor=PANDAS,
):
    """Tokenize each sequence and pad every id vector to a common length.

    Args:
        sequences: mappable collection (e.g. a pandas Series — assumed; TODO
            confirm) of raw sequences.
        inverse_vocabulary: mapping from token unit to integer id; its size
            determines the integer dtype of the padded vectors.
        tokenizer_type: key into ``tokenizer_registry``; the registered class is
            instantiated with ``vocab_file`` and ``pretrained_model_name_or_path``.
        length_limit: minimum padded length; shorter maxima are raised to it.
        padding_symbol: key in ``inverse_vocabulary`` whose id fills padding cells.
        padding: 'right' (default) pads/truncates at the end; anything else
            right-aligns tokens (left padding).
        unknown_symbol: forwarded to the tokenization helper for OOV units.
        lowercase: forwarded to the tokenization helper.
        tokenizer_vocab_file: optional vocab file for the tokenizer.
        pretrained_model_name_or_path: optional pretrained-tokenizer reference.
        processor: backend abstraction used to compute the max length and map
            the padding function over the vectors.

    Returns:
        The result of ``processor.map_objects``: one fixed-length numpy vector
        of token ids per input sequence.
    """
    tokenizer = get_from_registry(tokenizer_type, tokenizer_registry)(
        vocab_file=tokenizer_vocab_file,
        pretrained_model_name_or_path=pretrained_model_name_or_path,
    )
    format_dtype = int_type(len(inverse_vocabulary) - 1)

    unit_vectors = sequences.map(lambda sequence: _get_sequence_vector(
        sequence,
        tokenizer,
        tokenizer_type,
        format_dtype,
        inverse_vocabulary,
        lowercase=lowercase,
        unknown_symbol=unknown_symbol
    ))

    max_length = processor.compute(unit_vectors.map(len).max())
    if max_length < length_limit:
        # BUG FIX: this previously logged ``format`` — which, after the
        # parameter was renamed to ``tokenizer_type``, resolved to the builtin
        # ``format`` function and printed "<built-in function format>".
        logging.debug('max length of {0}: {1} < limit: {2}'.format(
            tokenizer_type, max_length, length_limit
        ))
        max_length = length_limit

    def pad(vector):
        # Pad (or truncate) one id vector to exactly max_length entries.
        sequence = np.full(
            (max_length,),
            inverse_vocabulary[padding_symbol],
            dtype=format_dtype
        )
        limit = min(vector.shape[0], max_length)
        if padding == 'right':
            sequence[:limit] = vector[:limit]
        else:
            # left padding: right-align the (possibly truncated) tokens
            sequence[max_length - limit:] = vector[:limit]
        return sequence

    return processor.map_objects(unit_vectors, pad)
def feature_data(column, metadata):
    """Convert a column of category values to a numpy array of integer ids.

    Values present in ``metadata['str2idx']`` map to their id; any other value
    maps to the id of ``UNKNOWN_SYMBOL``. The array dtype is the smallest
    integer type that can hold ``metadata['vocab_size']``.

    Args:
        column: mappable column of raw values (e.g. a pandas Series — assumed;
            TODO confirm against callers).
        metadata: dict with at least 'str2idx' (str -> int) and 'vocab_size'.

    Returns:
        A 1-D numpy integer array of ids, one per input value.
    """
    str2idx = metadata['str2idx']
    # Substitute UNKNOWN_SYMBOL as the lookup key for out-of-vocabulary
    # values; the fallback id is only resolved when actually needed.
    ids = column.map(lambda value: str2idx[value if value in str2idx else UNKNOWN_SYMBOL])
    return np.array(ids, dtype=int_type(metadata['vocab_size']))