def transform_data(text, word_tokenizer, char_tokenizer):
    """Encode one raw text into word ids, padded char ids, and original tokens.

    Args:
        text (str): raw input text; passed through ``clean_text`` first.
        word_tokenizer: fitted tokenizer with ``texts_to_sequences``.
        char_tokenizer: fitted tokenizer with ``texts_to_sequences``.

    Returns:
        tuple: (word id sequence, sequence length, padded char id matrix,
        per-token char lengths, recovered original tokens).

    Note:
        Assumes ``clean_text`` replaces numbers with the placeholder '0' and
        newlines with '|', returning the extracted numbers alongside the
        cleaned text — TODO confirm against ``clean_text``'s implementation.
    """
    cleaned, numbers = clean_text(text)
    tokens = cleaned.split()
    number_iter = iter(numbers)

    # Rebuild the human-readable tokens: '0' placeholders are restored from
    # the extracted numbers (spaces stripped), '|' markers become newlines.
    origin_words = []
    for token in tokens:
        if token == '0':
            origin_words.append(next(number_iter).replace(' ', ''))
        elif token == '|':
            origin_words.append('\n')
        else:
            origin_words.append(token)

    seq_len = len(tokens)
    char_ids = char_tokenizer.texts_to_sequences(tokens)
    word_ids = word_tokenizer.texts_to_sequences([cleaned])
    char_lens = [len(seq) for seq in char_ids]
    # Pad every per-token char sequence to the longest one in this text.
    char_ids = pad_sequences(char_ids, max(char_lens))
    return word_ids[0], seq_len, char_ids, char_lens, origin_words
def transform_data(data, word_tokenizer, char_tokenizer):
    """Convert one tagged document into parallel word/char/label sequences.

    Args:
        data (dict): an object with tags attribute that is an object
            containing (startIdx, value) as (key, value) pair in which value
            is an object with 3 key: 'type', 'end', 'prev'.
        word_tokenizer: fitted tokenizer with ``texts_to_sequences``.
        char_tokenizer: fitted tokenizer with ``texts_to_sequences``.

    Returns:
        arrays: list of words and corresponding tags.
    """
    # Every sequence is framed by the sentinel token "`", labelled 'normal'.
    words = [str(word_tokenizer.texts_to_sequences(["`"])[0][0])]
    labels = [CLASSES[TAGS["normal"]]]
    characters = [str(char_tokenizer.texts_to_sequences(["`"])[0][0])]
    char_length = ['1']

    content = data['content']
    tags = data['tags']
    # Walk the tagged spans in document order (keys are string offsets).
    for start in sorted(int(key) for key in tags):
        tag = tags[str(start)]
        tag_type = tag['type']
        segment, _ = clean_text(content[start:tag['end']])

        words.extend(
            str(word_id)
            for word_id in word_tokenizer.texts_to_sequences([segment])[0])

        segment_tokens = segment.split()
        for token in segment_tokens:
            stripped = token.strip()
            char_ids = char_tokenizer.texts_to_sequences([stripped])[0]
            characters.append(
                CHARACTER_SEPARATOR.join(str(c) for c in char_ids))
            char_length.append(str(len(stripped)))

        if tag_type == 'normal':
            labels.extend(CLASSES[TAGS[tag_type]] for _ in segment_tokens)
        else:
            # BIO scheme: first token of an entity span gets the B- tag,
            # the remaining tokens get the I- tag.
            for position in range(len(segment_tokens)):
                pattern = B_TOKEN if position == 0 else I_TOKEN
                labels.append(CLASSES[pattern.format(TAGS[tag_type])])

    # Closing sentinel, mirroring the opening one.
    words.append(str(word_tokenizer.texts_to_sequences(["`"])[0][0]))
    characters.append(str(char_tokenizer.texts_to_sequences(["`"])[0][0]))
    labels.append(CLASSES[TAGS["normal"]])
    char_length.append("1")
    return words, characters, char_length, labels
if __name__ == '__main__':
    import json
    from data_utils.clean_text import clean_text
    from data_utils.constants import ALL_TEXTS
    import os
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--input', type=str, default='./data/train')
    args = parser.parse_args()

    # Collect every annotation file in the input directory.
    data = [
        filename for filename in os.listdir(args.input)
        if filename.endswith('.json')
    ]
    with open(os.path.join(os.curdir, ALL_TEXTS), 'w') as file:
        for filename in data:
            with open(os.path.join(args.input, filename), 'r') as in_file:
                in_data = json.load(in_file)
            # One cleaned document per output line.
            file.write('\n'.join(
                clean_text(x['content'])[0] for x in in_data))
            # Bug fix: without this separator the last document of one file
            # and the first document of the next were glued onto one line.
            file.write('\n')
if __name__ == '__main__':
    import json
    from data_utils.clean_text import clean_text
    # NOTE(review): this variant imports from top-level `constants`, while the
    # sibling variants use `data_utils.constants` — confirm which module
    # actually defines ALL_TEXTS for this script's import path.
    from constants import ALL_TEXTS
    import os
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--input', type=str, default='./data/train')
    args = parser.parse_args()

    # Collect every annotation file in the input directory.
    data = [
        filename for filename in os.listdir(args.input)
        if filename.endswith('.json')
    ]
    with open(os.path.join(os.curdir, ALL_TEXTS), 'w') as file:
        for filename in data:
            with open(os.path.join(args.input, filename), 'r') as in_file:
                in_data = json.load(in_file)
            # One cleaned document per output line.
            file.write('\n'.join(
                clean_text(x['content'])[0] for x in in_data))
            # Bug fix: without this separator the last document of one file
            # and the first document of the next were glued onto one line.
            file.write('\n')
if __name__ == '__main__':
    import json
    from data_utils.clean_text import clean_text
    from data_utils.constants import ALL_TEXTS
    import os
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--input', type=str, default='./data/train')
    args = parser.parse_args()

    # Collect every annotation file in the input directory.
    data = [
        filename for filename in os.listdir(args.input)
        if filename.endswith('.json')
    ]
    with open(os.path.join(os.curdir, ALL_TEXTS), 'w') as file:
        for filename in data:
            with open(os.path.join(args.input, filename), 'r') as in_file:
                in_data = json.load(in_file)
            # One cleaned document per output line, framed by the "`"
            # sentinel used by the transform step.
            file.write('\n'.join(
                '` {} `'.format(clean_text(x['content'])[0])
                for x in in_data))
            # Bug fix: without this separator the last document of one file
            # and the first document of the next were glued onto one line.
            file.write('\n')