def build_vocab(texts: List[str]):
    """Build a token vocabulary from whitespace-tokenised texts.

    Each text has newlines stripped, is split on single spaces, and the
    resulting tokens are added to a fresh TokenDictionary, which is returned.
    """
    vocab = TokenDictionary()
    for line in texts:
        cleaned = line.replace('\n', '')
        vocab.add_items(cleaned.split(' '))
    return vocab
def from_csv(cls, file_path: str):
    """Alternate constructor: build token and character vocabularies from a CSV.

    Reads only the 'text' column of *file_path*, splits each row on single
    spaces, and accumulates:
      - seq_vocab: vocabulary of whitespace tokens
      - char_vocab: vocabulary of individual characters
      - max_seq_len: longest sequence length in tokens
      - max_token_len: longest token length in characters

    Returns cls(data_df, max_seq_len, max_token_len, seq_vocab, char_vocab).
    """
    data_df = pd.read_csv(file_path, usecols=['text'])
    seq_vocab = TokenDictionary()
    char_vocab = TokenDictionary()
    max_seq_len = 0
    max_token_len = 0
    for _, row in data_df.iterrows():
        tokens = row['text'].split(' ')
        # dict.fromkeys dedups while preserving first-occurrence order.
        # The original list(set(...)) iterated a set of characters, whose
        # order varies across runs (str hash randomisation), so the saved
        # char vocabulary indices were not reproducible.
        chars = list(dict.fromkeys(row['text']))
        max_seq_len = max(max_seq_len, len(tokens))
        max_token_len = max(max_token_len, max(len(token) for token in tokens))
        seq_vocab.add_items(tokens)
        char_vocab.add_items(chars)
    return cls(data_df, max_seq_len, max_token_len, seq_vocab, char_vocab)
def from_csv(cls, file_path: str):
    """Alternate constructor: build the token vocab and label dict from a CSV.

    Reads the 'text' and 'label' columns of *file_path*; tracks the longest
    token sequence, builds a TokenDictionary from the space-split texts and a
    Dictionary from the space-split labels.

    Returns cls(data_df, max_seq_len, seq_vocab, label_dict).
    """
    # BUG FIX: the original passed usecols=['text'], which drops the 'label'
    # column and makes the row['label'] access below raise a KeyError.
    data_df = pd.read_csv(file_path, usecols=['text', 'label'])
    seq_vocab = TokenDictionary()
    label_dict = Dictionary()
    max_seq_len = 0
    for _, row in data_df.iterrows():
        tokens = row['text'].split(' ')
        max_seq_len = max(max_seq_len, len(tokens))
        seq_vocab.add_items(tokens)
        labels = row['label'].split(' ')
        label_dict.add_items(labels)
    return cls(data_df, max_seq_len, seq_vocab, label_dict)
# NOTE(review): truncated script fragment — it begins mid-call (the orphan
# `clip=5, stop_thresh=0.01)`) and ends mid-call (`LabelDataset(train_df, seq_vocab,`),
# so it cannot be safely restructured from this view. What is visible: selects
# CUDA when available, loads train/val/test splits from .txt, re-saves them as
# CSV, then loads the token vocab and intent label dict from disk if present,
# otherwise builds them from the data and saves them tab-delimited.
# NOTE(review): a sibling fragment uses `args.*_file` argument names where this
# one uses `args.*_name` — presumably two versions of the same script; confirm
# which argument names the argparse definition actually declares.
clip=5, stop_thresh=0.01) device = torch.device( "cuda") if torch.cuda.is_available() else torch.device("cpu") # Load data from file .txt train_df = load_from_txt(args.data_path + '/' + args.train_name) val_df = load_from_txt(args.data_path + '/' + args.val_name) test_df = load_from_txt(args.data_path + '/' + args.test_name) train_df.to_csv(args.data_path + '/train.csv', index=False) val_df.to_csv(args.data_path + '/val.csv', index=False) test_df.to_csv(args.data_path + '/test.csv', index=False) # Load vocab and label from file, if not exist create from data. if os.path.exists(args.data_path + '/' + args.vocab_name) is True: seq_vocab = TokenDictionary.load(args.data_path + '/' + args.vocab_name) else: seq_vocab = build_vocab(train_df['text'].tolist()) seq_vocab.save(args.data_path + '/' + args.vocab_name, delimiter='\t') if os.path.exists(args.data_path + '/' + args.intent_name) is True: label_dict = Dictionary.load(args.data_path + '/' + args.intent_name) else: label_dict = get_label_dict( pd.concat([train_df, test_df, val_df])['label'].tolist()) label_dict.save(args.data_path + '/' + args.intent_name, delimiter='\t') # Create train, val, test dataset. train_dataset = LabelDataset(train_df, seq_vocab,
# NOTE(review): truncated script fragment — begins mid-call (the orphan
# `clip=5, stop_thresh=0.01)`) and ends mid-statement (a dangling `else:`),
# so it cannot be safely restructured from this view. What is visible: selects
# CUDA when available, converts .txt splits to CSV, then loads-or-builds the
# token vocab, intent label dict, and (truncated) slot tag dict, saving each
# tab-delimited. Uses `args.*_file` argument names, unlike the sibling
# fragment's `args.*_name` — verify against the argparse definition.
clip=5, stop_thresh=0.01) device = torch.device( "cuda") if torch.cuda.is_available() else torch.device("cpu") # Load data from file .txt train_df = load_from_txt(args.data_path + '/' + args.train_file) val_df = load_from_txt(args.data_path + '/' + args.val_file) test_df = load_from_txt(args.data_path + '/' + args.test_file) train_df.to_csv(args.data_path + '/train.csv', index=False) val_df.to_csv(args.data_path + '/val.csv', index=False) test_df.to_csv(args.data_path + '/test.csv', index=False) # Load vocab and label from file, if not exist create from data. if os.path.exists(args.data_path + '/' + args.vocab_file) is True: seq_vocab = TokenDictionary.load(args.data_path + '/' + args.vocab_file) else: seq_vocab = build_vocab(train_df['text'].tolist()) seq_vocab.save(args.data_path + '/' + args.vocab_file, delimiter='\t') if os.path.exists(args.data_path + '/' + args.intent_file) is True: label_dict = Dictionary.load(args.data_path + '/' + args.intent_file) else: label_dict = get_label_dict( pd.concat([train_df, test_df, val_df])['label'].tolist()) label_dict.save(args.data_path + '/' + args.intent_file, delimiter='\t') if os.path.exists(args.data_path + '/' + args.slot_file) is True: tag_dict = TokenDictionary.load(args.data_path + '/' + args.slot_file) else:
def get_tag_dict(bio_tags: List[str]):
    """Build a dictionary of BIO tags with no special tokens.

    The dictionary is created without unk/bos/eos entries; each tag string
    has newlines removed and is split on single spaces before insertion.
    """
    tag_dict = TokenDictionary(unk_token=None, bos_token=None, eos_token=None)
    for raw in bio_tags:
        cleaned = raw.replace('\n', '')
        tag_dict.add_items(cleaned.split(' '))
    return tag_dict