Example 1
def build_vocab(texts: List[str]):
    """Build a token vocabulary from whitespace-delimited texts.

    Each text has newline characters removed, is split on single
    spaces, and the resulting tokens are added to a TokenDictionary.

    Args:
        texts: Raw text strings, one sequence per entry.

    Returns:
        A TokenDictionary populated with every token seen.
    """
    vocab = TokenDictionary()
    for raw_text in texts:
        cleaned = raw_text.replace('\n', '')
        vocab.add_items(cleaned.split(' '))
    return vocab
    def from_csv(cls, file_path: str):
        """Alternate constructor: build a dataset from a CSV file.

        Reads the 'text' column, builds a token-level and a
        character-level vocabulary, and records the longest sequence
        (counted in tokens) and the longest token (counted in chars).

        Args:
            file_path: Path to a CSV file with a 'text' column.

        Returns:
            A new instance wrapping the frame, the two length maxima,
            and the two vocabularies.
        """
        data_df = pd.read_csv(file_path, usecols=['text'])

        seq_vocab = TokenDictionary()
        char_vocab = TokenDictionary()

        max_seq_len = 0
        max_token_len = 0
        for _, row in data_df.iterrows():
            text = row['text']
            tokens = text.split(' ')
            # Unique characters appearing in this row's text.
            chars = list(set(text))

            if len(tokens) > max_seq_len:
                max_seq_len = len(tokens)
            longest_token = max(len(tok) for tok in tokens)
            if longest_token > max_token_len:
                max_token_len = longest_token

            seq_vocab.add_items(tokens)
            char_vocab.add_items(chars)

        return cls(
            data_df,
            max_seq_len,
            max_token_len,
            seq_vocab,
            char_vocab,
        )
    def from_csv(cls, file_path: str):
        """Alternate constructor: load a labeled dataset from a CSV file.

        Builds a token vocabulary from the 'text' column and a label
        dictionary from the space-separated 'label' column, recording
        the maximum sequence length in tokens.

        Args:
            file_path: Path to a CSV file with 'text' and 'label' columns.

        Returns:
            A new instance wrapping the frame, the max sequence length,
            the token vocabulary, and the label dictionary.
        """
        # Bug fix: the loop below reads row['label'], so 'label' must be
        # loaded too — usecols=['text'] alone raises KeyError at runtime.
        data_df = pd.read_csv(file_path, usecols=['text', 'label'])

        seq_vocab = TokenDictionary()
        label_dict = Dictionary()

        max_seq_len = 0
        for i, row in data_df.iterrows():
            tokens = row['text'].split(' ')
            max_seq_len = max(max_seq_len, len(tokens))

            seq_vocab.add_items(tokens)
            labels = row['label'].split(' ')
            label_dict.add_items(labels)

        return cls(data_df, max_seq_len, seq_vocab, label_dict)
        clip=5,
        stop_thresh=0.01)
    device = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Load data from file .txt
    train_df = load_from_txt(args.data_path + '/' + args.train_name)
    val_df = load_from_txt(args.data_path + '/' + args.val_name)
    test_df = load_from_txt(args.data_path + '/' + args.test_name)

    train_df.to_csv(args.data_path + '/train.csv', index=False)
    val_df.to_csv(args.data_path + '/val.csv', index=False)
    test_df.to_csv(args.data_path + '/test.csv', index=False)
    # Load vocab and label from file, if not exist create from data.
    if os.path.exists(args.data_path + '/' + args.vocab_name) is True:
        seq_vocab = TokenDictionary.load(args.data_path + '/' +
                                         args.vocab_name)
    else:
        seq_vocab = build_vocab(train_df['text'].tolist())
        seq_vocab.save(args.data_path + '/' + args.vocab_name, delimiter='\t')

    if os.path.exists(args.data_path + '/' + args.intent_name) is True:
        label_dict = Dictionary.load(args.data_path + '/' + args.intent_name)
    else:
        label_dict = get_label_dict(
            pd.concat([train_df, test_df, val_df])['label'].tolist())
        label_dict.save(args.data_path + '/' + args.intent_name,
                        delimiter='\t')

    # Create train, val, test dataset.
    train_dataset = LabelDataset(train_df,
                                 seq_vocab,
        clip=5,
        stop_thresh=0.01)
    device = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Load data from file .txt
    train_df = load_from_txt(args.data_path + '/' + args.train_file)
    val_df = load_from_txt(args.data_path + '/' + args.val_file)
    test_df = load_from_txt(args.data_path + '/' + args.test_file)

    train_df.to_csv(args.data_path + '/train.csv', index=False)
    val_df.to_csv(args.data_path + '/val.csv', index=False)
    test_df.to_csv(args.data_path + '/test.csv', index=False)
    # Load vocab and label from file, if not exist create from data.
    if os.path.exists(args.data_path + '/' + args.vocab_file) is True:
        seq_vocab = TokenDictionary.load(args.data_path + '/' +
                                         args.vocab_file)
    else:
        seq_vocab = build_vocab(train_df['text'].tolist())
        seq_vocab.save(args.data_path + '/' + args.vocab_file, delimiter='\t')

    if os.path.exists(args.data_path + '/' + args.intent_file) is True:
        label_dict = Dictionary.load(args.data_path + '/' + args.intent_file)
    else:
        label_dict = get_label_dict(
            pd.concat([train_df, test_df, val_df])['label'].tolist())
        label_dict.save(args.data_path + '/' + args.intent_file,
                        delimiter='\t')

    if os.path.exists(args.data_path + '/' + args.slot_file) is True:
        tag_dict = TokenDictionary.load(args.data_path + '/' + args.slot_file)
    else:
Example 6
def get_tag_dict(bio_tags: List[str]):
    """Build a dictionary of BIO tags with no special tokens.

    Each entry has newline characters stripped before being split on
    single spaces; the pieces are added to a TokenDictionary created
    without unk/bos/eos tokens.

    Args:
        bio_tags: Tag strings, one space-separated tag sequence each.

    Returns:
        A TokenDictionary populated with every tag seen.
    """
    tag_dictionary = TokenDictionary(unk_token=None, bos_token=None, eos_token=None)
    for tag_line in bio_tags:
        cleaned = tag_line.replace('\n', '')
        tag_dictionary.add_items(cleaned.split(' '))
    return tag_dictionary