Example #1
    def build(self):
        print('Build Vocabulary from ', self.path)

        tokenize = BuildVocab.tokenize_text
        TEXT = Field(sequential=True,
                     tokenize=tokenize,
                     lower=True,
                     include_lengths=True,
                     batch_first=True,
                     fix_length=35,
                     use_vocab=True)
        datafields = [('eid', None), ('idxP', None), ('idxC', None),
                      ('MaxDegree', None), ('MaxL', None), ('text', TEXT)]

        data = TabularDataset(path=self.path,
                              format='tsv',
                              skip_header=False,
                              fields=datafields)
        TEXT.build_vocab(data,
                         vectors=GloVe(name='6B', dim=300),
                         max_size=1000)

        #train_iter = BucketIterator(train_data, batch_size=32, sort_key=lambda x: len(x.text), repeat=False, shuffle=True)
        self.stoi = TEXT.vocab.stoi
        self.vectors = TEXT.vocab.vectors
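
A minimal consumption sketch (not part of the original example): the stoi mapping and the GloVe-initialized vectors gathered above typically seed an embedding layer. The toy corpus below is a hypothetical stand-in for the TSV text column.

from torchtext.legacy.data import Field
from torchtext.vocab import GloVe
import torch.nn as nn

TEXT = Field(sequential=True, lower=True, batch_first=True)
corpus = [['rumours', 'spread', 'fast'], ['fast', 'denial']]  # hypothetical tokenized texts
TEXT.build_vocab(corpus, vectors=GloVe(name='6B', dim=300), max_size=1000)

pad_idx = TEXT.vocab.stoi['<pad>']  # stoi maps token -> integer id
embedding = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze=False)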
Example #2
    def vocab_builder(self):
        #self.eid_field = Field(sequential=False,tokenize)

        print('Build Vocabulary')
        tokenize = BiGraphTextDataset.tokenize_text
        TEXT = Field(sequential=True,
                     tokenize=tokenize,
                     lower=True,
                     include_lengths=True,
                     batch_first=True,
                     fix_length=35,
                     use_vocab=True)

        datafields = [('eid', None), ('idxP', None), ('idxC', None),
                      ('MaxDegree', None), ('MaxL', None), ('text', TEXT)]
        path = '/data1/home2/AgainstRumor/data/Pheme/data.text.txt'
        train_data = TabularDataset(path=path,
                                    format='tsv',
                                    skip_header=False,
                                    fields=datafields)
        TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))

        #train_iter = BucketIterator(train_data, batch_size=32, sort_key=lambda x: len(x.text), repeat=False, shuffle=True)
        self.stoi_dict = TEXT.vocab.stoi
        self.vocab_vectors = TEXT.vocab.vectors
Example #3
def buildDataSets():
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Model parameters
    MAX_SEQ_LEN = 16
    PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

    # Fields

    label_field = Field(sequential=False,
                        use_vocab=False,
                        batch_first=True,
                        dtype=torch.int8)
    text_field = Field(use_vocab=False,
                       tokenize=tokenizer.encode,
                       lower=False,
                       include_lengths=False,
                       batch_first=True,
                       fix_length=MAX_SEQ_LEN,
                       pad_token=PAD_INDEX,
                       unk_token=UNK_INDEX)

    fields = {'label': ('label', label_field), 'text': ('text', text_field)}

    # TabularDataset

    train, valid, test = TabularDataset.splits(path='memesData/data',
                                               train='train.jsonl',
                                               validation='dev_unseen.jsonl',
                                               test='dev_seen.jsonl',
                                               format='JSON',
                                               fields=fields)

    # Iterators

    train_iter = BucketIterator(train,
                                batch_size=8,
                                sort_key=lambda x: len(x.text),
                                train=True,
                                sort=True,
                                sort_within_batch=True)
    valid_iter = BucketIterator(valid,
                                batch_size=8,
                                sort_key=lambda x: len(x.text),
                                train=True,
                                sort=True,
                                sort_within_batch=True)
    test_iter = Iterator(test,
                         batch_size=8,
                         train=False,
                         shuffle=False,
                         sort=False)
    return train_iter, valid_iter, test_iter
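
A consumption sketch for the iterators returned above (an assumption, not from the original): because the text Field tokenizes with tokenizer.encode and sets use_vocab=False, each batch already holds BERT token ids and can be fed straight into a Hugging Face classifier.

import torch
from transformers import BertForSequenceClassification

train_iter, valid_iter, test_iter = buildDataSets()
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

for batch in train_iter:
    input_ids = batch.text       # (batch, MAX_SEQ_LEN) tensor of token ids
    labels = batch.label.long()  # the Field stores labels as int8; the loss expects int64
    loss = model(input_ids, labels=labels).loss
    break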
Example #4
def tokenizer_from(src_lang: str, tgt_lang: str):
    src_lang = _load_lang(src_lang)
    tgt_lang = _load_lang(tgt_lang)

    info('Building tokenizer')
    src_tok = build_tokenizer(src_lang)
    tgt_tok = build_tokenizer(tgt_lang)

    src = Field(tokenize=src_tok)
    tgt = Field(tokenize=tgt_tok)

    return src, tgt
Example #5
class DataLoader:
    source: Field = None
    target: Field = None

    def __init__(self, ext, tokenize_en, tokenize_de, sos_token, eos_token):
        self.ext = ext
        self.tokenize_en = tokenize_en
        self.tokenize_de = tokenize_de
        self.sos_token = sos_token
        self.eos_token = eos_token
        print('data initializing start')

    # generate field
    def make_dataset(self):
        if self.ext == ('.de', '.en'):
            self.source = Field(tokenize=self.tokenize_de,
                                init_token=self.sos_token,
                                eos_token=self.eos_token,
                                lower=True,
                                batch_first=True)
            self.target = Field(tokenize=self.tokenize_en,
                                init_token=self.sos_token,
                                eos_token=self.eos_token,
                                lower=True,
                                batch_first=True)
        elif self.ext == ('.en', '.de'):
            self.source = Field(tokenize=self.tokenize_en,
                                init_token=self.sos_token,
                                eos_token=self.eos_token,
                                lower=True,
                                batch_first=True)
            self.target = Field(tokenize=self.tokenize_de,
                                init_token=self.sos_token,
                                eos_token=self.eos_token,
                                lower=True,
                                batch_first=True)

        train_data, valid_data, test_data = Multi30k.splits(
            exts=self.ext, fields=(self.source, self.target))
        return train_data, valid_data, test_data

    # build the vocabulary & integer mapping
    def build_vocab(self, train_data, min_freq):
        # min_freq: minimum number of occurrences for a word to enter the vocab
        self.source.build_vocab(train_data, min_freq=min_freq)
        self.target.build_vocab(train_data, min_freq=min_freq)

    def make_iter(self, train, validate, test, batch_size, device):
        train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
            (train, validate, test), batch_size=batch_size, device=device)

        print('dataset initializing done')
        return train_iterator, valid_iterator, test_iterator
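
One plausible way to drive this DataLoader class end to end (a sketch; the spaCy pipelines and the batch size are assumptions, not part of the original):

import spacy
import torch

spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

loader = DataLoader(ext=('.de', '.en'),
                    tokenize_en=lambda s: [t.text for t in spacy_en.tokenizer(s)],
                    tokenize_de=lambda s: [t.text for t in spacy_de.tokenizer(s)],
                    sos_token='<sos>',
                    eos_token='<eos>')
train, valid, test = loader.make_dataset()
loader.build_vocab(train, min_freq=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, valid_iter, test_iter = loader.make_iter(train, valid, test,
                                                     batch_size=128,
                                                     device=device)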
Example #6
def make_SRC_TRG(tokenize_src, tokenize_trg, lower=False, batch_first=True):
    SRC = Field(tokenize=tokenize_src,
                init_token='<sos>',
                eos_token='<eos>',
                lower=lower,
                batch_first=batch_first)

    TRG = Field(tokenize=tokenize_trg,
                init_token='<sos>',
                eos_token='<eos>',
                lower=lower,
                batch_first=batch_first)

    return SRC, TRG
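
A hedged usage sketch (not in the original): whitespace tokenizers keep it dependency-free, while real use would pass spaCy tokenizers as in the other examples.

from torchtext.legacy.datasets import Multi30k

SRC, TRG = make_SRC_TRG(str.split, str.split, lower=True, batch_first=True)
train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                    fields=(SRC, TRG))
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)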
Example #7
    def make_dataset(self):
        if self.ext == ('.de', '.en'):
            self.source = Field(tokenize=self.tokenize_de,
                                init_token=self.sos_token,
                                eos_token=self.eos_token,
                                lower=True,
                                batch_first=True)
            self.target = Field(tokenize=self.tokenize_en,
                                init_token=self.sos_token,
                                eos_token=self.eos_token,
                                lower=True,
                                batch_first=True)
        elif self.ext == ('.en', '.de'):
            self.source = Field(tokenize=self.tokenize_en,
                                init_token=self.sos_token,
                                eos_token=self.eos_token,
                                lower=True,
                                batch_first=True)
            self.target = Field(tokenize=self.tokenize_de,
                                init_token=self.sos_token,
                                eos_token=self.eos_token,
                                lower=True,
                                batch_first=True)

        train_data, valid_data, test_data = Multi30k.splits(
            exts=self.ext, fields=(self.source, self.target))
        return train_data, valid_data, test_data
Example #8
def prepare_data(args):
    TEXT = Field(lower=True,
                 include_lengths=True,
                 batch_first=True,
                 tokenize='spacy',
                 tokenizer_language="en_core_web_sm")
    LABEL = Field(sequential=False)
    # make splits for data

    print("Creating splits")
    if args.subset:
        train, dev, test = SNLI.splits(TEXT, LABEL, root='./subdata')
    else:
        train, dev, test = SNLI.splits(TEXT, LABEL, root='./data')
    print("Loading GloVe")
    glove = torchtext.vocab.GloVe(name='840B', dim=300)
    print("Aligning GloVe vocab")
    TEXT.build_vocab(train, vectors=glove)
    LABEL.build_vocab(train, specials_first=False)
    n_vocab = len(TEXT.vocab.itos)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device:", device)
    print("Creating BucketIterator")
    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train, dev, test),
        batch_sizes=(args.batch, 256, 256),
        device=device,
        shuffle=False)
    return TEXT, train_iter, dev_iter, test_iter
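
A sketch of consuming what prepare_data returns (assumed, not shown in the original): because TEXT was built with include_lengths=True, each SNLI batch exposes (token_ids, lengths) pairs, and TEXT.vocab.vectors lines up with the aligned GloVe rows.

import torch.nn as nn
from argparse import Namespace

args = Namespace(subset=True, batch=64)  # hypothetical stand-in for the real CLI args
TEXT, train_iter, dev_iter, test_iter = prepare_data(args)
embedding = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze=False)

for batch in train_iter:
    premise, premise_len = batch.premise
    hypothesis, hypothesis_len = batch.hypothesis
    label = batch.label
    break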
Example #9
def prepare(params, samples):
    # print(type(params))
    # print(type(samples))
    TEXT = Field(lower=True,
                 include_lengths=True,
                 batch_first=True,
                 tokenize='spacy',
                 tokenizer_language="en_core_web_sm")

    # data = [' '.join(s) for s in samples],
    data = samples
    # print("data",len(data[0]))
    # print(data)
    TEXT.build_vocab(data, vectors=params.glove)

    params.model.emb_vec = torch.nn.Embedding.from_pretrained(
        TEXT.vocab.vectors, freeze=True).to(device=params.device)
    params["TEXT"] = TEXT
Example #10
    def __init__(self,
                 fileParams={},
                 tokenizationOption='regex',
                 seedParams={
                     'nFirst': 1,
                     'minFreq': 5
                 },
                 fieldParams={
                     'lower': True,
                     'eos_token': '<!EOS!>'
                 },
                 spacyObj=None):

        self.__fileName, self.__fileExtension, self.__parsingColumn = checkFileParams(
            fileParams)
        self.__seedParams = checkSeedParams(seedParams)
        self.__DataVocab = Field(**fieldParams)
        self.__spacyObj = spacyObj
        self.__customTokenize = self.__tokenizationMethod(tokenizationOption)
        self.__readFile()
Example #11
from typing import Tuple


def get_dataset(path_to_data: str,
                transformer: bool) -> Tuple[Field, Field, TabularDataset]:

    SRC = Field(tokenize=tokenize,
                init_token='<sos>',
                eos_token='<eos>',
                lower=True,
                batch_first=False)

    TRG = Field(
        tokenize=tokenize,
        init_token='<sos>',
        eos_token='<eos>',
        lower=True,
        batch_first=False,
    )

    dataset = TabularDataset(path=path_to_data,
                             format='tsv',
                             fields=[('trg', TRG), ('src', SRC)])
    return SRC, TRG, dataset
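
The returned fields still need vocabularies before batching; a possible follow-up (the path and split ratio are assumptions, and the module-level tokenize function referenced above is assumed to exist):

SRC, TRG, dataset = get_dataset('data/pairs.tsv', transformer=True)
train_data, valid_data = dataset.split(split_ratio=0.9)
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)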
Example #12
    def __init__(self, device=None, jit=False):
        super().__init__()
        self.device = device
        self.jit = jit

        # Fields for words and their universal POS tags.
        WORD = Field(include_lengths=True)
        UD_TAG = Field(init_token="<bos>",
                       eos_token="<eos>",
                       include_lengths=True)

        # Download and load the default data.
        train, val, test = UDPOS.splits(
            fields=(("word", WORD), ("udtag", UD_TAG), (None, None)),
            filter_pred=lambda ex: 5 < len(ex.word) < 30,
        )

        WORD.build_vocab(train.word, min_freq=3)
        UD_TAG.build_vocab(train.udtag)
        self.train_iter = torch_struct.data.TokenBucket(train,
                                                        batch_size=100,
                                                        device=device)

        H = 256
        T = 30
        NT = 30
        self.model = NeuralCFG(len(WORD.vocab), T, NT, H)
        if jit:
            self.model = torch.jit.script(self.model)
        self.model.to(device=device)
        self.opt = torch.optim.Adam(self.model.parameters(),
                                    lr=0.001,
                                    betas=[0.75, 0.999])
        for i, ex in enumerate(self.train_iter):
            words, lengths = ex.word
            self.words = words.long().to(device).transpose(0, 1)
            self.lengths = lengths.to(device)
            break
Example #13
device = 'cuda' if torch.cuda.is_available() else 'cpu'

spacy_ger = spacy.load('de_core_news_sm')  # the bare 'de'/'en' shortcuts were removed in spaCy 3
spacy_eng = spacy.load('en_core_web_sm')


def tokenizer_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]


def tokenizer_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]


german = Field(tokenize=tokenizer_ger,
               lower=True,
               init_token='<sos>',
               eos_token='<eos>')
english = Field(tokenize=tokenizer_eng,
                lower=True,
                init_token='<sos>',
                eos_token='<eos>')

train_data, validation_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                         fields=(german,
                                                                 english))

german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)


# model
Example #14
parser = argparse.ArgumentParser()
parser.add_argument('--debug',
                    metavar='fn',
                    default="",
                    help="Dump outputs into file")
parser.add_argument('--script', default=False, help="Script the model")
args = parser.parse_args()

random.seed(1337)
torch.manual_seed(1337)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Fields for words and their universal POS tags.
WORD = Field(include_lengths=True)
UD_TAG = Field(init_token="<bos>", eos_token="<eos>", include_lengths=True)

# Download and load the default data.
train, val, test = UDPOS.splits(
    fields=(("word", WORD), ("udtag", UD_TAG), (None, None)),
    filter_pred=lambda ex: 5 < len(ex.word) < 30,
)

WORD.build_vocab(train.word, min_freq=3)
UD_TAG.build_vocab(train.udtag)
train_iter = torch_struct.data.TokenBucket(train,
                                           batch_size=100,
                                           device="cuda:0")

H = 256
Example #15
spacy_de = spacy.load("de_core_news_sm")  # needed by tokenize_de below
spacy_en = spacy.load("en_core_web_sm")


def tokenize_de(text):

    return [tok.text for tok in spacy_de.tokenizer(text)]


def tokenize_en(text):

    return [tok.text for tok in spacy_en.tokenizer(text)]


SRC = Field(tokenize=tokenize_de,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True,
            batch_first=True)

TGT = Field(tokenize=tokenize_en,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True,
            batch_first=True)

train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                    fields=(SRC, TGT))

SRC.build_vocab(train_data, min_freq=2)
TGT.build_vocab(train_data, min_freq=2)
Example #16
# embedding = FastText('simple')
embedding = GloVe(name='6B', dim=50)

data_dir = './data/sats-data/'
train_ = np.load(data_dir + 'train_sents.npy', allow_pickle=True)
train_labels = np.load(data_dir + 'labels_train.npy', allow_pickle=True)
eval_ = np.load(data_dir + 'eval_sents.npy', allow_pickle=True)
eval_labels = np.load(data_dir + 'labels_val.npy', allow_pickle=True)

texts = np.concatenate((train_, eval_))
labels = np.concatenate((train_labels, eval_labels))

df = pd.DataFrame({'text': texts, 'label': labels})

text_field = Field(sequential=True,
                   tokenize='basic_english',
                   fix_length=5,
                   lower=True)

label_field = Field(sequential=False, use_vocab=False, is_target=True)

preprocessed_text = df['text'].apply(lambda x: text_field.preprocess(x))
# text_field.build_vocab(preprocessed_text, vectors='fasttext.simple.300d')
text_field.build_vocab(preprocessed_text, vectors='glove.6B.50d')
vocab = text_field.vocab

ltoi = {l: i for i, l in enumerate(df['label'].unique())}
df['label'] = df['label'].apply(lambda y: ltoi[y])


class DataFrameDataset(torchtext.legacy.data.Dataset):
    def __init__(self, df: pd.DataFrame, fields: list):
Example #17
# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
start_debugger_on_exception()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
device = torch.device('cuda:6')
# Model parameters
MAX_SEQ_LEN = 128
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

# Fields

label_field = Field(sequential=False,
                    use_vocab=False,
                    batch_first=True,
                    dtype=torch.float)
text_field = Field(use_vocab=False,
                   tokenize=tokenizer.encode,
                   lower=False,
                   include_lengths=False,
                   batch_first=True,
                   fix_length=MAX_SEQ_LEN,
                   pad_token=PAD_INDEX,
                   unk_token=UNK_INDEX)
fields = [('index', label_field), ('text', text_field), ('label', label_field)]

# TabularDataset

train, valid, test = TabularDataset.splits(path='./data',
                                           train='IMDB_single.csv',
Example #18
        wandb.run.name = wandb.run.name + '-fc'
    elif transition_type == 'depth_wise_conv':
        wandb.run.name = wandb.run.name + '-dwc'
wandb.run.save()
train_loss_key = "Train loss"
valid_loss_key = "Validation loss"
test_loss_key = "Test loss"
char_accuracy_key = "Character accuracy"
seq_accuracy_key = "Sequence accuracy"
"""
Preparing Data
"""
tokenize = lambda x: x.split()
INPUT = Field(sequential=True,
              tokenize=tokenize,
              init_token='<sos>',
              eos_token='<eos>',
              lower=True)
TARGET = Field(sequential=True,
               tokenize=tokenize,
               init_token='<sos>',
               eos_token='<eos>',
               lower=True)

datafields = [("input", INPUT), ("target", TARGET)]

trn, vld, tst = TabularDataset.splits(path="data/" + data_size,
                                      train=train_csv,
                                      validation=validation_csv,
                                      test=test_csv,
                                      format='csv',
Example #19
import torch
from torchtext.legacy import data
from torchtext.legacy.data import Field
from torchtext.legacy.data import LabelField
from torchtext.legacy import datasets
from nltk.tokenize import word_tokenize
import torch.nn as nn
import random
import torch.optim as optim
import time

tokenizer = word_tokenize
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
TEXT = Field(tokenize=tokenizer, include_lengths=True)
LABEL = LabelField(dtype=torch.float)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
train_data, valid_data = train_data.split(random_state=random.seed(SEED))

MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(train_data,
                 max_size=MAX_VOCAB_SIZE,
                 vectors="glove.6B.300d",
                 unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
Example #20
from torchtext.legacy.data import Field, BucketIterator
import spacy
import random
import torch.optim as opt

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

eng = spacy.load('en_core_web_sm')  # the bare 'en' shortcut was removed in spaCy 3
ger = spacy.load('de_core_news_sm')


def Tokenize_eng(text):
    return [a.text for a in eng.tokenizer(text)]


def Tokenize_german(text):
    return [b.text for b in ger.tokenizer(text)]


german = Field(tokenize=Tokenize_german, lower=True, init_token='<sos>', eos_token='<eos>')
english = Field(tokenize=Tokenize_eng, lower=True, init_token='<sos>', eos_token='<eos>')

Train, Val, Test = Multi30k.splits(exts=('.de', '.en'), fields=(german, english))

german.build_vocab(Train, max_size=10000, min_freq=2)
english.build_vocab(Train, max_size=10000, min_freq=2)

# Building the encoder
class Encoder(Module):
    def __init__(self, inp_size, emd_size, hidden_size):
        super(Encoder, self).__init__()
        self.inp_size = inp_size
        self.emd_size = emd_size
        self.hidden_size = hidden_size
        self.drop = Dropout(0.5)
Example #21
import torchtext
import torch.nn as nn
import torch.optim as optim
import spacy
from torchtext.legacy.data import Field, TabularDataset, BucketIterator

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# python -m spacy download en_core_web_sm
spacy_en = spacy.load("en_core_web_sm")

def tokenize(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

Texto = Field(sequential=True, use_vocab=True, tokenize=tokenize, lower=True)
Valoracion = Field(sequential=False, use_vocab=False)

fields = {"Texto": ("t", Texto), "Valoracion": ("v", Valoracion)}

train_data, test_data = TabularDataset.splits(
                                        path='/content/Dataset',
                                        train='train.csv',
                                        test='test.csv',
                                        format='csv',
                                        fields=fields)

len(train_data) , len(test_data)

print(vars(train_data.examples[0]))
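
Texto is declared with use_vocab=True, so a vocabulary has to be built before batching; a possible continuation (batch size and vocab limits are assumptions):

from torchtext.legacy.data import BucketIterator

Texto.build_vocab(train_data, max_size=10000, min_freq=2)

train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=64,
    sort_key=lambda x: len(x.t),  # 't' is the short name given to the text column
    sort_within_batch=True,
    device=device)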
Example #22
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-raw_dir', required=True)
    parser.add_argument('-data_dir', required=True)
    parser.add_argument('-codes', required=True)
    parser.add_argument('-save_data', required=True)
    parser.add_argument('-prefix', required=True)
    parser.add_argument('-max_len', type=int, default=100)
    parser.add_argument('--symbols',
                        '-s',
                        type=int,
                        default=32000,
                        help="Vocabulary size")
    parser.add_argument(
        '--min-frequency',
        type=int,
        default=6,
        metavar='FREQ',
        help=
        'Stop if no symbol pair has frequency >= FREQ (default: %(default)s)')
    parser.add_argument(
        '--dict-input',
        action="store_true",
        help=
        "If set, input file is interpreted as a dictionary where each line contains a word-count pair"
    )
    parser.add_argument(
        '--separator',
        type=str,
        default='@@',
        metavar='STR',
        help=
        "Separator between non-final subword units (default: '%(default)s'))")
    parser.add_argument('--total-symbols', '-t', action="store_true")
    opt = parser.parse_args()

    # Create folder if needed.
    mkdir_if_needed(opt.raw_dir)
    mkdir_if_needed(opt.data_dir)

    # Download and extract raw data.
    raw_train = get_raw_files(opt.raw_dir, _TRAIN_DATA_SOURCES)
    raw_val = get_raw_files(opt.raw_dir, _VAL_DATA_SOURCES)
    raw_test = get_raw_files(opt.raw_dir, _TEST_DATA_SOURCES)

    # Merge files into one.
    train_src, train_trg = compile_files(opt.raw_dir, raw_train,
                                         opt.prefix + '-train')
    val_src, val_trg = compile_files(opt.raw_dir, raw_val, opt.prefix + '-val')
    test_src, test_trg = compile_files(opt.raw_dir, raw_test,
                                       opt.prefix + '-test')

    # Build up the BPE codes from the training files if they do not exist yet.
    opt.codes = os.path.join(opt.data_dir, opt.codes)
    if not os.path.isfile(opt.codes):
        sys.stderr.write(
            f"Collect codes from training data and save to {opt.codes}.\n")
        learn_bpe(raw_train['src'] + raw_train['trg'], opt.codes, opt.symbols,
                  opt.min_frequency, True)
    sys.stderr.write(f"BPE codes prepared.\n")

    sys.stderr.write(f"Build up the tokenizer.\n")
    with codecs.open(opt.codes, encoding='utf-8') as codes:
        bpe = BPE(codes, separator=opt.separator)

    sys.stderr.write(f"Encoding ...\n")
    encode_files(bpe, train_src, train_trg, opt.data_dir,
                 opt.prefix + '-train')
    encode_files(bpe, val_src, val_trg, opt.data_dir, opt.prefix + '-val')
    encode_files(bpe, test_src, test_trg, opt.data_dir, opt.prefix + '-test')
    sys.stderr.write(f"Done.\n")

    field = Field(tokenize=str.split,
                  lower=True,
                  pad_token=Constants.PAD_WORD,
                  init_token=Constants.BOS_WORD,
                  eos_token=Constants.EOS_WORD)

    fields = (field, field)

    MAX_LEN = opt.max_len

    def filter_examples_with_length(x):
        return len(vars(x)['src']) <= MAX_LEN and len(
            vars(x)['trg']) <= MAX_LEN

    enc_train_files_prefix = opt.prefix + '-train'
    train = TranslationDataset(fields=fields,
                               path=os.path.join(opt.data_dir,
                                                 enc_train_files_prefix),
                               exts=('.src', '.trg'),
                               filter_pred=filter_examples_with_length)

    from itertools import chain
    field.build_vocab(chain(train.src, train.trg), min_freq=2)

    data = {
        'settings': opt,
        'vocab': field,
    }
    opt.save_data = os.path.join(opt.data_dir, opt.save_data)

    print('[Info] Dumping the processed data to pickle file', opt.save_data)
    pickle.dump(data, open(opt.save_data, 'wb'))
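
The pickle written above would later be read back roughly like this (a sketch; the file name is hypothetical, the key names follow the data dict built above, and Constants is the same module used earlier):

import pickle

with open('wmt_bpe.pkl', 'rb') as f:            # whatever was passed as -save_data
    data = pickle.load(f)
opt = data['settings']                          # argparse options from preprocessing time
field = data['vocab']                           # shared Field with its built vocab
pad_idx = field.vocab.stoi[Constants.PAD_WORD]
vocab_size = len(field.vocab)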
Example #23
def translate(cfg_file: str,
              ckpt: str,
              output_path: str = None,
              batch_class: Batch = Batch,
              n_best: int = 1) -> None:
    """
    Interactive translation function.
    Loads model from checkpoint and translates either the stdin input or
    asks for input to translate interactively.
    The input has to be pre-processed according to the data that the model
    was trained on, i.e. tokenized or split into subwords.
    Translations are printed to stdout.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output file
    :param batch_class: class type of batch
    :param n_best: amount of candidates to display
    """
    def _load_line_as_data(line):
        """ Create a dataset from one line via a temporary file. """
        # write src input to temporary file
        tmp_name = "tmp"
        tmp_suffix = ".src"
        tmp_filename = tmp_name + tmp_suffix
        with open(tmp_filename, "w") as tmp_file:
            tmp_file.write("{}\n".format(line))

        test_data = MonoDataset(path=tmp_name, ext=tmp_suffix, field=src_field)

        # remove temporary file
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)

        return test_data

    def _translate_data(test_data):
        """ Translates given dataset, using parameters from outer scope. """
        # pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
        hypotheses_raw, attention_scores = validate_on_data(
            model, data=test_data, batch_size=batch_size,
            batch_class=batch_class, batch_type=batch_type, level=level,
            max_output_length=max_output_length, eval_metric="",
            use_cuda=use_cuda, compute_loss=False, beam_size=beam_size,
            beam_alpha=beam_alpha, postprocess=postprocess,
            bpe_type=bpe_type, sacrebleu=sacrebleu, n_gpu=n_gpu, n_best=n_best)
        return hypotheses

    cfg = load_config(cfg_file)
    model_dir = cfg["training"]["model_dir"]

    _ = make_logger(model_dir, mode="translate")
    # version string returned

    # when checkpoint is not specified, take latest from model dir
    if ckpt is None:
        ckpt = get_latest_checkpoint(model_dir)

    # read vocabs
    src_vocab_file = cfg["data"].get("src_vocab", model_dir + "/src_vocab.txt")
    trg_vocab_file = cfg["data"].get("trg_vocab", model_dir + "/trg_vocab.txt")
    src_vocab = Vocabulary(file=src_vocab_file)
    trg_vocab = Vocabulary(file=trg_vocab_file)

    data_cfg = cfg["data"]
    level = data_cfg["level"]
    lowercase = data_cfg["lowercase"]

    tok_fun = lambda s: list(s) if level == "char" else s.split()

    src_field = Field(init_token=None,
                      eos_token=EOS_TOKEN,
                      pad_token=PAD_TOKEN,
                      tokenize=tok_fun,
                      batch_first=True,
                      lower=lowercase,
                      unk_token=UNK_TOKEN,
                      include_lengths=True)
    src_field.vocab = src_vocab

    # parse test args
    batch_size, batch_type, use_cuda, device, n_gpu, level, _, \
        max_output_length, beam_size, beam_alpha, postprocess, \
        bpe_type, sacrebleu, _, _ = parse_test_args(cfg, mode="translate")

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.to(device)

    if not sys.stdin.isatty():
        # input file given
        test_data = MonoDataset(path=sys.stdin, ext="", field=src_field)
        all_hypotheses = _translate_data(test_data)

        if output_path is not None:
            # write to outputfile if given

            def write_to_file(output_path_set, hypotheses):
                with open(output_path_set, mode="w", encoding="utf-8") \
                        as out_file:
                    for hyp in hypotheses:
                        out_file.write(hyp + "\n")
                logger.info("Translations saved to: %s.", output_path_set)

            if n_best > 1:
                for n in range(n_best):
                    file_name, file_extension = os.path.splitext(output_path)
                    write_to_file(
                        "{}-{}{}".format(
                            file_name, n,
                            file_extension if file_extension else ""), [
                                all_hypotheses[i]
                                for i in range(n, len(all_hypotheses), n_best)
                            ])
            else:
                write_to_file("{}".format(output_path), all_hypotheses)
        else:
            # print to stdout
            for hyp in all_hypotheses:
                print(hyp)

    else:
        # enter interactive mode
        batch_size = 1
        batch_type = "sentence"
        while True:
            try:
                src_input = input("\nPlease enter a source sentence "
                                  "(pre-processed): \n")
                if not src_input.strip():
                    break

                # every line has to be made into a dataset
                test_data = _load_line_as_data(line=src_input)
                hypotheses = _translate_data(test_data)

                print("JoeyNMT: Hypotheses ranked by score")
                for i, hyp in enumerate(hypotheses):
                    print("JoeyNMT #{}: {}".format(i + 1, hyp))

            except (KeyboardInterrupt, EOFError):
                print("\nBye.")
                break
Example #24
def main_wo_bpe():
    '''
    Usage: python preprocess.py -lang_src de -lang_trg en -save_data multi30k_de_en.pkl -share_vocab
    '''

    spacy_support_langs = [
        'de', 'el', 'en', 'es', 'fr', 'it', 'lt', 'nb', 'nl', 'pt'
    ]

    parser = argparse.ArgumentParser()
    parser.add_argument('-lang_src',
                        required=True,
                        choices=spacy_support_langs)
    parser.add_argument('-lang_trg',
                        required=True,
                        choices=spacy_support_langs)
    parser.add_argument('-save_data', required=True)
    parser.add_argument('-data_src', type=str, default=None)
    parser.add_argument('-data_trg', type=str, default=None)

    parser.add_argument('-max_len', type=int, default=100)
    parser.add_argument('-min_word_count', type=int, default=3)
    parser.add_argument('-keep_case', action='store_true')
    parser.add_argument('-share_vocab', action='store_true')
    #parser.add_argument('-ratio', '--train_valid_test_ratio', type=int, nargs=3, metavar=(8,1,1))
    #parser.add_argument('-vocab', default=None)

    opt = parser.parse_args()
    assert not any([opt.data_src, opt.data_trg
                    ]), 'Custom data input is not supported now.'
    assert not any([opt.data_src, opt.data_trg]) or all(
        [opt.data_src, opt.data_trg])
    print(opt)

    src_lang_model = spacy.load(opt.lang_src)
    trg_lang_model = spacy.load(opt.lang_trg)

    def tokenize_src(text):
        return [tok.text for tok in src_lang_model.tokenizer(text)]

    def tokenize_trg(text):
        return [tok.text for tok in trg_lang_model.tokenizer(text)]

    SRC = Field(tokenize=tokenize_src,
                lower=not opt.keep_case,
                pad_token=Constants.PAD_WORD,
                init_token=Constants.BOS_WORD,
                eos_token=Constants.EOS_WORD)

    TRG = Field(tokenize=tokenize_trg,
                lower=not opt.keep_case,
                pad_token=Constants.PAD_WORD,
                init_token=Constants.BOS_WORD,
                eos_token=Constants.EOS_WORD)

    MAX_LEN = opt.max_len
    MIN_FREQ = opt.min_word_count

    if not all([opt.data_src, opt.data_trg]):
        assert {opt.lang_src, opt.lang_trg} == {'de', 'en'}
    else:
        # Pack custom txt file into example datasets
        raise NotImplementedError

    def filter_examples_with_length(x):
        return len(vars(x)['src']) <= MAX_LEN and len(
            vars(x)['trg']) <= MAX_LEN

    train, val, test = Multi30k.splits(exts=('.' + opt.lang_src,
                                             '.' + opt.lang_trg),
                                       fields=(SRC, TRG),
                                       filter_pred=filter_examples_with_length)

    SRC.build_vocab(train.src, min_freq=MIN_FREQ)
    print('[Info] Get source language vocabulary size:', len(SRC.vocab))
    TRG.build_vocab(train.trg, min_freq=MIN_FREQ)
    print('[Info] Get target language vocabulary size:', len(TRG.vocab))

    if opt.share_vocab:
        print('[Info] Merging two vocabulary ...')
        for w, _ in SRC.vocab.stoi.items():
            # TODO: Also update the `freq`, although it is not likely to be used.
            if w not in TRG.vocab.stoi:
                TRG.vocab.stoi[w] = len(TRG.vocab.stoi)
        TRG.vocab.itos = [None] * len(TRG.vocab.stoi)
        for w, i in TRG.vocab.stoi.items():
            TRG.vocab.itos[i] = w
        SRC.vocab.stoi = TRG.vocab.stoi
        SRC.vocab.itos = TRG.vocab.itos
        print('[Info] Get merged vocabulary size:', len(TRG.vocab))

    data = {
        'settings': opt,
        'vocab': {
            'src': SRC,
            'trg': TRG
        },
        'train': train.examples,
        'valid': val.examples,
        'test': test.examples
    }

    print('[Info] Dumping the processed data to pickle file', opt.save_data)
    pickle.dump(data, open(opt.save_data, 'wb'))
Example #25
def train():
    spacy_ger = de_core_news_md.load()
    spacy_eng = en_core_web_sm.load()

    def tokenize_ger(text):
        return [tok.text for tok in spacy_ger.tokenizer(text)]

    def tokenize_eng(text):
        return [tok.text for tok in spacy_eng.tokenizer(text)]

    german = Field(tokenize=tokenize_ger,
                   lower=True,
                   init_token="<sos>",
                   eos_token="<eos>")

    english = Field(tokenize=tokenize_eng,
                    lower=True,
                    init_token="<sos>",
                    eos_token="<eos>")

    train_data, valid_data, test_data = Multi30k.splits(exts=(".de", ".en"),
                                                        fields=(german,
                                                                english))

    german.build_vocab(train_data, max_size=10000, min_freq=2)
    english.build_vocab(train_data, max_size=10000, min_freq=2)

    ### We're ready to define everything we need for training our Seq2Seq model ###

    # Training hyperparameters
    num_epochs = 20
    learning_rate = 0.001
    batch_size = 64

    # Model hyperparameters
    load_model = False
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_size_encoder = len(german.vocab)
    input_size_decoder = len(english.vocab)
    output_size = len(english.vocab)
    encoder_embedding_size = 300
    decoder_embedding_size = 300
    hidden_size = 1024  # Needs to be the same for both RNNs
    num_layers = 2
    enc_dropout = 0.5
    dec_dropout = 0.5

    # Tensorboard to get nice loss plot
    writer = SummaryWriter(f"runs/loss_plot")
    step = 0

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=batch_size,
        sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=device,
    )

    encoder_net = Encoder(input_size_encoder, encoder_embedding_size,
                          hidden_size, num_layers, enc_dropout).to(device)

    decoder_net = Decoder(
        input_size_decoder,
        decoder_embedding_size,
        hidden_size,
        output_size,
        num_layers,
        dec_dropout,
    ).to(device)

    model = Seq2Seq(encoder_net, decoder_net, len(english.vocab),
                    device).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    print(
        f"{time.strftime('%Y/%m/%d-%H:%M:%S')}: The model has {count_parameters(model):,} trainable parameters"
    )

    pad_idx = english.vocab.stoi["<pad>"]
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

    if load_model:
        load_checkpoint(torch.load("my_checkpoint_2_2.pth.tar"), model,
                        optimizer)

    sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."

    for epoch in range(num_epochs):
        print(
            f"{time.strftime('%Y/%m/%d-%H:%M:%S')}: [Epoch {epoch} / {num_epochs}]"
        )

        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict()
        }
        # save_checkpoint(checkpoint)

        model.eval()

        translated_sentence = translate_sentence(model,
                                                 sentence,
                                                 german,
                                                 english,
                                                 device,
                                                 max_length=50)

        print(f"Translated example sentence: \n {translated_sentence}")

        model.train()

        for batch_idx, batch in enumerate(train_iterator):
            # Get inputs and targets and move them to the device
            inp_data = batch.src.to(device)
            target = batch.trg.to(device)

            # Forward prop
            output = model(inp_data, target)

            # print('\n')
            # print('Input', inp_data.shape)
            # print('Target', target.shape)
            # print('Output', output.shape)
            # print('---------------------')

            # Output is of shape (trg_len, batch_size, output_dim), but CrossEntropyLoss
            # expects inputs of shape (N, num_classes) with targets of shape (N). For example,
            # with MNIST the outputs would be (N, 10) and the targets just (N). So we flatten
            # the time and batch dimensions together, and drop the start token while we're at it.
            output = output[1:].reshape(-1, output.shape[2])
            target = target[1:].reshape(-1)

            optimizer.zero_grad()
            loss = criterion(output, target)

            # Back prop
            loss.backward()

            # Clip to avoid exploding gradient issues, makes sure grads are
            # within a healthy range
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            # Gradient descent step
            optimizer.step()

            # Plot to tensorboard
            writer.add_scalar("Training loss", loss, global_step=step)
            # print("Training loss", loss)
            step += 1

    score = bleu(test_data[1:100], model, german, english, device)
    print(f"Bleu score {score*100:.2f}")
Example #26
    batch_size = 8

    tokenizer = AutoTokenizer.from_pretrained(phobert_path, use_fast=False)
    init_token = tokenizer.cls_token
    eos_token = tokenizer.sep_token
    
    
    init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
    eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
    pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
    unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

    TEXT = Field(batch_first=True,
                 use_vocab=False,
                 tokenize=tokenize_and_cut,
                 preprocessing=tokenizer.convert_tokens_to_ids,
                 init_token=init_token_idx,
                 eos_token=eos_token_idx,
                 pad_token=pad_token_idx,
                 unk_token=unk_token_idx)
    LABEL = LabelField(dtype=torch.long, use_vocab=False)
    fields = [('data', TEXT), ('label', LABEL)]
    train, valid, test = TabularDataset.splits(path=source_folder, train='train.csv', validation='validation.csv', test='test.csv',
                                               format='CSV', fields=fields, skip_header=True)

    train_generator, val_generator, test_generator = BucketIterator.splits(
        (train, valid, test),
        batch_size=batch_size,
        device=device, sort=False)

    criterion = nn.CrossEntropyLoss()
Example #27
import flor
from multiprocessing import set_start_method

try:
    set_start_method("spawn")
except RuntimeError:
    pass

flor.flags.NAME = "kaggle-nlp-disasters-rnn"
flor.flags.REPLAY = False

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

label_field = Field(sequential=False,
                    use_vocab=False,
                    batch_first=True,
                    dtype=torch.float)
text_field = Field(
    tokenize=flor.log("tokenizer", "spacy"),
    lower=True,
    include_lengths=True,
    batch_first=True,
)
fields = [("words", text_field), ("target", label_field)]
fields_test = [("words", text_field)]

train, valid = TabularDataset.splits(
    path="data",
    train="train_rnn.csv",
    validation="valid_rnn.csv",
    format="CSV",
Example #28
    def test_xnli(self):
        batch_size = 4

        # create fields
        TEXT = Field()
        GENRE = LabelField()
        LABEL = LabelField()
        LANGUAGE = LabelField()

        # create val/test splits, XNLI does not have a training set
        val, test = XNLI.splits(TEXT, LABEL, GENRE, LANGUAGE)

        # check both are XNLI datasets
        assert type(val) == type(test) == XNLI

        # check all have the correct number of fields
        assert len(val.fields) == len(test.fields) == 5

        # check fields are the correct type
        assert type(val.fields['premise']) == Field
        assert type(val.fields['hypothesis']) == Field
        assert type(val.fields['label']) == LabelField
        assert type(val.fields['genre']) == LabelField
        assert type(val.fields['language']) == LabelField

        assert type(test.fields['premise']) == Field
        assert type(test.fields['hypothesis']) == Field
        assert type(test.fields['label']) == LabelField
        assert type(test.fields['genre']) == LabelField
        assert type(test.fields['language']) == LabelField

        # check each is the correct length
        assert len(val) == 37350
        assert len(test) == 75150

        # build vocabulary
        TEXT.build_vocab(val)
        LABEL.build_vocab(val)
        GENRE.build_vocab(val)
        LANGUAGE.build_vocab(val)

        # ensure vocabulary has been created
        assert hasattr(TEXT, 'vocab')
        assert hasattr(TEXT.vocab, 'itos')
        assert hasattr(TEXT.vocab, 'stoi')

        # create iterators
        val_iter, test_iter = Iterator.splits((val, test),
                                              batch_size=batch_size)

        # get a batch to test
        batch = next(iter(val_iter))

        # pull the premise, hypothesis and label tensors out of the batch
        premise = batch.premise
        hypothesis = batch.hypothesis
        label = batch.label
        genre = batch.genre
        language = batch.language

        # check each is actually a tensor
        assert type(premise) == torch.Tensor
        assert type(hypothesis) == torch.Tensor
        assert type(label) == torch.Tensor
        assert type(genre) == torch.Tensor
        assert type(language) == torch.Tensor

        # check have the correct batch dimension
        assert premise.shape[-1] == batch_size
        assert hypothesis.shape[-1] == batch_size
        assert label.shape[-1] == batch_size
        assert genre.shape[-1] == batch_size
        assert language.shape[-1] == batch_size

        # XNLI cannot use the iters method, ensure it raises an error
        with self.assertRaises(NotImplementedError):
            val_iter, test_iter = XNLI.iters(batch_size=batch_size)

        # remove downloaded xnli directory
        shutil.rmtree('.data/xnli')
Example #29
from multiprocessing import set_start_method

try:
    set_start_method("spawn")
except RuntimeError:
    pass

flor.flags.NAME = "kaggle-nlp-disasters-rnn"
flor.flags.REPLAY = False

device = flor.log(
    "device", torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
device

label_field = Field(sequential=False,
                    use_vocab=False,
                    batch_first=True,
                    dtype=torch.float)
text_field = Field(tokenize="spacy",
                   lower=True,
                   include_lengths=True,
                   batch_first=True)
fields = [("words", text_field), ("target", label_field)]
fields_test = [("words", text_field)]

train, valid = TabularDataset.splits(
    path="data",
    train="train_rnn.csv",
    validation="valid_rnn.csv",
    format="CSV",
    fields=fields,
    skip_header=True,
Example #30
class Preprocessing(Dataset):

    __tokPattern = r"""[0-9A-Za-z_]*[A-Za-z_-]+[0-9A-Za-z_]*|\.|\!|\?|\d+|\-|%|[.,!?;'"]"""
    __supportedExtensions = ['txt', 'csv', 'json']
    __seedAttrs = ['nFirst', 'minFreq']

    def __init__(self,
                 fileParams={},
                 tokenizationOption='regex',
                 seedParams={
                     'nFirst': 1,
                     'minFreq': 5
                 },
                 fieldParams={
                     'lower': True,
                     'eos_token': '<!EOS!>'
                 },
                 spacyObj=None):

        self.__fileName, self.__fileExtension, self.__parsingColumn = checkFileParams(
            fileParams)
        self.__seedParams = checkSeedParams(seedParams)
        self.__DataVocab = Field(**fieldParams)
        self.__spacyObj = spacyObj
        self.__customTokenize = self.__tokenizationMethod(tokenizationOption)
        self.__readFile()

    @property
    def getFileName(self):
        return self.__fileName

    @property
    def getVocab(self):
        return self.__DataVocab

    def __readFile(self):
        text = readFiles(self.__fileName, self.__fileExtension,
                         self.__parsingColumn)
        self.examples = self.__getObjects(text)
        self.__seeds = getStartWords(self.__seedParams, text)
        self.__build_vocab()

    def __getObjects(self, text):
        self.fields = {"src": self.__DataVocab}
        return [Document(**self.__tokenize(instance)) for instance in text]

    def __build_vocab(self):
        self.__DataVocab.build_vocab(self)
        for instance in self.examples:
            instance.create_tokens(self.__DataVocab)

    def __regexTokenization(self, document):
        return re.findall(self.__tokPattern, document)

    def __nltkTokenization(self, document):
        return self.tokenizer(document)

    def __spacyTokenization(self, instance):
        return [
            entity.text.strip() for entity in self.__spacyObj(instance)
            if entity.text.strip()
        ]

    def __tokenize(self, instance):
        instance = self.__customTokenize(instance)
        return {'src': instance, 'trg': instance[1:]}

    @checkParams(str)
    def __tokenizationMethod(self, param):
        param = param.lower()

        if param == 'nltk':
            self.tokenizer = importNltk()
            return self.__nltkTokenization

        elif param == 'regex':
            return self.__regexTokenization

        elif param == 'spacy':
            if not self.__spacyObj:
                raise Exception(
                    "Please provide the spacy object to tokenize with.")

            return self.__spacyTokenization

        raise Exception(
            "The parameter 'tokenizationOption' can only be 'nltk', 'regex' or 'spacy'."
        )

    def getSeed(self):
        """
            Return a weighted random seed.
            If the static seed option is enabled, the most frequent token is used as the seed.
        """
        seeds = list(self.__seeds.keys())
        probs = list(self.__seeds.values())
        return choice(seeds, 1, probs).tolist()