Example 1
    def __init__(self, file_path, val, aspect, vocab=None):
        # load the beer review data
        print('Load beer data for aspect {}'.format(aspect))

        # iterate through the envs
        self.envs = []
        all_words = []
        self.length = 0

        for i in range(4):
            if i == 2:
                # choose validation env
                if val == 'in_domain':
                    data, words = BeerReview.load_json(
                        os.path.join(
                            file_path,
                            'art_aspect_{}_env_1_val.json'.format(aspect)))
                else:
                    data, words = BeerReview.load_json(
                        os.path.join(
                            file_path,
                            'art_aspect_{}_env_2_val.json'.format(aspect)))
            elif i == 3:  # test env
                data, words = BeerReview.load_json(
                    os.path.join(file_path,
                                 'art_aspect_{}_env_2.json'.format(aspect)))
            else:
                data, words = BeerReview.load_json(
                    os.path.join(file_path,
                                 'art_aspect_{}_env_{}.json'.format(aspect,
                                                                    i)))

            self.envs.append(data)

            all_words.extend(words)
            self.length += len(data['y'])

        if vocab is not None:
            self.vocab = vocab
        else:
            path = './wiki.en.vec'
            if not os.path.exists(path):
                # Download the word vector and save it locally:
                print('Downloading word vectors')
                import urllib.request
                urllib.request.urlretrieve(
                    'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.vec',
                    path)

            # get word embeddings from fasttext
            vectors = Vectors('wiki.en.vec', cache='vector_cache')
            self.vocab = Vocab(Counter(all_words),
                               vectors=vectors,
                               specials=[
                                   '<pad>', '<unk>', '<art_negative>',
                                   '<art_positive>'
                               ],
                               min_freq=5)

            # randomly initialize embeddings for the spurious tokens
            self.vocab.vectors[self.vocab.stoi['<art_negative>']] = torch.rand(
                300)
            self.vocab.vectors[self.vocab.stoi['<art_positive>']] = torch.rand(
                300)

            # print word embedding statistics
            wv_size = self.vocab.vectors.size()
            print('Total num. of words: {}, word vector dimension: {}'.format(
                wv_size[0], wv_size[1]))

            num_oov = wv_size[0] - torch.nonzero(torch.sum(
                torch.abs(self.vocab.vectors), dim=1),
                                                 as_tuple=False).size()[0]
            print(('Num. of out-of-vocabulary words '
                   '(they are initialized to zeros): {}').format(num_oov))

        # not evaluating worst-case performance for beer
        self.val_att_idx_dict = None
        self.test_att_idx_dict = None
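
Below is a small, self-contained sketch (not part of the original snippet) of the pattern used above: building a legacy torchtext Vocab with extra special tokens and then overwriting their embedding rows by hand. The word list and the 300-dimensional embedding are illustrative, mirroring the fastText wiki.en.vec vectors.

from collections import Counter

import torch
from torchtext.vocab import Vocab

# Illustrative tokens; in the snippet above, `all_words` comes from the loaded envs.
words = "the beer pours a nice golden color and the beer smells great".split()
vocab = Vocab(Counter(words),
              specials=['<pad>', '<unk>', '<art_negative>', '<art_positive>'],
              min_freq=1)

# No pre-trained vectors are attached here, so allocate an embedding matrix manually
# (dimension 300 mirrors the fastText wiki.en.vec vectors used above).
vocab.vectors = torch.zeros(len(vocab), 300)
vocab.vectors[vocab.stoi['<art_negative>']] = torch.rand(300)
vocab.vectors[vocab.stoi['<art_positive>']] = torch.rand(300)
print(vocab.vectors.size())  # torch.Size([vocab size, 300])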
Example 2
 def build_vocab(self, counter, **kwargs):
     return Vocab(counter, specials=self.specials, **kwargs)
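
A self-contained sketch (an assumption, not the original field class) of how such a build_vocab helper is typically called: the owning object supplies self.specials and forwards keyword arguments such as min_freq to the legacy torchtext Vocab.

from collections import Counter

from torchtext.vocab import Vocab


class SpecialsHolder:
    """Hypothetical owner of `self.specials`, mirroring the method above."""

    def __init__(self, specials=('<unk>', '<pad>', '<bos>', '<eos>')):
        self.specials = list(specials)

    def build_vocab(self, counter, **kwargs):
        return Vocab(counter, specials=self.specials, **kwargs)


holder = SpecialsHolder()
vocab = holder.build_vocab(Counter("a b a c a b".split()), min_freq=2)
print(vocab.itos)  # specials first, then tokens meeting min_freq: ['<unk>', '<pad>', '<bos>', '<eos>', 'a', 'b']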
Example 3
    def preprocess(self, ex, is_test):

        src_session = []
        segs_session = []
        txt_session = []

        session = ex["session"]
        dialogue = ex["dialogue"]
        topic_info = ex["topic_info"]

        if "summary" in ex.keys():
            tgt = ex["summary"]["id"][:self.args.max_tgt_len][:-1] + [2]
            tgt_txt = ex["summary"]["content_tokens"]
            if "ex_labels" in ex["summary"].keys():
                tgt_labels = ex["summary"]["ex_labels"]
                topic_summ_info = ex["summary"]["topic_summ_info"]
                if len(tgt_labels) == 0:
                    return None
            else:
                tgt_labels, topic_summ_info = None, None
        else:
            tgt, tgt_txt, tgt_labels, topic_summ_info = None, None, None, None

        src_ex = dialogue['src_id']
        segs_ex = dialogue['segs']

        end_id = [src_ex[-1]]
        src_ex = src_ex[:-1][:self.args.max_pos - 1] + end_id
        segs_ex = segs_ex[:self.args.max_pos]

        if self.args.copy_attn:

            # build dynamic dict
            ex_vocab = Vocab(Counter(src_ex), specials=[0])

            src_map = [ex_vocab.stoi[w] for w in src_ex]

            if tgt is not None:
                align = [0] + [
                    ex_vocab.stoi[w] if w in ex_vocab.stoi.keys() else 0
                    for w in tgt[1:-1]
                ] + [0]
            else:
                align = None

        for turn in session:
            index = turn['index']
            src = turn['src_id']
            segs = turn['segs']
            original_txt = turn['original_txt']
            role = turn['role']
            end_id = [src[-1]]
            src = src[:-1][:self.args.max_pos - 1] + end_id
            segs = segs[:self.args.max_pos]
            if role == '客服':  # '客服' = customer-service agent
                original_txt = "(" + str(index) + ') 【客服】 ' + original_txt
            else:  # '客户' = customer
                original_txt = "(" + str(index) + ') 【客户】 ' + original_txt

            src_session.append(src)
            segs_session.append(segs)
            txt_session.append(original_txt)

        if self.args.copy_attn:
            return src_ex, segs_ex, src_session, segs_session, txt_session, \
                tgt, tgt_txt, tgt_labels, topic_info, topic_summ_info, src_map, align, ex_vocab
        else:
            return src_ex, segs_ex, src_session, segs_session, txt_session, \
                tgt, tgt_txt, tgt_labels, topic_info, topic_summ_info
Example 4
    def _parse_data(self, data_dir):
        # Should parse the data in data_dir, create two dataframes with the format specified in
        # __init__(), and set all the variables so that run.ipynb runs as it is.
        #
        # NOTE! I strongly suggest that you create multiple functions for taking care
        # of the parsing needed here. Avoid creating a huge block of code here; try instead to
        # identify the separate functions needed.

        def get_paths(top):
            '''Returns train/val/test lists of paths to all xml files in all subfolders of the input top folder.'''

            # Walk through data dirs to collect paths to train data
            train_paths = []
            for directory in next(os.walk(os.path.join(data_dir, 'Train')))[1]:
                for file in next(
                        os.walk(os.path.join(data_dir, 'Train',
                                             directory)))[2]:
                    if file.endswith('.xml'):
                        train_paths.append(
                            os.path.join(data_dir, 'Train', directory, file))

            # Reserve 10% as val data
            random.shuffle(train_paths)
            val_ix = len(train_paths) // 10
            val_paths = train_paths[:val_ix]
            train_paths = train_paths[val_ix:]

            # Repeat for test data
            test_paths = []
            for directory in next(
                    os.walk(
                        os.path.join(data_dir, 'Test',
                                     'Test for DrugNER task')))[1]:
                for file in next(
                        os.walk(
                            os.path.join(data_dir, 'Test',
                                         'Test for DrugNER task',
                                         directory)))[2]:
                    if file.endswith('.xml'):
                        test_paths.append(
                            os.path.join(data_dir, 'Test',
                                         'Test for DrugNER task', directory,
                                         file))

            return train_paths, val_paths, test_paths

        def update_data_df(sentence, data_df, max_len, pos_tags):
            '''Updates metadata from an etree sentence element'''
            spacy_parsed = nlp(sentence.attrib['text'])
            tokenized = [token.text for token in spacy_parsed]
            max_len = max(len(tokenized), max_len)

            #update dataframe
            for token in spacy_parsed:
                new_data_row = pd.Series([
                    sentence.get('id'), token.text,
                    int(token.idx),
                    int(token.idx + len(token.text)), splt
                ])
                data_df = data_df.append(new_data_row, ignore_index=True)
                pos_tags.append(token.tag_)

            return data_df, max_len, pos_tags

        def update_ner_df(sentence, ner_df):
            '''Updates ner metadata from an etree sentence element'''
            entities = sentence.findall('entity')

            for ent in entities:
                ner_type = ent.get('type')
                ner_spans = ent.get('charOffset').split(';')

                #update dataframe
                for span in ner_spans:
                    ner_char_start_id = span.split('-')[0]
                    ner_char_end_id = span.split('-')[1]
                    new_ner_row = pd.Series([
                        sentence.get('id'), ner_type,
                        int(ner_char_start_id),
                        int(ner_char_end_id)
                    ])
                    ner_df = ner_df.append(new_ner_row, ignore_index=True)

            return ner_df

        print('Initializing...')

        pos_tags = []
        max_len = 0

        data_df = pd.DataFrame()
        ner_df = pd.DataFrame()

        nlp = spacy.load('en_core_sci_md')

        #get files to process
        train_paths, val_paths, test_paths = get_paths(data_dir)

        #parse xml
        for paths in [train_paths, val_paths, test_paths]:
            splt = 'Train' if paths == train_paths else 'Val' if paths == val_paths else 'Test'
            print('Processing {} data...'.format(splt))
            for file in paths:
                tree = ET.parse(file)
                root = tree.getroot()

                for sentence in root.findall('sentence'):
                    data_df, max_len, pos_tags = update_data_df(
                        sentence, data_df, max_len, pos_tags)
                    ner_df = update_ner_df(sentence, ner_df)

        # Finalize data_df & vocab
        data_df.columns = [
            "sentence_id", "token_id", "char_start_id", "char_end_id", "split"
        ]
        counter = Counter(
            data_df.token_id)  #token_id are actual tokens at this point
        vocab = Vocab(counter)
        word2id = vocab.stoi
        id2word = vocab.itos
        data_df.token_id = [word2id[w]
                            for w in data_df.token_id]  #convert tokens to ids

        # Finalize ner_df
        ner_df.columns = [
            "sentence_id", "ner_id", "char_start_id", "char_end_id"
        ]
        ner2id = {'NEG': 0, 'drug': 1, 'drug_n': 2, 'brand': 3, 'group': 4}
        id2ner = {i: n for (n, i) in ner2id.items()}
        ner_df.ner_id = [ner2id[w]
                         for w in ner_df.ner_id]  #convert entities to ids

        # set variables
        setattr(self, 'data_df', data_df)
        setattr(self, 'ner_df', ner_df)
        setattr(self, 'vocab', id2word)
        setattr(self, 'id2ner', id2ner)
        setattr(self, 'max_sample_length', max_len)
        setattr(self, 'id2word', id2word)
        setattr(self, 'pos_tags', pos_tags)

        print('Done!')
Example 5
def make_corpra_vocab(logger,
                      tokenizer,
                      vectors_cache=None,
                      min_freq=None,
                      corpra_cache=None,
                      corpra_object=None,
                      corpus_type=None):
    """A helper function to create torchtext vocab objects from benchmark texts.
    Combines pre-trained embedding vectors with torch objects.

    Args:
        corpra_cache: a list of os paths to corpra, optional. If not provided, a torch vocab (corpra_object) is required.
        logger: a logging object
        tokenizer: a torchtext tokenizer object
        vectors_cache: an os path to the pre-trained embedding
        min_freq: an integer such as 1 or 5, If none, value is 1.
        corpra_type: a string, name of the corpus


    Returns: v, a torchtext objecting having global vocabulary, lookup tables, and embedding layers

    """
    logger.info(
        'Starting to parse corpra into vocab and iterable objects. This may take a while.'
    )
    corpra = {}
    counter = Counter()
    min_freq = 1 if min_freq is None else min_freq
    logger.info(f'Loading vectors from {vectors_cache}.')
    vectors = Vectors(vectors_cache)

    # forcing imdb to run from corpus object
    if corpra_cache is not None and 'imdb' not in corpus_type:

        for corpus_cache in corpra_cache:
            logger.info(f'Reading corpus cache from {corpus_cache}')
            key = 'train' if '.train.' in corpus_cache else 'test' if '.test.' in corpus_cache else 'valid'
            corpus = []
            logger.info(f'Tokenizing and making vocabulary for {key} set.')
            with open(corpus_cache, 'r') as f:
                for line in f:
                    tokens = tokenizer(line)
                    counter.update(tokens)
                    corpus.extend(tokens)
            corpra.update({key: corpus})

    elif corpra_object is not None:

        def corpra_key(x, o):
            if len(o) == 2:
                return 'train' if x == 0 else 'test'
            else:
                return 'train' if x == 0 else 'valid' if x == 1 else 'test'

        for idx, corpus_object in enumerate(corpra_object):
            key = corpra_key(idx, corpra_object)
            corpus = []
            logger.info(f'Tokenizing and making vocabulary for {key} set.')

            if corpus_type == 'imdb':

                for line in corpus_object:
                    tokens = tokenizer(line[1])
                    counter.update(tokens)
                    labels_tokens = tuple((line[0], tokens))
                    corpus.append(labels_tokens)
                corpra.update({key: corpus})
            else:

                for line in corpus_object:
                    counter.update(tokenizer(line))
                    corpus.extend(tokenizer(line))
                corpra.update({key: corpus})

    v = Vocab(counter,
              min_freq=min_freq,
              vectors=vectors,
              vectors_cache=vectors_cache)

    text_pipeline = lambda x: [v[token] for token in tokenizer(x)]
    label_code = lambda x: 0 if x == 'neg' else 1 if x == 'pos' else 2
    corpra_numeric = {}
    corpra_labels = {}

    for data_set, corpus in corpra.items():
        logger.info(
            f'Converting string tokens to numeric tokens for {data_set}.')
        corpus_numeric = []
        corpus_labels = {}
        if corpus_type == "imdb":
            for idx, line in enumerate(corpus):
                tokens = str(line[1])
                label = torch.tensor(label_code(str(line[0])),
                                     dtype=torch.long)
                numeric_tokens = torch.tensor(text_pipeline(tokens),
                                              dtype=torch.long)
                # labels_tokens = tuple((label, numeric_tokens))
                corpus_numeric.append(numeric_tokens)
                # idx_labels = tuple((idx, label))
                corpus_labels.update({idx: label})

            corpra_numeric.update({data_set: corpus_numeric})
            corpra_labels.update({data_set: corpus_labels})

        else:

            for line in corpus:
                numeric_tokens = text_pipeline(line)
                corpus_numeric.extend(numeric_tokens)

            corpus_numeric = torch.tensor(corpus_numeric, dtype=torch.long)
            corpra_numeric.update({data_set: corpus_numeric})

    logger.info(
        f'Generated torch Vocab object with dictionary size of {len(v.stoi)}.')

    random_word = random.choice(v.itos)
    random_word_index = v.stoi[random_word]
    random_word_curr_vector = v.vectors[random_word_index]
    random_word_orig_vector = vectors[random_word]
    # the torch vocab object has mapped the vocab index to the embedding layer
    assert torch.equal(random_word_curr_vector, random_word_orig_vector)

    if corpus_type == 'imdb':
        return v, corpra_numeric, corpra_labels
    else:
        return v, corpra_numeric
Example 6
def build_vocab(filepath, tokenizer):
    counter = Counter()
    with io.open(filepath, encoding="utf8") as f:
        for string_ in f:
            counter.update(tokenizer(string_))
    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
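
A hypothetical usage sketch for build_vocab above; it writes a tiny temporary corpus file so the call is self-contained (the file name is a placeholder), and relies on the torchtext basic_english tokenizer.

import io

from torchtext.data.utils import get_tokenizer

# Write a tiny corpus to disk so build_vocab has something to read.
with io.open('tiny_corpus.txt', 'w', encoding='utf8') as f:
    f.write("the cat sat on the mat\nthe dog sat on the log\n")

tokenizer = get_tokenizer('basic_english')
vocab = build_vocab('tiny_corpus.txt', tokenizer)
print(len(vocab))                 # specials plus unique corpus tokens
print(vocab.stoi['<bos>'], vocab.stoi['the'])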
Example 7
#print(train_data.shape)

train_data = train_data.sort_values(by="Name", key=lambda x: x.str.len())
test_data = test_data.sort_values(by="Name", key=lambda x: x.str.len())

max_length_test = len(test_data.iloc[-1]['Name'])
max_length_train = len(train_data.iloc[-1]['Name'])
max_length = max(max_length_test, max_length_train)

unique = list(set("".join(train_data.iloc[:,0])))
unique.sort()
vocab = dict(zip(unique, range(1,len(unique)+1)))

tokenizer = get_tokenizer('basic_english')

vocab_new = Vocab(vocab,specials=())

def data_process(raw_text_iter,max_len=128):
  batch = []
  for item in raw_text_iter:
    res = []
    for i in range(max_len):
      if (len(item)>i):
        res.extend([vocab_new[token] for token in tokenizer(item[i])])
      else:
        res.extend([0])
    batch.append(res)
  pad_data = torch.FloatTensor(batch)
  return pad_data

import numpy as np
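
A hypothetical usage sketch of data_process above, assuming vocab_new and tokenizer have been built as in the preceding lines; the example names are placeholders.

# Each name is encoded character by character and padded with zeros up to max_len.
sample_names = ["Anna", "Christopher"]  # placeholder inputs
encoded = data_process(sample_names, max_len=16)
print(encoded.shape)  # expected: torch.Size([2, 16]), one slot per character position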
Example 8
 def build_vocab(self) -> None:
     specials = [REDUCE, SHIFT]
     for nonterm in self.nonterm_field.vocab.stoi:
         specials.append(NT(nonterm))
     self.vocab = Vocab(Counter(), specials=specials)
Example 9
def _build_fields_vocab(fields,
                        counters,
                        data_type,
                        share_vocab,
                        vocab_size_multiple,
                        src_vocab_size,
                        src_words_min_frequency,
                        conv_vocab_size,
                        conv_words_min_frequency,
                        tgt_vocab_size,
                        tgt_words_min_frequency,
                        subword_prefix="▁",
                        subword_prefix_is_joiner=False):
    build_fv_args = defaultdict(dict)
    build_fv_args["src"] = dict(max_size=src_vocab_size,
                                min_freq=src_words_min_frequency)
    build_fv_args["conv"] = dict(max_size=conv_vocab_size,
                                 min_freq=conv_words_min_frequency)
    build_fv_args["tgt"] = dict(max_size=tgt_vocab_size,
                                min_freq=tgt_words_min_frequency)
    tgt_multifield = fields["tgt"]
    _build_fv_from_multifield(
        tgt_multifield,
        counters,
        build_fv_args,
        size_multiple=vocab_size_multiple if not share_vocab else 1)

    if fields.get("corpus_id", False):
        fields["corpus_id"].vocab = fields["corpus_id"].vocab_cls(
            counters["corpus_id"])

    if data_type == 'text':
        src_multifield = fields["src"]
        _build_fv_from_multifield(
            src_multifield,
            counters,
            build_fv_args,
            size_multiple=vocab_size_multiple if not share_vocab else 1)

        conv_multifield = fields["conv"]
        _build_fv_from_multifield(
            conv_multifield,
            counters,
            build_fv_args,
            size_multiple=vocab_size_multiple if not share_vocab else 1)

        src_field = src_multifield.base_field
        conv_field = conv_multifield.base_field
        tgt_field = tgt_multifield.base_field

        if share_vocab:
            # `tgt_vocab_size` is ignored when sharing vocabularies
            logger.info(" * merging src and tgt vocab...")

            _merge_field_vocabs(src_field,
                                conv_field,
                                tgt_field,
                                vocab_size=src_vocab_size,
                                min_freq=src_words_min_frequency,
                                vocab_size_multiple=vocab_size_multiple)
            logger.info(" * merged vocab size: %d." % len(src_field.vocab))
        else:
            logger.info(" * merging src and conv vocab...")
            merged = sum([
                Counter(dict(
                    src_field.vocab.freqs.most_common(src_vocab_size))),
                Counter(
                    dict(conv_field.vocab.freqs.most_common(conv_vocab_size)))
            ], Counter())
            merged_vocab = Vocab(merged)
            src_field.vocab = merged_vocab
            conv_field.vocab = merged_vocab

        build_noise_field(src_multifield.base_field,
                          subword_prefix=subword_prefix,
                          is_joiner=subword_prefix_is_joiner)
    return fields
Example 10
def build_vocab(counter):
    from torchtext.vocab import Vocab
    vocab = Vocab(counter=counter, specials=[], vectors=None)
    return vocab
def _dynamic_dict(example,
                  ques_field,
                  ans_field,
                  tgt_field,
                  max_par_arc_size=20):
    """Create copy-vocab and numericalize with it.

    In-place adds ``"src_map"`` and ``"src_ex_vocab"`` to ``example``. The
    former is the copy-vocab numericalization of the tokenized
    ``example["ques"]`` and ``example["ans"]``. If ``example`` has a
    ``"tgt"`` key, adds ``"alignment"`` to example. That is the copy-vocab
    numericalization of the tokenized ``example["tgt"]``. The alignment has
    an initial and final UNK token to match the BOS and EOS tokens.

    Args:
        example (dict): An example dictionary with ``"ques"`` and ``"ans"``
            keys and maybe a ``"tgt"`` key. (This argument changes in place!)
        ques_field (torchtext.data.Field): Field object.
        ans_field (torchtext.data.Field): Field object.
        tgt_field (torchtext.data.Field): Field object.

    Returns:
        torchtext.data.Vocab and ``example``, changed as described.
    """
    #print(example.keys())
    #print('ans', example["ans"])
    #print('ques', example["ques"])
    if isinstance(example["ques"], str):
        ques = ques_field.tokenize(example["ques"])
    else:
        # confnet
        ques = example["ques"]
        ques_weights = example["score"]
        #example["ques"] = [ques, ques_weights]

    if isinstance(example["ans"], str):
        ans = ans_field.tokenize(example["ans"])
    else:
        # confnet
        ans = example["ans"]

    # make a small vocab containing just the tokens in the source sequence
    unk = ans_field.unk_token
    pad = ans_field.pad_token
    assert unk == ques_field.unk_token
    assert pad == ques_field.pad_token
    """
    if isinstance(example["ans"], str):
        src_ex_vocab = Vocab(Counter([w for par_arcs in ques for w in par_arcs ] + ans), specials=[unk, pad])
        ans_map = [src_ex_vocab.stoi[w] for w in ans]
    else:
        src_count = Counter([w for par_arcs in ques for w in par_arcs] + [w for par_arcs in ans for w in par_arcs])
        src_ex_vocab = Vocab(src_count, specials=[unk, pad])
        ans_map = torch.LongTensor([[src_ex_vocab.stoi[w] for w in par_arcs] for par_arcs in ques])

    if isinstance(example["ques"], str):
        if isinstance(example["ans"], str):
            src_ex_vocab = Vocab(Counter(ques+ans), specials=[unk, pad])
            ques_map = [src_ex_vocab.stoi[w] for w in ques]
            ans_map = [src_ex_vocab.stoi[w] for w in ques]

        elif isinstance(example["ans"], list):
            src_count = Counter(ques + [w for par_arcs in ans for w in par_arcs])
            src_ex_vocab = Vocab(src_count, specials=[unk, pad])

            ques_map = [[src_ex_vocab.stoi[w]]+[pad]*(max_par_arc_size-1) for w in ques]
            ans_map = [[src_ex_vocab.stoi[w] for w in par_arcs]+[pad]*(max_par_arc_size-len(par_arcs)) for par_arcs in ans]

    elif isinstance(example["ques"], list):
        if isinstance(example["ans"], str):
            src_count = Counter(ans + [w for par_arcs in ques for w in par_arcs])
            src_ex_vocab = Vocab(src_count, specials=[unk, pad])
            temp_map = [src_ex_vocab.stoi[w] for w in ans]
            ans_map = [[src_ex_vocab.stoi[w]] + [src_ex_vocab.stoi[pad]] * (max_par_arc_size - 1) for w in ans]
            ques_map = [[src_ex_vocab.stoi[w] for w in par_arcs] + [src_ex_vocab.stoi[pad]] * (max_par_arc_size - len(par_arcs)) for
                       par_arcs in ques]

            ans_map_weights = [[1.0] + [0] * (max_par_arc_size - 1) for w in ans]
            ques_map_weights =  [[w for w in par_arcs] + [0] * (max_par_arc_size - len(par_arcs)) for
                       par_arcs in ques_weights]
    
        elif isinstance(example["ans"], list):
            src_count = Counter([w for par_arcs in ques for w in par_arcs] + [w for par_arcs in ans for w in par_arcs])
            src_ex_vocab = Vocab(src_count, specials=[unk, pad])
            ans_map = [[src_ex_vocab.stoi[w] for w in par_arcs] + [pad] * (max_par_arc_size - len(par_arcs)) for par_arcs in ans]
            ques_map = [[src_ex_vocab.stoi[w] for w in par_arcs] + [pad] * (max_par_arc_size - len(par_arcs)) for
                        par_arcs in ques]
    """

    # NOTE: the active code below assumes the confusion-network ("confnet")
    # form of `ques` (a list of parallel arcs) together with per-arc `ques_weights`.
    src_count = Counter(ans + [w for par_arcs in ques for w in par_arcs])
    src_ex_vocab = Vocab(src_count, specials=[unk, pad])
    ans_map = [[src_ex_vocab.stoi[w]] + [src_ex_vocab.stoi[pad]] *
               (max_par_arc_size - 1) for w in ans]
    ques_map = [[src_ex_vocab.stoi[w] for w in par_arcs] +
                [src_ex_vocab.stoi[pad]] * (max_par_arc_size - len(par_arcs))
                for par_arcs in ques]

    ans_map_weights = [[1.0] + [0] * (max_par_arc_size - 1) for w in ans]
    ques_map_weights = [[w for w in par_arcs] + [0] *
                        (max_par_arc_size - len(par_arcs))
                        for par_arcs in ques_weights]

    unk_idx = src_ex_vocab.stoi[unk]
    # Map source tokens to indices in the dynamic dict.
    src_map = torch.cat(
        (torch.LongTensor(ques_map), torch.LongTensor(ans_map)), dim=0)
    src_map_weights = torch.cat((torch.FloatTensor(ques_map_weights),
                                 torch.FloatTensor(ans_map_weights)),
                                dim=0)
    example["src_map"] = [src_map, src_map_weights]
    example["src_ex_vocab"] = src_ex_vocab

    if "tgt" in example:
        tgt = tgt_field.tokenize(example["tgt"])
        mask = torch.LongTensor([unk_idx] +
                                [src_ex_vocab.stoi[w]
                                 for w in tgt] + [unk_idx])
        example["alignment"] = mask
    return src_ex_vocab, example
Example 12
from collections import Counter

import torch.optim as optim
from torchtext.vocab import Vocab
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from config import *
from utils import read_conll_sentence, prepare_dataset, train, evaluate
from models import BiLSTMTagger, BiLSTMCRFTagger

if __name__ == '__main__':
    # load a list of sentences, where each sentence is a tuple of (word list, label list)
    train_data = list(read_conll_sentence(TRAIN_DATA))
    train_word_counter = Counter(
        [word for sent in train_data for word in sent[0]])
    train_label_counter = Counter(
        [label for sent in train_data for label in sent[1]])
    word_vocab = Vocab(train_word_counter, specials=(UNK, PAD), min_freq=2)
    label_vocab = Vocab(train_label_counter, specials=(), min_freq=1)
    train_data = prepare_dataset(train_data, word_vocab, label_vocab)
    print('Train word vocab:', len(word_vocab), 'symbols.')
    print('Train label vocab:', len(label_vocab),
          f'symbols: {list(label_vocab.stoi.keys())}')
    valid_data = list(read_conll_sentence(VALID_DATA))
    valid_data = prepare_dataset(valid_data, word_vocab, label_vocab)
    print('Train data:', len(train_data), 'sentences.')
    print('Valid data:', len(valid_data))

    print(' '.join([word_vocab.itos[i.item()] for i in train_data[0][0]]))
    print(' '.join([label_vocab.itos[i.item()] for i in train_data[0][1]]))

    print(' '.join([word_vocab.itos[i.item()] for i in valid_data[1][0]]))
    print(' '.join([label_vocab.itos[i.item()] for i in valid_data[1][1]]))
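
A small self-contained illustration (not from the original script) of the min_freq behaviour relied on above: with the legacy torchtext Vocab, words seen fewer than min_freq times are dropped from itos and fall back to the UNK index via the defaultdict stoi. The '<unk>'/'<pad>' strings stand in for the UNK and PAD constants imported from config.

from collections import Counter

from torchtext.vocab import Vocab

counter = Counter("la la la land rare".split())
vocab = Vocab(counter, specials=('<unk>', '<pad>'), min_freq=2)
print(vocab.itos)          # ['<unk>', '<pad>', 'la'] -- 'land' and 'rare' are too rare
print(vocab.stoi['rare'])  # 0, i.e. the '<unk>' index (stoi is a defaultdict)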
Example 13
        next(reader)
        for text, title in reader:
            yield Example.fromlist([text, title], [('text', text_field),
                                                   ('title', text_field)])


if config['model']['embedding']['name'] == 'bpe':
    bpe = BPEmb(lang='ru', vs=VOCAB_SIZE - 1, dim=EMB_DIM, add_pad_emb=True)

    text_field = Field(init_token=SOS_TOKEN,
                       eos_token=EOS_TOKEN,
                       tokenize=bpe.encode,
                       pad_token=PAD_TOKEN,
                       include_lengths=True,
                       batch_first=True)
    text_field.vocab = Vocab(Counter(bpe.words))

    embedding = nn.Embedding.from_pretrained(
        torch.tensor(bpe.vectors, dtype=torch.float32))
    embedding.to(DEVICE)
elif config['model']['embedding']['name'] == 'embedding':
    text_field = Field(init_token=SOS_TOKEN,
                       eos_token=EOS_TOKEN,
                       pad_token=PAD_TOKEN,
                       include_lengths=True,
                       batch_first=True)
    train_dataset = TabularDataset(TRAIN_DATA_PATH,
                                   format='csv',
                                   fields=[('text', text_field),
                                           ('title', text_field)])
    train_iterator = BucketIterator(train_dataset,
Example 14
    def __init__(self,
                 meta_path,
                 text_path,
                 audio_path,
                 video_path,
                 train_set,
                 vocab_freq,
                 modality='v',
                 init_token='<s>',
                 eos_token='</s>',
                 pad_token='<pad>',
                 unk_token='<unk>',
                 max_len=50,
                 context='',
                 on_memory=True,
                 *args,
                 **kwargs):

        self.meta_path = meta_path
        self.text_path = text_path
        self.audio_path = audio_path
        self.video_path = video_path
        self.train_set = train_set
        self.vocab_min_freq = vocab_freq
        self.modality = modality
        self.max_len = max_len
        self.context = context
        self.on_memory = on_memory

        # text data
        self.meta_data = pd.read_csv(meta_path)

        # create vocab from data (no parent train_set was passed, so this is the training set)
        if not train_set:
            text_sentences = self.meta_data['text']
            caption_sentences = self.meta_data['caption']
            text_counter = collections.Counter()
            caption_counter = collections.Counter()

            for each in text_sentences:
                if not pd.isnull(each):
                    text_counter.update(each.lower().split())
            for each in caption_sentences:
                caption_counter.update(each.lower().split())
            # for each in text_counter:
            #     if text_counter[each] < vocab_freq:
            #         continue
            #     caption_counter[each] += text_counter[each]

            self.caption_vocab = Vocab(
                caption_counter,
                min_freq=1,
                specials=[unk_token, init_token, eos_token, pad_token])
            self.text_vocab = self.caption_vocab
            #self.text_vocab = Vocab(
            #    text_counter, min_freq=vocab_freq,
            #    specials=[unk_token, init_token, eos_token, pad_token]
            #)
            self.n_src_token = len(self.text_vocab)
            self.n_tgt_token = len(self.caption_vocab)

        # validation set should inherit the vocab from train set
        else:
            self.text_vocab = train_set.text_vocab
            self.caption_vocab = train_set.caption_vocab
            self.n_src_token = train_set.n_src_token
            self.n_tgt_token = train_set.n_tgt_token

        self.init_idx = self.text_vocab.stoi[init_token]
        self.eos_idx = self.text_vocab.stoi[eos_token]
        self.pad_idx = self.text_vocab.stoi[pad_token]
        self.unk_idx = self.text_vocab.stoi[unk_token]

        self.text_dataset = json.load(open(
            self.text_path)) if self.text_path else {}
        self.audio_dataset = AudioDataset(audio_path, self.pad_idx,
                                          self.max_len)
        self.video_dataset = VideoDataset(video_path, self.pad_idx,
                                          self.max_len)
Example 15
class SNLIDataset(data.Dataset):
    '''
    This is a dataset class for SNLI 1.0 data.
    The dataset contains pairs of sentences, and the label is their inference relation, one of ["neutral", "contradiction", "entailment"].
    The data is loaded and processed using a spacy or Bert tokenizer.
    There are two possible sentence constructions: one concatenated combined sentence, or two separate sentences.
    '''

    #constants
    LABELS=["neutral", "contradiction", "entailment"]
    SEP_TOKEN='<sep>'
    UNK_TOKEN='<unk>'
    PAD_TOKEN='<pad>'
    MAX_LEN=512
    GLOVE_EMBEDDINGS='glove.42B.300d'
    SPACY_TOKENIZER="en_core_web_sm"
    MAX_SIZE_VOCAB=10000
    MIN_FREQ_VOCAB=2



    def __init__(self, data_path, saved_dir, device, eng_mode='one_sentence', vocab_external=None, tokenized_datapoints_file=None, vocab_file=None):

        self.data_size=0
        #Load data
        self.datapoints=self.load_data(data_path)
        self.vocab = None
        #initial tokenizer is always spacy
        self.tokenizer_type = 'spacy'
        self.eng_mode = eng_mode
        self.vocab_file = vocab_file
        self.vocab_external=vocab_external
        self.device = device
        self.saved_dir = saved_dir
        #Prepare tokenizer
        self.change_tokenizer_and_vocab(tokenizer=self.tokenizer_type, eng_mode=eng_mode)
        #prepare in advance tokenized sentences for better performance.
        if tokenized_datapoints_file is None:
            self.tokenized_datapoints=self.prepare_tokenized_datapoints()
        else:
            self.tokenized_datapoints=utils.load_from_pickle(os.path.join(self.saved_dir, tokenized_datapoints_file))


        #self.spacy_tokenized_combined_sentences=
        #self.spacy_tokenized_two_sentences=

    def load_data(self, data_path):
        '''
        Load data from the dataset json file
        :param data_path: data path to dataset json file
        :return: list of tuples of form (sentence1, sentence2, label)
        '''
        all_datapoints= [] #{'sentence1':[], 'sentence2':[], 'label':[]}

        with io.open(data_path, encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                line = json.loads(line)
                #check that label is valid, otherwise discard
                if line['gold_label'] in self.LABELS:
                    sentence1=line['sentence1']
                    sentence2 = line['sentence2']
                    label = line['gold_label']
                    all_datapoints.append((sentence1,sentence2, label))
                    self.data_size+=1

        return all_datapoints


    def shuffle_sort_datapoints(self):
        '''
        Shuffle the datapoints, then sort them by combined sentence length
        (so that batches contain sentences of similar length).
        :return: None
        '''

        len_datapoint = lambda datapoint: len(datapoint[0]) + len(datapoint[1])

        if self.tokenizer_type=='bert':
            random.shuffle(self.datapoints)
            self.datapoints = sorted(self.datapoints, key=len_datapoint)

        elif self.tokenizer_type=='spacy':
            random.shuffle(self.tokenized_datapoints)
            self.tokenized_datapoints = sorted(self.tokenized_datapoints, key=len_datapoint)


    def transform_label(self, string_label):

        return self.LABELS.index(string_label)

    def prepare_spacy_vocab(self, vocab_external=None, vocab_file=None):
        self.tokenizer_type = 'spacy'
        self.tokenizer = spacy.load(self.SPACY_TOKENIZER)
        if not vocab_external:
            if vocab_file is None:
                counter = Counter()
                print('Creating vocab')
                for ind in range(self.data_size):
                    sentence1 = self.datapoints[ind][0]
                    sentence2 = self.datapoints[ind][1]
                    list_sentence1 = [token.text for token in self.tokenizer(sentence1.lower())]
                    counter.update(list_sentence1)
                    list_sentence2 = [token.text for token in self.tokenizer(sentence2.lower())]
                    counter.update(list_sentence2)
                    utils.save_to_pickle(counter, os.path.join(self.saved_dir, "vocab_counter.pkl"))
            else:
                counter = utils.load_from_pickle(os.path.join(self.saved_dir, vocab_file))
            self.vocab = Vocab(counter, max_size=self.MAX_SIZE_VOCAB, min_freq=self.MIN_FREQ_VOCAB,
                               specials=[self.PAD_TOKEN, self.SEP_TOKEN, self.UNK_TOKEN])
            self.vocab.load_vectors(self.GLOVE_EMBEDDINGS, unk_init=torch.Tensor.random_)
        else:
            self.vocab = vocab_external




    def change_tokenizer_and_vocab(self, tokenizer='bert' ,eng_mode='one_sentence'):
        '''
        Change the tokenizer. Can be 'bert' or 'spacy'. If 'spacy', the English structure can be a one-sentence (concatenated) or a two-sentence construction.
        :param tokenizer: 'bert' or 'spacy'
        :param eng_mode: 'one_sentence' or 'two_sentence'
        :return: None
        '''
        if tokenizer=='spacy':
            self.tokenizer=spacy.load(self.SPACY_TOKENIZER)# get_tokenizer('spacy', language='en')
            self.eng_mode=eng_mode
            self.tokenizer_type = 'spacy'
            self.prepare_spacy_vocab(self.vocab_external, self.vocab_file)
        else:
            self.tokenizer=BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
            self.tokenizer_type = 'bert'


    def prepare_tokenized_datapoints(self):
        '''
        Tokenize the full dataset according to the inner vocabulary
        :return: list containing items of structure (tokenized_sentence1,  tokenized_sentence2, label) of the full dataset
        '''

        tokenized_datapoints=[]
        for datapoint in self.datapoints:
            sentence1, sentence2, label=datapoint
            tokenized_sentence1 = [self.vocab.stoi[token.text] if token.text in self.vocab.stoi.keys() else self.vocab.stoi[self.UNK_TOKEN] \
                                   for token in list(self.tokenizer(sentence1.lower()))]
            tokenized_sentence2 = [self.vocab.stoi[token.text] if token.text in self.vocab.stoi.keys() else self.vocab.stoi[self.UNK_TOKEN]\
                                   for token in list(self.tokenizer(sentence2.lower()))]
            tokenized_datapoints.append((tokenized_sentence1,  tokenized_sentence2, label))

        return tokenized_datapoints

    def __getitem__(self, idx):
        '''
        Get an item from the dataset. This is the essential function of the dataset, run by the dataloader to fetch the item.
        The output depends on the state of the dataset: with the Bert tokenizer the item is fetched untokenized; with spacy it is fetched already tokenized.

        :param idx: index of the item
        :return: item that contains sentence 1, sentence 2, label
        '''

        if idx==0:
            self.shuffle_sort_datapoints()

        if self.tokenizer_type == 'bert':
            #Untokenized sentences
            return self.datapoints[idx]
        else:
            # tokenized spacy sentences
            return self.tokenized_datapoints[idx]


    def __len__(self):

        return self.data_size


    def prepare_eng_sentence(self, tokenized_sentence1,tokenized_sentence2, mode):
        #print(list(self.tokenizer(sentence1.lower())))

        if mode=='one_sentence':
            tokenized_combined_sentence=tokenized_sentence1+[self.vocab.stoi[self.SEP_TOKEN]]+tokenized_sentence2
            return torch.tensor(tokenized_combined_sentence, dtype=torch.long).to(self.device)
        else:
            tokenized_sentence1=tokenized_sentence1+[self.vocab.stoi[self.PAD_TOKEN]]*(self.MAX_LEN-len(tokenized_sentence1))
            tokenized_sentence2 = tokenized_sentence2 + [self.vocab.stoi[self.PAD_TOKEN]] * (self.MAX_LEN - len(tokenized_sentence2))
            return torch.tensor(tokenized_sentence1, dtype=torch.long).to(self.device), torch.tensor(tokenized_sentence2, dtype=torch.long).to(self.device)



    def change_from_external_voc(self, vocab, tokenizer_type, eng_mode):
        '''
        Replace the vocabulary and tokenizer settings with external ones (e.g., from the train set).
        :param vocab: external vocabulary to use
        :param tokenizer_type: 'bert' or 'spacy'
        :param eng_mode: 'one_sentence' or 'two_sentence'
        :return: None
        '''
        self.vocab=vocab
        self.tokenizer_type=tokenizer_type
        self.eng_mode=eng_mode

    def collate_fun(self, batch):
        '''
        The collate function is run by the dataloader immediately after the data is fetched. We pad the fetched sequences to the same length:
        in the one-sentence construction we add a separator token and pad the sequences to the longest length in the batch;
        in the two-sentence construction, the two sentences are padded to the MAX_LEN constant value.
        In the Bert case we call the Bert tokenizer on the batch of inputs to prepare them for the Bert model.
        :param batch: list of (sentence1, sentence2, label) items
        :return: dict with the prepared batch tensors
        '''

        #print(f"Start collate{time.time()}")
        batch_sentence1 = []
        batch_sentence2 = []
        batch_sentence_combined = []
        batch_labels = []
        sentence_masks=[]

        #Loop through the batch and prepare lists of inputs according to tokenizer type and mode. In case of spacy inputs are already tokenized
        for sentence1, sentence2, label in batch:
            batch_labels.append(torch.tensor(self.transform_label(label)))

            if self.tokenizer_type=='spacy':
                if self.eng_mode=='one_sentence':
                    combined_sentence=self.prepare_eng_sentence(sentence1,sentence2, mode=self.eng_mode)
                    batch_sentence_combined.append(combined_sentence)
                else:
                    senetence1_tokenized, sentence2_tokenized=self.prepare_eng_sentence(sentence1,sentence2, mode=self.eng_mode)
                    batch_sentence1.append(senetence1_tokenized)
                    batch_sentence2.append(sentence2_tokenized)
            if self.tokenizer_type=='bert':
                batch_sentence1.append(sentence1)
                batch_sentence2.append(sentence2)


        #Preparing the batch- Handle padding and  for bert also tokenization
        if self.tokenizer_type=='spacy':
                if self.eng_mode=='one_sentence':#Pad the sequence to the length of the longest in batch
                    prepared_batch_sentences= pad_sequence(batch_sentence_combined, batch_first=True, padding_value=self.vocab.stoi[self.PAD_TOKEN])
                    padding_mask=(prepared_batch_sentences==self.vocab.stoi[self.PAD_TOKEN])
                    prepared_batch = {'inputs': prepared_batch_sentences, 'labels': torch.stack(batch_labels),
                                      'attention_padding_mask': padding_mask}
                else:
                    prepared_sentences1 = pad_sequence(batch_sentence1, batch_first=True, padding_value=self.vocab.stoi[self.PAD_TOKEN])
                    prepared_sentences2 = pad_sequence(batch_sentence2, batch_first=True,  padding_value=self.vocab.stoi[self.PAD_TOKEN])
                    prepared_batch = {'inputs_1': prepared_sentences1, 'inputs_2':  prepared_sentences2, 'labels': torch.stack(batch_labels)}
        else:
            #If Bert call the Bert tokenizer the prepares the batch of inputs to th longest length and add separator and classification tokens and masks
            tokenized_sentences=self.tokenizer(batch_sentence1,batch_sentence2,padding='longest', add_special_tokens=True,return_tensors="pt")
            prepared_batch = {'inputs_ids': tokenized_sentences['input_ids'],
                          'token_type_ids': tokenized_sentences['token_type_ids'],
                          'attention_mask': tokenized_sentences['attention_mask'], 'labels': torch.stack(batch_labels)}

        #print(f"End collate{time.time()}")
        return prepared_batch
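
A hypothetical sketch of wiring the dataset above into a PyTorch DataLoader with its collate function; the file paths are placeholders and the batch keys follow the one-sentence spacy branch of collate_fun.

import torch
from torch.utils.data import DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# 'snli_1.0_train.jsonl' and './saved' are placeholder paths.
train_set = SNLIDataset('snli_1.0_train.jsonl', './saved', device,
                        eng_mode='one_sentence')
# shuffle=False because the dataset shuffles and length-sorts itself at idx == 0.
loader = DataLoader(train_set, batch_size=32, shuffle=False,
                    collate_fn=train_set.collate_fun)
batch = next(iter(loader))
print(batch['inputs'].shape, batch['labels'].shape)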
Example 16
def get_vocab(dset):
    counter = Counter()
    tokenizer = get_tokenizer('basic_english')
    for (label, text) in dset:
        counter.update(tokenizer(text))
    return Vocab(counter, min_freq=1), tokenizer
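
A hypothetical usage sketch feeding a tiny in-memory list of (label, text) pairs to get_vocab above.

toy_dset = [(1, "a tiny positive example"), (0, "a tiny negative example")]
vocab, tokenizer = get_vocab(toy_dset)
print(len(vocab))
print([vocab[token] for token in tokenizer("a tiny example")])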
Example 17
def main(path, test=False):
    # Load the data
    cat_dict = {}
    if test:
        print('Test mode')
        cat_dict = pickle.load(open('cat_dict', 'rb'))
        train_df = fn.loading_data(path, cat_dict)
    else:
        train_df, cat_dict = fn.loading_data(path, cat_dict)
    # Load additionally labeled data
    # dop_data_path = 'data/dop_data.parquet'
    # pd.read_excel(r'C:\Users\Женечка\Desktop\Ручная разметка данных\Не размеченные данные.xlsx')[[
    #     'category_id', 'item_name']].dropna().drop_duplicates().to_parquet(dop_data_path, index=False)
    # dop_df = fn.loading_data(dop_data_path, cat_dict)
    # Preliminary data preprocessing
    train_df.item_name = fn.add_ed_izm(train_df[['item_name']]).item_name
    # dop_df.item_name = fn.add_ed_izm(dop_df[['item_name']]).item_name
    # print(list(train_df), list(dop_df))
    # print(dop_df.category_id.map(cat_dict).dropna())
    # dop_df['category_id_new'] = dop_df.category_id.map(cat_dict).astype(int)
    # dop_df = dop_df[['category_id_new', 'item_name']]
    train_df = train_df[train_df.category_id != -1].drop_duplicates(
        subset=['item_name', 'category_id'])
    # train_data = np.vstack((train_df[['category_id_new', 'item_name']].to_numpy(),
    #                         dop_df.to_numpy()))
    train_data = train_df[['category_id_new', 'item_name']].to_numpy()

    tokenizer = get_tokenizer('basic_english')
    counter = Counter()
    for label, line in train_data:
        counter.update(tokenizer(line))
    vocab = Vocab(counter, min_freq=1)

    text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
    label_pipeline = lambda x: int(x)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def collate_batch(batch):
        label_list, text_list, offsets = [], [], [0]
        for (_label, _text) in batch:
            label_list.append(label_pipeline(_label))
            processed_text = torch.tensor(text_pipeline(_text),
                                          dtype=torch.int64)
            text_list.append(processed_text)
            offsets.append(processed_text.size(0))
        label_list = torch.tensor(label_list, dtype=torch.int64)
        offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
        text_list = torch.cat(text_list)
        return label_list.to(device), text_list.to(device), offsets.to(device)

    dataloader = DataLoader(train_data,
                            batch_size=8,
                            shuffle=False,
                            collate_fn=collate_batch)

    num_class = len(cat_dict)
    vocab_size = len(vocab)
    emsize = 64
    model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

    def train(dataloader):
        model.train()
        total_acc, total_count = 0, 0
        log_interval = 500
        start_time = time.time()
        for idx, (label, text, offsets) in enumerate(dataloader):
            optimizer.zero_grad()
            predited_label = model(text, offsets)
            #         print(sorted(label))
            loss = criterion(predited_label, label)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
            optimizer.step()
            total_acc += (predited_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            if idx % log_interval == 0 and idx > 0:
                elapsed = time.time() - start_time
                print('| epoch {:3d} | {:5d}/{:5d} batches '
                      '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                                  total_acc / total_count))
                total_acc, total_count = 0, 0
                start_time = time.time()

    def evaluate(dataloader):
        model.eval()
        total_acc, total_count = 0, 0

        with torch.no_grad():
            for idx, (label, text, offsets) in enumerate(dataloader):
                predited_label = model(text, offsets)
                loss = criterion(predited_label, label)
                total_acc += (predited_label.argmax(1) == label).sum().item()
                total_count += label.size(0)
        return total_acc / total_count

    from torch.utils.data.dataset import random_split
    # Hyperparameters
    EPOCHS = 25  # epoch
    LR = 5  # learning rate
    BATCH_SIZE = 16  # batch size for training

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=LR)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
    total_accu = None
    train_dataset = list(train_data)
    num_train = int(len(train_dataset) * 0.9)
    split_train_, split_valid_ = \
        random_split(train_dataset, [num_train, len(train_dataset) - num_train])

    # print(train_dataset)
    train_dataloader = DataLoader(split_train_,
                                  batch_size=BATCH_SIZE,
                                  shuffle=True,
                                  collate_fn=collate_batch)
    valid_dataloader = DataLoader(split_valid_,
                                  batch_size=BATCH_SIZE,
                                  shuffle=True,
                                  collate_fn=collate_batch)

    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()
        train(train_dataloader)
        accu_val = evaluate(valid_dataloader)
        if total_accu is not None and total_accu > accu_val:
            scheduler.step()
        else:
            total_accu = accu_val
        print('-' * 59)
        print('| end of epoch {:3d} | time: {:5.2f}s | '
              'valid accuracy {:8.3f} '.format(epoch,
                                               time.time() - epoch_start_time,
                                               accu_val))
        print('-' * 59)

    # # Save the test data
    # if ~test:
    #     pickle.dump(train_data.item_name, open('X_test', 'wb'))
    #     pickle.dump(train_data.category_id_new, open('y_test', 'wb'))

    # Save the models
    if not test:
        print('Saving model files')
        pickle.dump(cat_dict, open('cat_dict', 'wb'))
        pickle.dump(vocab, open('vocab', 'wb'))
        pickle.dump(tokenizer, open('tokenizer', 'wb'))
        torch.save(model.state_dict(), 'model')
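
A hypothetical follow-up sketch (not in the original) showing how the artifacts saved at the end of main could be loaded back for inference; it assumes TextClassificationModel and the embedding size (64) match the training run and that the saved file names are unchanged.

import pickle

import torch

# Placeholder file names matching the ones written above.
cat_dict = pickle.load(open('cat_dict', 'rb'))
vocab = pickle.load(open('vocab', 'rb'))
tokenizer = pickle.load(open('tokenizer', 'rb'))

model = TextClassificationModel(len(vocab), 64, len(cat_dict))
model.load_state_dict(torch.load('model'))
model.eval()

text = "example item name"  # placeholder input
with torch.no_grad():
    tokens = torch.tensor([vocab[t] for t in tokenizer(text)], dtype=torch.int64)
    offsets = torch.tensor([0])
    predicted_class = model(tokens, offsets).argmax(1).item()
print(predicted_class)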
Example 18
import collections

import gensim
from torchtext.vocab import Vectors, Vocab

model = gensim.models.KeyedVectors.load_word2vec_format('input/vector.bin',
                                                        binary=True)
print(model['中国'])

# word2vec stored in a human-readable (text) format;
# `word_vector` (path to the text-format vector file) and `wv_path` (the cache
# directory) are assumed to be defined elsewhere.
vectors = Vectors(word_vector, cache=wv_path)
vocab = Vocab(collections.Counter(words),
              vectors=vectors,
              specials=['<pad>', '<unk>'],
              min_freq=1)
wv_size = vocab.vectors.size()
vocab.stoi['<unk>']
Example 19
def main():
    print(torch.__version__)
    cudnn.benchmark = False
    device = torch.device('cuda:{}'.format(params['gpu'][0]))
	
    cur_time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time()))
    logdir = os.path.join(params['log'], cur_time)
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    writer = SummaryWriter(log_dir=logdir)
    #convert_csv(params['dataset'])
    print('Loading dataset')
    train_transform = create_train_transform(False,True,True,True, size=params['size'], bright=False)
    train_dataset = VideoDataset(params['dataset'],size=params['size'], mode='train', play_time=params['clip_len'],frame_sample_rate=params['frame_sample_rate'], transform=train_transform, sub_classnum = params['num_classes'], label_num = params['label_num'], stride_num = params['stride'], use_plot = params['use_plot'])
    train_dataloader = DataLoader(
                        train_dataset,
                        batch_size=params['batch_size'],
                        shuffle=True,
                        num_workers=params['num_workers'],
                        collate_fn = Collate_batch)

    val_transform = create_val_transform(True,size=params['size'])
    val_dataset = VideoDataset(params['dataset'], size=params['size'],mode='validation', play_time=params['clip_len'],frame_sample_rate=params['frame_sample_rate'], transform = val_transform, sub_classnum = params['num_classes'], label_num=params['label_num'], stride_num = params['stride'], use_plot = params['use_plot'])
    val_dataloader = DataLoader(
                        val_dataset,
                        batch_size=params['batch_size'],
                        shuffle=False,
                        num_workers=params['num_workers'],
                        collate_fn = Collate_batch)

    print('train_dataset : batch_size -> {}, step_size -> {}, frames -> {}'.format(params['batch_size'],len(train_dataloader), params['clip_len']))
    print('validation_dataset : batch_size -> {}, step_size -> {}, frames -> {}'.format(params['batch_size'],len(val_dataloader), params['clip_len']))
    print('=========================================================================================================')
    print('Load model : mode -> {}, label_size -> {}, sub_class_num -> {}'.format(params['mode'], params['label_num'], params['num_classes']))
    
    ### regression ###
    
    #model = generate_model('XL', n_classes = params['label_num'])
    
    #### class - one label ###
    model = None
    if params['mode']=='single':
        if params['model']=='slowfast':
            model = slowfastnet.resnet50(class_num=params['num_classes'], label_num = params['label_num'], mode = params['mode'])
        elif params['model']=='x3d':
            model = x3d.generate_model('S', n_classes = params['num_classes'])

    ### multi ###
    elif params['mode']=='multi':
        
        if params['model']=='slowfast':
            model = multi_slowfastnet.resnet50(class_num=params['num_classes'], label_num = params['label_num'], mode = params['mode'])
        
        
        elif params['model']=='x3d':
            if params['use_plot']:
                plots = train_dataset.plots
                #plots.extend(val_dataset.plots)
                plots = set(plots)
                counter = Counter()
                tokenizer = get_tokenizer('basic_english')
                for plot in plots:
                    counter.update(tokenizer(plot))
                vocab = Vocab(counter,min_freq=1)
                train_dataset.generate_text_pipeline(vocab,tokenizer)
                val_dataset.generate_text_pipeline(vocab, tokenizer) 
                model = multi_x3d_plot.generate_model('S', n_classes = params['num_classes'], label_num = params['label_num'], vocab_size = len(vocab))
            else:
                model = multi_x3d.generate_model('S', n_classes = params['num_classes'], label_num = params['label_num'])
        elif params['model']=='x3d_multitask':
            if params['use_plot']:
                plots = train_dataset.plots
                #plots.extend(val_dataset.plots)
                plots = list(set(plots))
                counter = Counter()
                tokenizer = get_tokenizer('basic_english')
                for plot in plots:
                    counter.update(tokenizer(plot))
                vocab = Vocab(counter,min_freq=1)
                train_dataset.generate_text_pipeline(vocab,tokenizer)
                val_dataset.generate_text_pipeline(vocab, tokenizer) 
                model = multi_x3d_plot_multitask_audio.generate_model('S', n_classes = params['num_classes'], label_num = params['label_num'], vocab_size = len(vocab))


        elif params['model']=='slowfast_multitask':
            if params['use_plot']:
                plots = train_dataset.plots
                #plots.extend(val_dataset.plots)
                #plots = list(set(plots))
                counter = Counter()
                tokenizer = get_tokenizer('basic_english')
                for plot in plots:
                    counter.update(tokenizer(plot))
                vocab = Vocab(counter,min_freq=1)
                train_dataset.generate_text_pipeline(vocab,tokenizer)
                val_dataset.generate_text_pipeline(vocab, tokenizer) 
                model = slowfast_lstm.resnet50(class_num=params['num_classes'], label_num = params['label_num'], mode = params['mode'], vocab_size = len(vocab))
                init_weights(model)

        elif params['model'] =='eff':
            model = efficientnet.EfficientNet3D.from_name('efficientnet-b{}'.format(params['eff']), override_params={'num_classes': params['num_classes']}, mode = params['mode'], label_num = params['label_num'])
        

    if params['pretrained'] != '':
        pretrained_dict = torch.load(params['pretrained'], map_location='cpu')
        try:
            model_dict = model.module.state_dict()
        except AttributeError:
            model_dict = model.state_dict()
        pretrained_dict = {k:v for k,v in pretrained_dict.items() if k in model_dict}
        print('load pretrained')
        model_dict.update(pretrained_dict)
        model.load_state_dict(model_dict)

    '''
        ########
        state = torch.load('./saved_model/{}'.format(params['pretrained']))
        model.load_state_dict(state['model'])
    '''

    model = model.to(device)

    ### regression ###
    '''
    criterion = Custom_MSELoss(num_classes = params['num_classes']).cuda()
    '''
    ### classification ###
    if params['mode']=='single':
        '''
        criterion = nn.CrossEntropyLoss(weight = train_dataset.get_class_weight().to(device))
        '''
        criterion =  Custom_CrossEntropyLoss(weight = train_dataset.get_class_weight().to(device))
    elif params['mode']=='multi':

        ### multi-class ##
        
        #criterion = Custom_MultiCrossEntropyLoss(weight = train_dataset.get_class_weight().to(device), label_num=params['label_num'])
    
        criterion1 = Custom_MultiCrossEntropyLoss(weight = train_dataset.get_class_weight2().to(device), label_num=params['label_num'])
        criterion2 = Custom_BCELoss()
        criterion3 = Custom_CrossEntropyLoss(weight = train_dataset.get_age_weight2().to(device))
    #optimizer = optim.SGD(model.parameters(), lr = params['learning_rate'], momentum = params['momentum'], weight_decay = params['weight_decay'])
    #scheduler = optim.lr_scheduler.StepLR(optimizer,  step_size = params['step'], gamma=0.1)

    optimizer = optim.SGD(model.parameters(),lr = params['learning_rate'],weight_decay=params['weight_decay'])
    #optimizer = optim.AdamW(model.parameters(), lr = params['learning_rate'], weight_decay = params['weight_decay'])

    #optimizer = optim.SGDW(model.parameters(), lr = params['learning_rate'], weight_decay = params['weight_decay'])
    #optimizer = SGDP(model.parameters(), lr = params['learning_rate'], weight_decay = params['weight_decay'], momentum=params['momentum'], nesterov=True)
    #optimizer = AdamP(model.parameters(), lr = params['learning_rate'], weight_decay = params['weight_decay'], betas = (0.9, 0.999))
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience = 2, factor = 0.5, verbose=False)
    #scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = 30, eta_min = 0)
    #scheduler = CosineAnnealingWarmUpRestarts(optimizer, T_0=10, eta_max=0.01, T_up=10, gamma=0.5)
    model_save_dir = os.path.join(params['save_path'], 'second')
    if not os.path.exists(model_save_dir):
        os.makedirs(model_save_dir)
    print("train gogosing")
    pre_metric = 0
    for epoch in range(params['epoch_num']):
        train(model, train_dataloader, epoch, criterion1, criterion2, criterion3, optimizer, writer, device, mode = params['mode'], label_num=params['label_num'], display = params['display'])
        if (epoch+1) % 5==0:
            print('======================================================')
            print('Start validation')
            metric= val(model, val_dataloader, epoch, criterion1, criterion2, criterion3, optimizer, writer, device, mode=params['mode'], label_num=params['label_num'])
            #validation_loss, metric = val(model, val_dataloader, epoch, criterion, optimizer, writer, device, mode=params['mode'], label_num=params['label_num'])
            #print('validation loss -> {loss:.5f}, \t f1_score -> {f1_score:.5f}'.format(loss = validation_loss, f1_score = metric))
            #checkpoint = os.path.join(model_save_dir, str(epoch) + '.pth.tar')
            #torch.save(model.state_dict(),checkpoint)
            if metric>pre_metric:
                pre_metric = metric
                if params['model'] == 'eff':
                    save_model(model, optimizer, scheduler, epoch, params['model'] + str(params['eff']))
                else:
                    save_model(model,optimizer,scheduler,epoch, params['model'])
                model = model.to(device)
            print('Total F1_score : {metrics:.5f}'.format(metrics = metric))
            print('======================================================')

            scheduler.step(metric)

        #scheduler.step()

    writer.close()
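As an aside, the validate-every-N-epochs pattern above (save a checkpoint when the F1 score improves, then step ReduceLROnPlateau on that metric with mode='max') reduces to the minimal sketch below; `train_one_epoch` and `evaluate` are hypothetical placeholders, not functions from this project.

import torch
import torch.optim as optim

def fit(model, train_loader, val_loader, device, epochs=30, val_every=5):
    optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-5)
    # mode='max' because the monitored quantity (F1) is better when larger
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max',
                                                     patience=2, factor=0.5)
    best_metric = 0.0
    for epoch in range(epochs):
        train_one_epoch(model, train_loader, optimizer, device)   # placeholder
        if (epoch + 1) % val_every == 0:
            metric = evaluate(model, val_loader, device)          # placeholder
            if metric > best_metric:
                best_metric = metric
                torch.save(model.state_dict(), 'best.pth')        # keep the best checkpoint
            scheduler.step(metric)                                # halve LR when F1 plateaus
    return best_metric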
Esempio n. 20
0
def hyper_parameter_tuning():
    mlp_hidden_dim_arr = [100, 200]
    EPOCHS = 150
    LSTM_LAYERS = [2, 4]
    word_embedding_dim_arr = [100, 200]
    pos_embedding_dim_arr = [25]
    hidden_dim_arr = [125, 175]

    # sanity check
    data_dir = "HW2-files/"
    path_train = data_dir + "train.labeled"
    print("path_train -", path_train)
    path_test = data_dir + "test.labeled"
    print("path_test -", path_test)

    paths_list = [path_train, path_test]
    word_cnt, word_dict, pos_dict = get_vocabs(paths_list)
    train = PosDataset(word_cnt, word_dict, pos_dict, data_dir, 'train')
    # split into validation
    train_set, val_set = torch.utils.data.random_split(train, [4000, 1000])
    train_dataloader = DataLoader(train_set, shuffle=False)  # TODO return to true after debugging
    val_dataloader = DataLoader(val_set, shuffle=False)
    test = PosDataset(word_cnt, word_dict, pos_dict, data_dir, 'test')
    test_dataloader = DataLoader(test, shuffle=False)

    # get pre-trained embedding
    glove = Vocab(Counter(word_dict), vectors="glove.6B.300d", specials=SPECIAL_TOKENS)

    a = next(iter(train_dataloader))
    # a[0] -> word - idx of a sentence
    # a[1] -> pos - idx of a sentence
    # a[2] -> head token per sentence
    assert len(a[0]) == len(a[1]) == len(a[2])

    word_vocab_size = len(train.word2idx)
    print(word_vocab_size)
    tag_vocab_size = len(train.pos_idx_mappings)
    print(tag_vocab_size)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")

    max_acc = 0
    max_mlp_hidden_dim = 0
    max_word_embedding_dim = 0
    max_pos_embedding_dim = 0
    max_hidden_dim = 0
    max_learning_rate = 0
    max_lstm_layers = 0

    for mlp_h_d in mlp_hidden_dim_arr:
        for word_e_d in word_embedding_dim_arr:
            for pos_e_d in pos_embedding_dim_arr:
                for hidden in hidden_dim_arr:
                    for num_lstm_layers in LSTM_LAYERS:
                        # use the grid values rather than the global constants so that each
                        # configuration actually differs (mlp_h_d and num_lstm_layers would also
                        # need to be forwarded if AdvancedDnnDependencyParser accepts them)
                        if USE_PRETRAINED:
                            model = AdvancedDnnDependencyParser(word_e_d, pos_e_d, hidden,
                                                                word_vocab_size, tag_vocab_size,
                                                                word_embedding_table=glove.vectors).to(device)
                        else:
                            model = AdvancedDnnDependencyParser(word_e_d, pos_e_d, hidden,
                                                                word_vocab_size, tag_vocab_size).to(device)
                        if use_cuda:
                            model.cuda()

                        # Define the loss function as the Negative Log Likelihood loss (NLLLoss)
                        loss_function = nn.NLLLoss()

                        # We will be using an Adam optimizer to minimize the loss function
                        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
                        acumulate_grad_steps = 128

                        accuracy_list = []
                        loss_list = []
                        best_val_acc = 0
                        num_epochs_no_improvement = 0
                        for epoch in range(EPOCHS):
                            val_acc = evaluate(model, val_dataloader)
                            if val_acc < best_val_acc:  # no improvement
                                num_epochs_no_improvement += 1
                                if num_epochs_no_improvement >= EARLY_STOPPING:
                                    # best config acc is saved in best_val_acc
                                    print(f"mlp_hidden: {mlp_h_d}, word_emb: {word_e_d}, pos_emb: {pos_e_d}, lstm_hidden: "
                                          f"{hidden}, num_lstm_layers:{num_lstm_layers} -> acc: {val_acc}")
                                    if val_acc > max_acc:
                                        max_acc = val_acc
                                        max_mlp_hidden_dim = mlp_h_d
                                        max_word_embedding_dim = word_e_d
                                        max_pos_embedding_dim = pos_e_d
                                        max_hidden_dim = hidden
                                        max_lstm_layers = num_lstm_layers
                                    break
                            else:  # improvement
                                # torch.save(model.state_dict(), PATH)
                                num_epochs_no_improvement = 0
                                best_val_acc = val_acc

                            # train
                            acc = 0  # to keep track of accuracy
                            printable_loss = 0  # To keep track of the loss value
                            i = 0
                            batch_loss = 0
                            batch_acc = 0
                            for batch_idx, input_data in enumerate(train_dataloader):
                                i += 1
                                words_idx_tensor, pos_idx_tensor, heads_tensor = input_data

                                tag_scores = model(words_idx_tensor, pos_idx_tensor)
                                loss = NLLL_function(tag_scores, heads_tensor[0].to(device))
                                loss = loss / acumulate_grad_steps
                                loss.backward()
                                batch_loss += loss
                                acc = (accuracy(heads_tensor[0].cpu(), tag_scores.cpu())) / acumulate_grad_steps
                                batch_acc += acc
                                if i % acumulate_grad_steps == 0:
                                    optimizer.step()
                                    model.zero_grad()
                                    batch_loss = 0
                                    batch_acc = 0
                                printable_loss += loss.item()
                                _, indices = torch.max(tag_scores, 1)
    print("best params:")
    print(f"mlp_hidden: {max_mlp_hidden_dim}, word_emb: {max_word_embedding_dim}, pos_emb: {max_pos_embedding_dim}, lstm_hidden: "
          f"{max_hidden_dim}, num_lstm_layers:{max_lstm_layers} -> acc: {max_acc}")
Esempio n. 21
0
def load_dataset(args):
    if args.dataset == '20newsgroup':
        train_classes, val_classes, test_classes = _get_20newsgroup_classes(
            args)
    elif args.dataset == 'amazon':
        train_classes, val_classes, test_classes = _get_amazon_classes(args)
    elif args.dataset == 'fewrel':
        train_classes, val_classes, test_classes = _get_fewrel_classes(args)
    elif args.dataset == 'huffpost':
        train_classes, val_classes, test_classes = _get_huffpost_classes(args)
    elif args.dataset == 'reuters':
        train_classes, val_classes, test_classes = _get_reuters_classes(args)
    elif args.dataset == 'rcv1':
        train_classes, val_classes, test_classes = _get_rcv1_classes(args)
    else:
        raise ValueError(
            'args.dataset should be one of '
            '[20newsgroup, amazon, fewrel, huffpost, reuters, rcv1]')

    assert (len(train_classes) == args.n_train_class)
    assert (len(val_classes) == args.n_val_class)
    assert (len(test_classes) == args.n_test_class)

    if args.mode == 'finetune':
        # in finetune, we combine train and val for training the base classifier
        train_classes = train_classes + val_classes
        args.n_train_class = args.n_train_class + args.n_val_class
        args.n_val_class = args.n_train_class

    tprint('Loading data')
    all_data = _load_json(args.data_path)

    tprint('Loading word vectors')
    path = os.path.join(args.wv_path, args.word_vector)
    if not os.path.exists(path):
        # Download the word vector and save it locally:
        tprint('Downloading word vectors')
        import urllib.request
        urllib.request.urlretrieve(
            'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.vec',
            path)

    vectors = Vectors(args.word_vector, cache=args.wv_path)
    vocab = Vocab(collections.Counter(_read_words(all_data)),
                  vectors=vectors,
                  specials=['<pad>', '<unk>'],
                  min_freq=5)

    # print word embedding statistics
    wv_size = vocab.vectors.size()
    tprint('Total num. of words: {}, word vector dimension: {}'.format(
        wv_size[0], wv_size[1]))

    num_oov = wv_size[0] - torch.nonzero(
        torch.sum(torch.abs(vocab.vectors), dim=1)).size()[0]
    tprint(('Num. of out-of-vocabulary words'
            '(they are initialized to zeros): {}').format(num_oov))

    # Split into meta-train, meta-val, meta-test data
    train_data, val_data, test_data = _meta_split(all_data, train_classes,
                                                  val_classes, test_classes)
    tprint('#train {}, #val {}, #test {}'.format(len(train_data),
                                                 len(val_data),
                                                 len(test_data)))

    # Convert everything into np array for fast data loading
    train_data = _data_to_nparray(train_data, vocab, args)
    val_data = _data_to_nparray(val_data, vocab, args)
    test_data = _data_to_nparray(test_data, vocab, args)

    train_data['is_train'] = True
    # this tag is used for distinguishing train/val/test when creating source pool

    stats.precompute_stats(train_data, val_data, test_data, args)

    if args.meta_w_target:
        # augment meta model by the support features
        if args.bert:
            ebd = CXTEBD()
        else:
            ebd = WORDEBD(vocab)

        train_data['avg_ebd'] = AVG(ebd, args)
        if args.cuda != -1:
            train_data['avg_ebd'] = train_data['avg_ebd'].cuda(args.cuda)

        val_data['avg_ebd'] = train_data['avg_ebd']
        test_data['avg_ebd'] = train_data['avg_ebd']

    # if finetune, train_classes = val_classes and we sample train and val data
    # from train_data
    if args.mode == 'finetune':
        train_data, val_data = _split_dataset(train_data, args.finetune_split)

    return train_data, val_data, test_data, vocab
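The out-of-vocabulary count above relies on rows for unseen words being initialized to all zeros; a self-contained illustration of that computation on a toy embedding matrix:

import torch

vectors = torch.tensor([[0.1, -0.2],   # in-vocabulary row
                        [0.0,  0.0],   # OOV row (all zeros)
                        [0.3,  0.4],
                        [0.0,  0.0]])
num_nonzero = torch.nonzero(vectors.abs().sum(dim=1), as_tuple=False).size(0)
num_oov = vectors.size(0) - num_nonzero
print(num_oov)  # 2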
Esempio n. 22
0
def main():
    # sanity check
    data_dir = "HW2-files/"
    path_train = data_dir + "train.labeled"
    print("path_train -", path_train)
    path_test = data_dir + "test.labeled"
    print("path_test -", path_test)

    paths_list = [path_train, path_test]
    word_cnt, word_dict, pos_dict = get_vocabs(paths_list)
    train = PosDataset(word_cnt, word_dict, pos_dict, data_dir, 'train')
    # split into validation
    train_set, val_set = torch.utils.data.random_split(train, [4000, 1000])
    train_dataloader = DataLoader(train_set, shuffle=False)  # TODO return to true after debugging
    val_dataloader = DataLoader(val_set, shuffle=False)
    test = PosDataset(word_cnt, word_dict, pos_dict, data_dir, 'test')
    test_dataloader = DataLoader(test, shuffle=False)

    # get pre-trained embedding
    glove = Vocab(Counter(word_dict), vectors="glove.6B.300d", specials=SPECIAL_TOKENS)


    a = next(iter(train_dataloader))
    # a[0] -> word - idx of a sentence
    # a[1] -> pos - idx of a sentence
    # a[2] -> head token per sentence
    assert len(a[0]) == len(a[1]) == len(a[2])

    word_vocab_size = len(train.word2idx)
    print(word_vocab_size)
    tag_vocab_size = len(train.pos_idx_mappings)
    print(tag_vocab_size)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    if USE_PRETRAINED:
        model = AdvancedDnnDependencyParser(WORD_EMBEDDING_DIM, POS_EMBEDDING_DIM, HIDDEN_DIM, word_vocab_size,
                                        tag_vocab_size, word_embedding_table=glove.vectors).to(device)
    else:
        model = AdvancedDnnDependencyParser(WORD_EMBEDDING_DIM, POS_EMBEDDING_DIM, HIDDEN_DIM, word_vocab_size,
                                            tag_vocab_size).to(device)
    if use_cuda:
        model.cuda()

    # Define the loss function as the Negative Log Likelihood loss (NLLLoss)
    loss_function = nn.NLLLoss()

    # We will be using an Adam optimizer to minimize the loss function
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    acumulate_grad_steps = 128  # This is the actual batch_size, while we officially use batch_size=1

    # Training start
    print("Training Started")
    epoch_loss_list = []
    epoch_train_acc_list = []
    epoch_test_acc_list = []
    best_val_acc = 0
    num_epochs_wo_improvement = 0
    for epoch in range(EPOCHS):
        val_acc = evaluate(model, val_dataloader)
        print("EPOCH = ", epoch)
        print("EPOCH val acc = ", val_acc)
        if val_acc < best_val_acc:     # no improvement
            num_epochs_wo_improvement += 1
            if num_epochs_wo_improvement >= EARLY_STOPPING:
                print("STOPPED TRAINING DUE TO EARLY STOPPING")
                return
        else:                                   # improvement
            print("saving model since it improved on validation :)")
            torch.save(model.state_dict(), PATH)
            num_epochs_wo_improvement = 0
            best_val_acc = val_acc
            fig = plt.figure()
            plt.subplot(3, 1, 1)
            plt.plot(epoch_loss_list)
            plt.title("loss")
            plt.subplot(3, 1, 2)
            plt.plot(epoch_train_acc_list)
            plt.title("train UAS")
            plt.subplot(3, 1, 3)
            plt.plot(epoch_test_acc_list)
            plt.title("test UAS")
            print(epoch_train_acc_list)
            plt.savefig('./basic_model_graphs.png')

        # train
        acc = 0  # to keep track of accuracy
        printable_loss = 0  # To keep track of the loss value
        i = 0
        batch_loss = 0
        batch_acc = 0
        epoch_loss = 0

        for batch_idx, input_data in enumerate(train_dataloader):
            i += 1
            words_idx_tensor, pos_idx_tensor, heads_tensor = input_data

            tag_scores = model(words_idx_tensor, pos_idx_tensor)
            loss = NLLL_function(tag_scores, heads_tensor[0].to(device))
            # epoch statistics
            epoch_loss += loss.item()  # accumulate a float so the autograd graph is not retained across the epoch
            #
            loss = loss / acumulate_grad_steps
            loss.backward()
            batch_loss += loss
            acc = (accuracy(heads_tensor[0].cpu(), tag_scores.cpu())) / acumulate_grad_steps
            batch_acc += acc
            if i % acumulate_grad_steps == 0:
                optimizer.step()
                model.zero_grad()
                print("batch_loss = ", batch_loss.item())
                print("batch_acc = ", batch_acc)
                batch_loss = 0
                batch_acc = 0
        # end of epoch - get statistics
        epoch_loss_list.append(epoch_loss / i)
        epoch_train_acc_list.append(evaluate(model, train_dataloader))
        epoch_test_acc_list.append(evaluate(model, test_dataloader))
    # end of train - plot the two graphs
    fig = plt.figure()
    plt.subplot(3, 1, 1)
    plt.plot(epoch_loss_list)
    plt.title("loss")
    plt.subplot(3, 1, 2)
    plt.plot(epoch_train_acc_list)
    plt.title("train UAS")
    plt.subplot(3, 1, 3)
    plt.plot(epoch_test_acc_list)
    plt.title("test UAS")
    plt.savefig('basic_model_graphs.png')  # save before show(), which may clear the figure
    plt.show()
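The `acumulate_grad_steps = 128` trick above simulates a batch of 128 while the DataLoader yields one sentence at a time; a minimal sketch of that gradient-accumulation pattern, with `model`, `loader`, and `criterion` as placeholders:

def train_epoch(model, loader, criterion, optimizer, device, accum_steps=128):
    model.train()
    optimizer.zero_grad()
    for i, (inputs, target) in enumerate(loader, start=1):
        output = model(inputs.to(device))
        # scale each per-sample loss so the summed gradients average out
        loss = criterion(output, target.to(device)) / accum_steps
        loss.backward()                  # gradients accumulate in the .grad buffers
        if i % accum_steps == 0:
            optimizer.step()             # one update per accum_steps samples
            optimizer.zero_grad()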
Esempio n. 23
0
from torch.utils.data.dataset import random_split
train, test = random_split(data, [num_train, len(data) - num_train])

# =============================================================================
#
# =============================================================================

tokenizer = get_tokenizer('basic_english')
from torchtext.vocab import Vocab
from collections import Counter
#build vocab
counter = Counter()
for (text, label) in train:
    counter.update(tokenizer(text))

vocab = Vocab(counter, min_freq=1)

text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: int(x) - 1


def collate_batch(batch, max_len=300):
    label_list, text_list = [], []
    for (_text, _label) in batch:

        label_list.append(label_pipeline(_label))
        #processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        processed_text = text_pipeline(_text)
        if len(processed_text) >= max_len:
            processed_text = processed_text[0:max_len]
        else:
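            # NOTE: the snippet is truncated at this point; a plausible completion
            # (an assumption, not the original code) right-pads short sequences
            # with the '<pad>' index up to max_len:
            processed_text = processed_text + [vocab['<pad>']] * (max_len - len(processed_text))
        text_list.append(processed_text)
    return torch.tensor(text_list, dtype=torch.int64), torch.tensor(label_list, dtype=torch.int64)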
Esempio n. 24
0
    def __init__(self,
                 path: str,
                 word_dict=None,
                 pos_dict=None,
                 word_embd_dim=None,
                 pos_embd_dim=None,
                 test=None,
                 use_pre_trained=True,
                 pre_trained_vectors_name: str = None,
                 min_freq=1,
                 comp_mode=False):
        """
        :param path: path to train / test file
        :param word_dict: defaultdict(<class 'int'>, {'Pierre': 1, 'Vinken': 2, ',': 6268,...}
        :param pos_dict: defaultdict(<class 'int'>, {'NNP': 11837, ',': 6270, 'CD': 4493,...}
        :param word_embd_dim: dimension of word embedding
        :param pos_embd_dim: dimension of pos embedding
        :param test: if False / None we train vectors (or use-pertained).
                     else should be a list train.word_idx_mappings, train.pos_idx_mappings
        :param use_pre_trained: True / False
        :param pre_trained_vectors_name: What pre-trained vectors to use
        """
        super().__init__()
        self.file = path
        self.datareader = DataReader(self.file, word_dict, pos_dict)
        self.vocab_size = len(self.datareader.word_dict)
        self.char_idx_mappings = self.init_char_idx(word_dict)
        self.comp_mode = comp_mode
        if test:
            # no need to train vectors or create them, and also not vocabulary
            # that's because we use the vectors and vocabulary from train
            self.word_idx_mappings = test[0]
            self.pos_idx_mappings = test[1]
            self.sentences_dataset = self.convert_sentences_to_dataset()

        else:  # training
            if use_pre_trained:  # pre-trained word embeddings
                self.word_idx_mappings, self.idx_word_mappings, self.word_vectors = \
                    self.init_word_embeddings(self.datareader.word_dict, pre_trained_vectors_name)
            else:
                # create Vocab variable just for the ease of using the special tokens and the other nice features
                # like it will create the word_idx_mapping by itself
                global SPECIAL_TOKENS
                vocab = Vocab(Counter(word_dict),
                              vectors=None,
                              specials=SPECIAL_TOKENS,
                              min_freq=min_freq)

                # set rand vectors and get the weights (the vector embeddings themselves)
                words_embeddings_tensor = nn.Embedding(len(
                    vocab.stoi), word_embd_dim).weight.data
                vocab.set_vectors(stoi=vocab.stoi,
                                  vectors=words_embeddings_tensor,
                                  dim=word_embd_dim)
                # take all 3 attributes like in the pre-trained part
                self.word_idx_mappings, self.idx_word_mappings, self.word_vectors = \
                    vocab.stoi, vocab.itos, vocab.vectors

            # pos embeddings
            self.pos_idx_mappings, self.idx_pos_mappings = self.init_pos_vocab(
            )
            self.pos_vectors = nn.Embedding(len(self.pos_idx_mappings),
                                            pos_embd_dim)

            self.word_vector_dim = self.word_vectors.size(-1)
            self.sentences_dataset = self.convert_sentences_to_dataset()
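The random-initialization branch above hinges on Vocab.set_vectors; a self-contained example of that call, assuming the same legacy torchtext API used throughout these snippets:

from collections import Counter
import torch.nn as nn
from torchtext.vocab import Vocab   # legacy torchtext Vocab

counter = Counter({'the': 10, 'cat': 3, 'sat': 2})
vocab = Vocab(counter, specials=['<unk>', '<pad>'], min_freq=1)
dim = 50
rand_table = nn.Embedding(len(vocab.stoi), dim).weight.data   # random word vectors
vocab.set_vectors(stoi=vocab.stoi, vectors=rand_table, dim=dim)
print(vocab.vectors.size())   # torch.Size([5, 50]): 3 words + 2 specials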
Esempio n. 25
0
    def _get_metadata(self, num_doc_classes, num_word_classes):
        labels = []
        if num_doc_classes:
            vocab = Vocab(Counter())
            vocab.itos = ["C_{}".format(i) for i in range(num_doc_classes)]
            label_meta = FieldMeta()
            label_meta.vocab_size = num_doc_classes
            label_meta.vocab = vocab
            labels.append(label_meta)

        if num_word_classes:
            vocab = Vocab(Counter())
            vocab.itos = ["W_{}".format(i) for i in range(num_word_classes)]
            label_meta = FieldMeta()
            label_meta.vocab_size = num_word_classes
            label_meta.vocab = vocab
            label_meta.pad_token_idx = 0
            labels.append(label_meta)

        w_vocab = Vocab(Counter())
        dict_vocab = Vocab(Counter())
        c_vocab = Vocab(Counter())
        d_vocab = Vocab(Counter())
        w_vocab.itos = W_VOCAB
        dict_vocab.itos = DICT_VOCAB
        c_vocab.itos = CHAR_VOCAB
        d_vocab.itos = []

        text_feat_meta = FieldMeta()
        text_feat_meta.unk_token_idx = UNK_IDX
        text_feat_meta.pad_token_idx = PAD_IDX
        text_feat_meta.vocab_size = W_VOCAB_SIZE
        text_feat_meta.vocab = w_vocab
        text_feat_meta.vocab_export_name = "tokens_vals"
        text_feat_meta.pretrained_embeds_weight = None
        text_feat_meta.dummy_model_input = TextFeatureField.dummy_model_input

        dict_feat_meta = FieldMeta()
        dict_feat_meta.vocab_size = DICT_VOCAB_SIZE
        dict_feat_meta.vocab = dict_vocab
        dict_feat_meta.vocab_export_name = "dict_vals"
        dict_feat_meta.pretrained_embeds_weight = None
        dict_feat_meta.dummy_model_input = DictFeatureField.dummy_model_input

        char_feat_meta = FieldMeta()
        char_feat_meta.vocab_size = CHAR_VOCAB_SIZE
        char_feat_meta.vocab = c_vocab
        char_feat_meta.vocab_export_name = "char_vals"
        char_feat_meta.pretrained_embeds_weight = None
        char_feat_meta.dummy_model_input = CharFeatureField.dummy_model_input

        dense_feat_meta = FieldMeta()
        dense_feat_meta.vocab_size = 0
        dense_feat_meta.vocab = d_vocab
        dense_feat_meta.vocab_export_name = "dense_vals"
        dense_feat_meta.pretrained_embeds_weight = None
        # ugh, dims are fixed
        dense_feat_meta.dummy_model_input = torch.tensor(
            [[1.0] * DENSE_FEATURE_DIM, [1.0] * DENSE_FEATURE_DIM],
            dtype=torch.float,
            device="cpu",
        )

        seq_feat_meta = FieldMeta()
        seq_feat_meta.unk_token_idx = UNK_IDX
        seq_feat_meta.pad_token_idx = PAD_IDX
        seq_feat_meta.vocab_size = W_VOCAB_SIZE
        seq_feat_meta.vocab = w_vocab
        seq_feat_meta.vocab_export_name = "seq_tokens_vals"
        seq_feat_meta.pretrained_embeds_weight = None
        seq_feat_meta.dummy_model_input = SeqFeatureField.dummy_model_input

        meta = CommonMetadata()
        meta.features = {
            DatasetFieldName.TEXT_FIELD: text_feat_meta,
            DatasetFieldName.DICT_FIELD: dict_feat_meta,
            DatasetFieldName.CHAR_FIELD: char_feat_meta,
            DatasetFieldName.DENSE_FIELD: dense_feat_meta,
            DatasetFieldName.SEQ_FIELD: seq_feat_meta,
        }
        meta.target = labels
        if len(labels) == 1:
            [meta.target] = meta.target
        meta.label_names = [label.vocab.itos for label in labels]
        meta.feature_itos_map = {
            f.vocab_export_name: f.vocab.itos
            for _, f in meta.features.items()
        }
        return meta
Esempio n. 26
0
def load_dataset(args):
    if args.dataset == '20newsgroup':
        train_classes, val_classes, test_classes, label_dict = _get_20newsgroup_classes(
            args)
    elif args.dataset == 'amazon':
        train_classes, val_classes, test_classes, label_dict = _get_amazon_classes(
            args)
    elif args.dataset == 'fewrel':
        train_classes, val_classes, test_classes, label_dict = _get_fewrel_classes(
            args)
    elif args.dataset == 'huffpost':
        train_classes, val_classes, test_classes, label_dict = _get_huffpost_classes(
            args)
    elif args.dataset == 'reuters':
        train_classes, val_classes, test_classes, label_dict = _get_reuters_classes(
            args)
    elif args.dataset == 'rcv1':
        train_classes, val_classes, test_classes, label_dict = _get_rcv1_classes(
            args)
    else:
        raise ValueError(
            'args.dataset should be one of '
            '[20newsgroup, amazon, fewrel, huffpost, reuters, rcv1]')

    assert (len(train_classes) == args.n_train_class)
    assert (len(val_classes) == args.n_val_class)
    assert (len(test_classes) == args.n_test_class)

    print("train_classes", train_classes)
    print("val_classes", val_classes)
    print("test_classes", test_classes)

    tprint('Loading data')
    all_data = _load_json(args.data_path)
    class_names = []
    class_name_words = []
    for ld in label_dict:
        class_name_dic = {}
        class_name_dic['label'] = label_dict[ld]
        class_name_dic['text'] = ld.lower().split()
        class_names.append(class_name_dic)
        class_name_words.append(class_name_dic['text'])

    tprint('Loading word vectors')

    vectors = Vectors(args.word_vector, cache=args.wv_path)
    vocab = Vocab(collections.Counter(_read_words(all_data, class_name_words)),
                  vectors=vectors,
                  specials=['<pad>', '<unk>'],
                  min_freq=5)

    # print word embedding statistics
    wv_size = vocab.vectors.size()
    tprint('Total num. of words: {}, word vector dimension: {}'.format(
        wv_size[0], wv_size[1]))

    num_oov = wv_size[0] - torch.nonzero(
        torch.sum(torch.abs(vocab.vectors), dim=1)).size()[0]
    tprint(('Num. of out-of-vocabulary words'
            '(they are initialized to zeros): {}').format(num_oov))

    # Split into meta-train, meta-val, meta-test data
    train_data, val_data, test_data = _meta_split(all_data, train_classes,
                                                  val_classes, test_classes)
    tprint('#train {}, #val {}, #test {}'.format(len(train_data),
                                                 len(val_data),
                                                 len(test_data)))

    # Convert everything into np array for fast data loading
    class_names = _data_to_nparray(class_names, vocab, args)
    train_data = _data_to_nparray(train_data, vocab, args)
    val_data = _data_to_nparray(val_data, vocab, args)
    test_data = _data_to_nparray(test_data, vocab, args)

    train_data['is_train'] = True
    val_data['is_train'] = True
    test_data['is_train'] = True
    # this tag is used for distinguishing train/val/test when creating source pool

    temp_num = np.argsort(class_names['label'])
    class_names['label'] = class_names['label'][temp_num]
    class_names['text'] = class_names['text'][temp_num]
    class_names['text_len'] = class_names['text_len'][temp_num]

    return train_data, val_data, test_data, class_names, vocab
Esempio n. 27
0
    def __init__(self, file_path, vocab=None):
        # load the entire master table
        self.data = MasterData(file_path)

        if vocab is not None:
            self.vocab = vocab
        else:
            path = './wiki.en.vec'
            if not os.path.exists(path):
                # Download the word vector and save it locally:
                print('Downloading word vectors')
                import urllib.request
                urllib.request.urlretrieve(
                    'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.vec',
                    path)

            # get word embeddings from fasttext
            vectors = Vectors('wiki.en.vec', cache='vector_cache')
            self.vocab = Vocab(Counter(self.data.all_text),
                               vectors=vectors,
                               specials=['<pad>', '<unk>'],
                               min_freq=5)

            # print word embedding statistics
            wv_size = self.vocab.vectors.size()
            print('Total num. of words: {}, word vector dimension: {}'.format(
                wv_size[0], wv_size[1]))

            num_oov = wv_size[0] - torch.nonzero(torch.sum(
                torch.abs(self.vocab.vectors), dim=1),
                                                 as_tuple=False).size()[0]
            print(('Num. of out-of-vocabulary words'
                   '(they are initialized to zeros): {}').format(num_oov))

        # get the idx of label and cor
        cor_name = 'breast cancer'
        self.label_idx = self.data.attr_names.index('penetrance')
        self.cor_idx = self.data.attr_names.index(cor_name)

        # define train / val / test split
        random.seed(1)
        train_list = list(range(int(len(self.data.text) * 0.5)))
        val_list = list(
            range(int(len(self.data.text) * 0.5),
                  int(len(self.data.text) * 0.7)))
        test_list = list(
            range(int(len(self.data.text) * 0.7),
                  int(len(self.data.text) * 1)))

        self.envs = [{'idx_list': []}, {'idx_list': []}]
        # define training environments based on the values of the spurious
        # attributes
        for idx in train_list:
            cor = self.data.attr[idx, self.cor_idx]
            if cor == 0:
                self.envs[0]['idx_list'].append(idx)
            else:
                self.envs[1]['idx_list'].append(idx)

        # define val and test environments
        self.envs.append({'idx_list': val_list})
        self.envs.append({'idx_list': test_list})

        # compute correlation between the given attribute cor and the target attribute
        # on the validation set for early stopping
        self.val_att_idx_dict = {
            cor_name: {
                '0_0': [],
                '0_1': [],
                '1_0': [],
                '1_1': []
            }
        }
        for i in val_list:
            k = '{}_{}'.format(self.data.attr[i, self.label_idx],
                               self.data.attr[i, self.cor_idx])
            self.val_att_idx_dict[cor_name][k].append(i)

        # compute correlation between each attribute and the target attribute
        # only for the test set
        self.test_att_idx_dict = {}
        for idx, att in enumerate(self.data.attr_names):
            if idx == self.label_idx:
                continue

            data_dict = {
                '0_0': [],
                '0_1': [],
                '1_0': [],
                '1_1': [],
            }

            # go through only the test examples
            for i in test_list:
                k = '{}_{}'.format(self.data.attr[i, self.label_idx],
                                   self.data.attr[i, idx])
                data_dict[k].append(i)

            # print data stats
            print('{:>20}'.format(att), end=' ')
            for k, v in data_dict.items():
                print(k, ' ', '{:>8}'.format(len(v)), end=', ')
            print()

            self.test_att_idx_dict[att] = data_dict

        self.length = len(self.data.attr)
Esempio n. 28
0
File: data.py Progetto: hbwzhsh/UDC
    def __init__(self,
                 path='data',
                 glove_p='glove',
                 train_file='train.csv',
                 valid_file='valid.csv',
                 test_file='test.csv',
                 vocab_file=None,
                 batch_size=32,
                 embed_dim=100,
                 max_vocab_size=None,
                 min_freq=1,
                 max_seq_len=None,
                 gpu=False,
                 use_fasttext=False,
                 padded=False):
        self.batch_size = batch_size
        self.device = 0 if gpu else -1
        self.sort_key = lambda x: len(x.context)
        #print (self.sort_key)

        if not padded:
            self.TEXT = data.Field(lower=True,
                                   pad_token='__pad__',
                                   unk_token='<UNK>',
                                   batch_first=True,
                                   tokenize=clean_str)
        else:
            self.TEXT = data.Field(lower=True,
                                   include_lengths=True,
                                   fix_length=max_seq_len,
                                   unk_token='<UNK>',
                                   batch_first=True,
                                   tokenize=clean_str)

        self.LABEL = data.Field(sequential=False,
                                tensor_type=torch.FloatTensor,
                                unk_token=None,
                                batch_first=True)

        file_format = train_file[-3:]

        # Only take data with max length 160
        # f = lambda ex: len(ex.context) <= max_seq_len and len(ex.response)
        f = None

        self.train = data.TabularDataset(path='{}/{}'.format(path, train_file),
                                         format=file_format,
                                         skip_header=True,
                                         fields=[('context', self.TEXT),
                                                 ('response', self.TEXT),
                                                 ('label', self.LABEL)],
                                         filter_pred=f)

        self.valid, self.test = data.TabularDataset.splits(
            path=path,
            validation=valid_file,
            test=test_file,
            format=file_format,
            skip_header=True,
            fields=[('context', self.TEXT), ('positive', self.TEXT),
                    ('negative_1', self.TEXT), ('negative_2', self.TEXT),
                    ('negative_3', self.TEXT), ('negative_4', self.TEXT),
                    ('negative_5', self.TEXT), ('negative_6', self.TEXT),
                    ('negative_7', self.TEXT), ('negative_8', self.TEXT),
                    ('negative_9', self.TEXT)])

        if vocab_file is None:

            if use_fasttext:
                print("building vocabulary")
                # self.TEXT.build_vocab(
                #     self.train, max_size=max_vocab_size, min_freq=3,
                #     vectors="fasttext.en.300d"
                # )
                self.TEXT.build_vocab(self.train,
                                      max_size=max_vocab_size,
                                      min_freq=5,
                                      vectors="fasttext.en.300d")
            else:
                self.TEXT.build_vocab(self.train,
                                      max_size=max_vocab_size,
                                      min_freq=min_freq,
                                      vectors=GloVe('6B', dim=embed_dim))
            vocab = self.TEXT.vocab

            self.TEXT.build_vocab(self.train,
                                  max_size=max_vocab_size,
                                  min_freq=min_freq,
                                  vectors=GloVe('840B', dim=embed_dim))

        else:
            specials = list(
                OrderedDict.fromkeys(tok for tok in [
                    self.TEXT.unk_token, self.TEXT.pad_token,
                    self.TEXT.init_token, self.TEXT.eos_token
                ] if tok is not None))

            with open(f'{path}/{vocab_file}', 'r') as f:
                counter = Counter(f.read().split('\n'))

            if use_fasttext:
                print("Using fasttext")
                vocab = Vocab(counter,
                              specials=specials,
                              vectors="fasttext.en.300d")
            else:
                vocab = Vocab(counter,
                              specials=specials,
                              vectors=GloVe('6B', dim=embed_dim))

            self.TEXT.vocab = vocab

        self.LABEL.build_vocab(self.train)
        print(vocab.stoi['__pad__'])
        print(vocab.itos[25], vocab.itos[32])
        self.dataset_size = len(self.train.examples)
        self.vocab_size = len(self.TEXT.vocab.itos)
        self.embed_dim = embed_dim
        #self.vectors = self.load_glove_embeddings(glove_p+'/glove.6B.50d.txt', self.TEXT.vocab.stoi)
        self.vectors = self.TEXT.vocab.vectors
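The vocab-file branch above (read one token per line, count the tokens, build a Vocab with the Field's special tokens) can be isolated into a small helper; a sketch, where the file path and special tokens are only illustrative:

from collections import Counter
from torchtext.vocab import Vocab   # legacy torchtext Vocab

def vocab_from_file(path, specials):
    with open(path, 'r') as f:
        counter = Counter(f.read().split('\n'))
    return Vocab(counter, specials=specials)

# e.g. vocab = vocab_from_file('data/vocab.txt', specials=['<UNK>', '__pad__'])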
Esempio n. 29
0
def benchmark_experimental_vocab_lookup(vocab_file_path=None):
    def _run_benchmark_lookup(tokens, vocab):
        t0 = time.monotonic()
        # list lookup
        if isinstance(tokens, list) and isinstance(tokens[0], list):
            for tokens_list in tokens:
                vocab.lookup_indices(tokens_list)
        # single token lookup
        elif isinstance(tokens, list):
            for token in tokens:
                vocab[token]
        else:
            raise RuntimeError("Received tokens of incorrect type {}.".format(
                type(tokens)))
        print("Lookup time:", time.monotonic() - t0)

    tokens = []
    tokens_lists = []

    train, = AG_NEWS(data_select='train')
    vocab = train.get_vocab()
    for (_, text) in train:
        cur_tokens = []
        for id in text.tolist():
            cur_tokens.append(vocab.itos[id])
        tokens_lists.append(cur_tokens)
        tokens += cur_tokens

    if vocab_file_path:
        print("Loading Vocab from file {}".format(vocab_file_path))

        def token_iterator(file_path):
            f = open(file_path, 'r')
            for token in f:
                yield token

        # existing Vocab construction
        print("Vocab")
        t0 = time.monotonic()
        v_existing = build_vocab_from_iterator(token_iterator(vocab_file_path))
        print("Construction time:", time.monotonic() - t0)

        # experimental Vocab construction
        print("Vocab Experimental")
        t0 = time.monotonic()
        f = open(vocab_file_path, 'r')
        v_experimental = vocab_from_file_object(f)
        print("Construction time:", time.monotonic() - t0)
    else:
        print("Loading Vocab from AG News")
        counter = Counter(tokens)
        sorted_by_freq_tuples = sorted(counter.items(),
                                       key=lambda x: x[1],
                                       reverse=True)
        ordered_dict = OrderedDict(sorted_by_freq_tuples)

        # existing Vocab construction
        print("Vocab")
        t0 = time.monotonic()
        v_existing = Vocab(counter)
        print("Construction time:", time.monotonic() - t0)

        # experimental Vocab construction
        print("Vocab Experimental")
        t0 = time.monotonic()
        v_experimental = VocabExperimental(ordered_dict)
        print("Construction time:", time.monotonic() - t0)

    # existing Vocab eager lookup
    print("Vocab - Eager Mode")
    _run_benchmark_lookup(tokens, v_existing)
    _run_benchmark_lookup([tokens], v_existing)
    _run_benchmark_lookup(tokens_lists, v_existing)

    # experimental Vocab eager lookup
    print("Vocab Experimental - Eager Mode")
    _run_benchmark_lookup(tokens, v_experimental)
    _run_benchmark_lookup([tokens], v_experimental)
    _run_benchmark_lookup(tokens_lists, v_experimental)

    jit_v_experimental = torch.jit.script(v_experimental.to_ivalue())
    # experimental Vocab jit lookup
    print("Vocab Experimental - Jit Mode")
    _run_benchmark_lookup(tokens, jit_v_experimental)
    _run_benchmark_lookup([tokens], jit_v_experimental)
    _run_benchmark_lookup(tokens_lists, jit_v_experimental)
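The repeated time.monotonic() bookkeeping in this benchmark could also be wrapped in a small context manager; a minimal, self-contained helper:

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    t0 = time.monotonic()
    yield
    print(label, time.monotonic() - t0)

# usage:
# with timed("Construction time:"):
#     v_existing = Vocab(counter)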
Esempio n. 30
0
 def build_vocab(self, *args, **kwargs):
     specials = [REDUCE, SHIFT]
     for prod in self.productions:
         specials.append(NP(prod.data))
     self.vocab = Vocab(Counter(), specials=specials)