def __init__(self):
        """Load positive and negative reviews, build the vocabulary, and
        split the shuffled data 90/10 into train and test sets.

        Populates:
            self.data         -- list of (normalized_review_text, label) pairs
            self.dictionary   -- vocabulary grown from all review tokens
            self.max_sent_len -- longest tokenized review + 2
            self.train / self.test -- AugmentedList splits of self.data
        """
        self.data = []
        self.dictionary = Dictionary()
        self.max_sent_len = 0

        # Label 1 = positive, 0 = negative.  Both files were previously
        # loaded with two verbatim copies of the same loop; the shared
        # logic now lives in _load_reviews.
        self._load_reviews(POSITIVE_REVIEWS_FILE, 1)
        # NOTE(review): constant name carries a typo ("NEGAGIVE") — kept
        # as-is because it must match the definition elsewhere in the file.
        self._load_reviews(NEGAGIVE_REVIEWS_FILE, 0)

        # Split the original dataset into train/test (90% / 10%).
        random.shuffle(self.data)
        split_index = int(0.9 * len(self.data))
        self.train = AugmentedList(self.data[:split_index])
        self.test = AugmentedList(self.data[split_index:])

def _load_reviews(self, path, label):
        """Read one review per line from *path*, normalize and tokenize each,
        grow the dictionary, and append (review, label) pairs to self.data.

        Also tracks the longest tokenized sentence; the "+ 2" presumably
        reserves room for start/end sentence markers added downstream —
        TODO confirm against the model code.
        """
        with open(path, encoding='utf-8') as f:
            reviews = f.readlines()
        for review in reviews:
            review = normalize_string(review)
            review_words = word_tokenize(review)
            self.dictionary.add_words(review_words)
            self.data.append((review, label))
            self.max_sent_len = max(self.max_sent_len, 2 + len(review_words))
Beispiel #2
0
    def __init__(self,
                 video_path,
                 translation_path,
                 spatial_transform=None,
                 temporal_transform=None,
                 sample_duration=4,
                 get_loader=get_default_video_loader):
        """Build a video/translation dataset.

        video_path / translation_path are handed to make_dataset together
        with a fresh Dictionary, which make_dataset fills while indexing
        the samples.  The optional spatial/temporal transforms are stored
        for use at sample time, and get_loader() is called once to obtain
        the video loader instance.
        """
        self.dictionary = Dictionary()
        self.data = make_dataset(video_path, translation_path,
                                 sample_duration, self.dictionary)
        self.spatial_transform = spatial_transform
        self.temporal_transform = temporal_transform
        self.loader = get_loader()
Beispiel #3
0
    def __init__(self, path, maxlen, vocab_size=11000, lowercase=False, dictionary=None):
        """Prepare a corpus for tokenization.

        maxlen == -1 means "no length limit" (stored as np.inf).  When no
        *dictionary* is supplied the vocabulary is built from the training
        file via make_vocab(); otherwise the given dictionary is reused.
        Finally the training file at *path* is tokenized into self.train.
        """
        self.dictionary = Dictionary()
        self.maxlen = np.inf if maxlen == -1 else maxlen
        self.lowercase = lowercase
        self.vocab_size = vocab_size
        self.train_path = path
        self.text, self.hiddens, self.labels = [], [], []

        # Build the vocabulary from the training set unless one was given.
        if dictionary is None:
            self.make_vocab()
        else:
            self.dictionary = dictionary

        self.train = self.tokenize(self.train_path)
        # Record type for (text, hidden, label) triples.
        self.Item = namedtuple('Item', ['text', 'hidden', 'label'])
Beispiel #4
0
    # Checkpoint / LR-decay / early-stopping bookkeeping.
    # (Fragment: the enclosing def is outside this view.)
    if not best_acc or acc > best_acc:
        # New best accuracy: save the whole model (filename encodes the dev
        # loss and accuracy) and reset the patience counter.
        with open(MODEL_PATH % (dev_loss, acc), 'wb') as f:
            torch.save(model, f)
        best_acc = acc
        stop_counter = 0
    else:
        # No improvement: shrink the learning rate to 20% of its value.
        for param_group in optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * 0.2
        # Count toward early stopping only when the feature is enabled.
        if EARLY_STOP != 0:
            stop_counter += 1

    return stop_counter


# Load the previously saved vocabulary from disk.
dictionary = Dictionary(path=DICT_PATH)

n_token = len(dictionary)

# Best-so-far trackers updated by the training loop.
best_dev_loss = None
best_acc = None

# Build the classifier from module-level hyperparameter constants.
# NOTE(review): this config dict is truncated in this view — it continues
# past the last line shown here.
model = Classifier({
    'dropout': DROPOUT,
    'n_tokens': n_token,
    'n_layers': N_LAYERS,
    'hidden_dim': HIDDEN_DIM,
    'embed_dim': EMBED_DIM,
    'pooling': POOLING,
    'dictionary': dictionary,
    'pretrained_wordvec': PRETRAINED_WORDVEC,
Beispiel #5
0
import jieba
import jieba.posseg  # a separate module must be loaded for part-of-speech tagging

import pandas as pd
# from nltk.tokenize import word_tokenize

# Which P2P platform's review data to process ('total' = all platforms).
P2P = 'total'
# ********************************************************************* #
datapath = '../../data/' + P2P + '/' + P2P + '.csv'
outpath = '../../data/' + P2P + '/data(%s).json'
dictpath = '../../data/' + P2P + '/mydict(%s).json'
debug_flag = False
stop = False
# ********************************************************************* #

# Build the vocabulary, reserving the first index for the padding token.
mydict = Dictionary()
mydict.add_word('<pad>')
# mydict.add_word('<unk>')
# Load the stop-word list, one word per line, and strip line endings.
stopping_word = open('../../data/stopping_word', 'r',
                     encoding='utf-8').readlines()
for i in range(len(stopping_word)):
    stopping_word[i] = stopping_word[i].strip()

# Read the reviews CSV: labels come from 'reviewEvaluation', raw text
# from 'reviewContent'.
reviews = pd.read_csv(datapath, index_col=0, header=0, encoding='utf-8')
labels = list(reviews['reviewEvaluation'])
reviews = list(reviews['reviewContent'])
# reviews = open(datapath).readlines()
n_reviews = len(reviews)
print('%d条评论将被载入...' % n_reviews)  # "%d reviews will be loaded..."

# (Fragment: the debug branch body is outside this view.)
if debug_flag:
Beispiel #6
0
    # Load the WiLI language-identification data and build the vocabularies.
    # (Fragment: the enclosing def is outside this view.)
    path = 'wili'
    print(os.listdir(path))

    # Init random seed to get reproducible results
    seed = 1111
    random.seed(seed)
    # NOTE(review): np.random.RandomState(seed) creates a new generator
    # object and discards it — it does NOT seed numpy's global RNG (that
    # would be np.random.seed(seed)); confirm whether this was intended.
    np.random.RandomState(seed)
    torch.manual_seed(seed)

    # Any results you write to the current directory are saved as output.
    x_train_full = open(path + "/x_train.txt").read().splitlines()
    y_train_full = open(path + "/y_train.txt").read().splitlines()
    print('Example:')
    print('LANG =', y_train_full[0])
    print('TEXT =', x_train_full[0])
    # Character-level vocabulary with fixed special-token indices.
    char_vocab = Dictionary()
    pad_token = '<pad>'  # reserve index 0 for padding
    unk_token = '<unk>'  # reserve index 1 for unknown token
    pad_index = char_vocab.add_token(pad_token)
    unk_index = char_vocab.add_token(unk_token)

    # join all the training sentences in a single string
    # and obtain the list of different characters with set
    chars = set(''.join(x_train_full))
    # Sorted insertion keeps character indices deterministic across runs.
    for char in sorted(chars):
        char_vocab.add_token(char)
    print("Vocabulary:", len(char_vocab), "UTF characters")

    # Label vocabulary: one entry per distinct language code.
    lang_vocab = Dictionary()
    # use python set to obtain the list of languages without repetitions
    languages = set(y_train_full)
Beispiel #7
0
from io import BytesIO
from PIL import Image
from utils import Dictionary, get_words, load_model
import base64
import difflib
import numpy as np
import torch.nn.functional as F
import torch

# Load the trained recognition network and the Uyghur word dictionary once
# at module import time.
net_recog = load_model('../recog_params.pkl')
ug_dict = Dictionary('../../../data/ug_words.txt')


def img2str(pic):
    """Encode an image given as a numpy array into a base64 PNG string.

    pic: numpy array holding the image pixels.
    Returns the PNG-encoded image as a base64 ASCII str.
    """
    figfile = BytesIO()
    # Image.save() returns None; the original code rebound `pic` to that
    # None, which was misleading dead code — just save into the buffer.
    Image.fromarray(pic).convert('RGBA').save(figfile, format='PNG')
    figfile.seek(0, 0)
    figdata_png = base64.b64encode(figfile.getvalue()).decode('ascii')
    return figdata_png


def compute(test_id='test'):
    """Segment static/<test_id>.png into word images and classify each one
    with net_recog.

    NOTE(review): truncated in this view — the function body continues
    past the last line shown here.
    """
    pic_path = 'static/' + test_id + '.png'
    truth_path = 'static/' + test_id + '.txt'
    # Segment the page image into individual word crops.
    base_image, pic_with_box, word_pics = get_words(pic_path)
    word_pics = np.stack(word_pics)
    # Classify all word crops in one batch; softmax over the class axis.
    pred = F.softmax(net_recog(torch.from_numpy(word_pics)), dim=1)
    # Predicted class index per word.
    _, idxes = torch.max(pred, dim=1)

    res = []
print(args)

start_time = time.time()

# Output locations inside args.save.
corpus_output_name = args.save + "corpus_index"
dictionary_output_name = args.save + "dictionary_index"

if not os.path.exists(args.save):
    os.makedirs(args.save)

# NOTE(review): f_out is never closed in this view; if nothing later in
# the file closes it, wrap it in a `with` (or call close()) so buffered
# output is reliably flushed.
f_out = open(corpus_output_name, 'w')

# Alias kept so the input opener could be swapped out (e.g. gzip.open);
# byte_mode presumably tells Dictionary whether tokens are bytes — confirm.
my_open = open
byte_mode = False

dict_c = Dictionary(byte_mode)

total_num_w = 0
# Each input line holds three tab-separated columns:
# "<spacy-tokenized sentence>\t<gpt2 idx>\t<gpt2 mapping>".
with my_open(args.data, 'r') as f_in:
    for line_idx, line in enumerate(f_in):
        sent_spacy, gpt2_idx, gpt2_mapping = line.rstrip().split('\t')
        w_ind_list = []
        for w in sent_spacy.split():
            # Look the word up, adding it to the dictionary if unseen.
            w_ind = dict_c.dict_check_add(w)
            w_ind_list.append(w_ind)
            total_num_w += 1
        dict_c.append_eos(w_ind_list)
        # Re-emit the sentence as space-separated word indices, preserving
        # the original GPT-2 columns.
        f_out.write(' '.join([str(x) for x in w_ind_list]) + '\t' + gpt2_idx + '\t' + gpt2_mapping + '\n')
        # Progress report every million lines.
        if line_idx % 1000000 == 0:
            print(line_idx)
            sys.stdout.flush()
Beispiel #9
0
                lines.append(indices)

        print("Number of sentences dropped from {}: {} out of {} total".
              format(path, dropped, linecount))
        return lines

    def __getitem__(self, i):
        """Return the i-th example as an Item(text, hidden, label) triple."""
        return self.Item(self.text[i], self.hiddens[i], self.labels[i])

    # Backward-compatible alias: the original (mis)named this method
    # ``__index__``, which Python reserves for zero-argument integer
    # conversion — it could never act as an item accessor, so indexing
    # (obj[i]) silently failed until __getitem__ was defined.
    __index__ = __getitem__

# Restore the trained model components from the checkpoint directory.
# (Fragment: `args` and the imports are outside this view.)
model_args, idx2word, autoencoder, gan_gen, gan_disc \
        = load_models(args.load_path)
# print(idx2word)

# Invert the index->word map to obtain word->index.
word2idx = {word : index for index, word in idx2word.items()}

# Rebuild a Dictionary around the loaded vocabulary.
dic = Dictionary()
dic.word2idx = word2idx
dic.idx2word = idx2word

# maxlen=-1 means "no length limit" for the corpus loader.
corpus = EncodeCopus(args.inf, maxlen=-1, dictionary=dic)

batches = corpus.get_batches()
autoencoder.cuda()
autoencoder.eval()

# Encode every batch into hidden codes and collect them.
# NOTE(review): the loop body is truncated in this view.
with open(args.inf + '.corpus', 'wb') as b:
    hiddens = []
    for index, (source, target, length) in enumerate(batches):
        # print(source, length)
        hidden = autoencoder.encode(Variable(source), length,None)
        # print(hidden)
Beispiel #10
0
    def __init__(self,
                 corpus_dir,
                 w2v,
                 dictionary=None,
                 w2v_lbound=16,
                 w2v_ubound=2**16,
                 corpus_lbound=2,
                 ctx_len=12,
                 pad=0,
                 is_wikitext=False,
                 is_chimera=False,
                 is_jnlpba=False):
        """Build per-word left/right context datasets from one of three corpora.

        One of is_wikitext / is_chimera / is_jnlpba selects the corpus
        format under *corpus_dir*.  For each word that passes the w2v-count
        and corpus-count bounds, up to ctx_len tokens of left and right
        context are collected per occurrence; the words are split roughly
        90/10 into train/valid sets, and words absent from the dictionary
        are gathered into an OOV set.

        corpus_dir        -- pathlib.Path-like corpus root (uses `/` and .open()).
        w2v               -- word2vec-style model (has .vector_size and
                             .wv.vocab[w].count) — presumably gensim; confirm.
        dictionary        -- existing Dictionary to extend, or None for a new one.
        w2v_lbound/ubound -- keep words whose w2v count is strictly between them.
        corpus_lbound     -- keep words appearing more than this often here.
        ctx_len           -- context tokens kept on each side of an occurrence.
        pad               -- padding index handed to pad_sequences (default 0).
        """
        if dictionary is None:
            dictionary = Dictionary(w2v.vector_size)

        # --- Load the selected corpus as a collection of token lists. ---
        if is_wikitext:
            # WikiText: one lowercased, whitespace-tokenized line per entry,
            # concatenating the train/valid/test splits.
            corpus = [
                fi.lower().split()
                for fi in (corpus_dir /
                           'wiki.train.tokens').open().readlines()
            ]
            corpus += [
                fi.lower().split()
                for fi in (corpus_dir /
                           'wiki.valid.tokens').open().readlines()
            ]
            corpus += [
                fi.lower().split()
                for fi in (corpus_dir / 'wiki.test.tokens').open().readlines()
            ]
            corpus = np.array(corpus)
        elif is_chimera:
            corpus = []
            with (corpus_dir / 'dataset.txt').open(encoding='latin1') as f:
                # Skip the header row; records come in pairs of lines.
                lines = f.readlines()[1:]
                for i in range(0, len(lines), 2):
                    fields = lines[i].rstrip('\n').split('\t')
                    nonce = fields[1].lower()
                    sents = fields[3].lower().split('@@')
                    # The pivot compound is two '_'-joined words; alternate
                    # them when substituting the nonce back into sentences.
                    pivot_comp = lines[i + 1].split('\t')[5].lower().split('_')
                    corpus += [
                        sent.replace(nonce, pivot_comp[0 if i %
                                                       2 == 0 else 1]).split()
                        for i, sent in enumerate(sents)
                    ]
            corpus = np.unique(corpus)
        elif is_jnlpba:
            # JNLPBA IOB2 files: one "token<TAB>tag" per line; keep only
            # the lowercased token.
            ps = ['train/Genia4ERtask2.iob2', 'test/Genia4EReval2.iob2']
            corpus = []
            sent = []
            for p in ps:
                for w in (corpus_dir / p).open().readlines():
                    # A MEDLINE marker starts a new abstract: flush the
                    # sentence collected so far.
                    if w.startswith("###MEDLINE:"):
                        if sent:
                            corpus += [sent]
                        sent = []
                        continue

                    w = w.strip().lower()
                    if w != '':
                        w = w.split()
                        w = w[0]
                        sent += [w]
                corpus += [sent]
            corpus = np.array(corpus)
        print(f"Corpus shape: {corpus.shape}")

        # --- Count words, grow the dictionary, collect OOV words. ---
        word_count = defaultdict(int)
        oov_words = []
        oov_dataset = {}
        for sent in corpus:
            for w in sent:
                word_count[w] += 1
                dictionary.add_word(w, w2v)
                # NOTE(review): w was just passed to add_word above, so this
                # only collects words add_word declined to register
                # (presumably those without a w2v vector) — confirm against
                # Dictionary.add_word.
                if w not in oov_dataset and w not in dictionary.word2idx:
                    if w in string.punctuation:
                        continue
                    oov_words.append(w)
                    oov_dataset[w] = [[], []]

        # Words kept for train/valid: frequent enough both in the w2v
        # model and in this corpus, and not the unknown token.
        words = []
        for w in dictionary.word2idx:
            if w != '<unk>' and w2v_ubound > w2v.wv.vocab[
                    w].count > w2v_lbound and word_count[w] > corpus_lbound:
                words.append(w)
        print(f"Number of valid words: {len(words)}")

        # Random word-level split: ~90% train, ~10% valid.
        train_dataset = {}
        valid_dataset = {}
        for w, prob in zip(words, np.random.random(len(words))):
            if prob < 0.9:
                train_dataset[w] = [[], []]
            else:
                valid_dataset[w] = [[], []]

        # --- Harvest left/right contexts for each tracked occurrence. ---
        for sent in corpus:
            words_valid = []
            words_train = []
            words_oov = []

            # Record (word, position) pairs for every tracked word in the
            # sentence; valid takes precedence over train over oov.
            for idx, w in enumerate(sent):
                if w in valid_dataset:
                    words_valid += [[w, idx]]
                elif w in train_dataset:
                    words_train += [[w, idx]]
                elif w in oov_dataset:
                    words_oov += [[w, idx]]

            if len(words_valid) > 0 or len(words_train) > 0 or len(
                    words_oov) > 0:
                sent_word_ids = dictionary.sent2idx(sent)

                # For each occurrence, keep the context only if its window
                # holds more than ctx_len nonzero ids (pad defaults to 0),
                # then store the left and right halves separately.
                if len(words_valid) > 0:
                    for w, idx in words_valid:
                        if np.count_nonzero(
                                sent_word_ids[idx - ctx_len:idx + 1 +
                                              ctx_len]) > ctx_len:
                            valid_dataset[w][0] += [
                                sent_word_ids[idx - ctx_len:idx]
                            ]
                            valid_dataset[w][1] += [
                                sent_word_ids[idx + 1:idx + 1 + ctx_len]
                            ]

                if len(words_train) > 0:
                    for w, idx in words_train:
                        if np.count_nonzero(
                                sent_word_ids[idx - ctx_len:idx + 1 +
                                              ctx_len]) > ctx_len:
                            train_dataset[w][0] += [
                                sent_word_ids[idx - ctx_len:idx]
                            ]
                            train_dataset[w][1] += [
                                sent_word_ids[idx + 1:idx + 1 + ctx_len]
                            ]

                if len(words_oov) > 0:
                    for w, idx in words_oov:
                        if np.count_nonzero(
                                sent_word_ids[idx - ctx_len:idx + 1 +
                                              ctx_len]) > ctx_len:
                            oov_dataset[w][0] += [
                                sent_word_ids[idx - ctx_len:idx]
                            ]
                            oov_dataset[w][1] += [
                                sent_word_ids[idx + 1:idx + 1 + ctx_len]
                            ]

        # --- Pad left contexts at the front and right contexts at the
        # back, then concatenate per word (presumably giving shape
        # (n_contexts, 2 * ctx_len) — depends on pad_sequences). ---
        for w in valid_dataset:
            lefts = pad_sequences(valid_dataset[w][0],
                                  max_len=ctx_len,
                                  pad=pad,
                                  pre=True)
            rights = pad_sequences(valid_dataset[w][1],
                                   max_len=ctx_len,
                                   pad=pad,
                                   pre=False)
            valid_dataset[w] = np.concatenate((lefts, rights), axis=1)

        for w in train_dataset:
            lefts = pad_sequences(train_dataset[w][0],
                                  max_len=ctx_len,
                                  pad=pad,
                                  pre=True)
            rights = pad_sequences(train_dataset[w][1],
                                   max_len=ctx_len,
                                   pad=pad,
                                   pre=False)
            train_dataset[w] = np.concatenate((lefts, rights), axis=1)

        for w in oov_dataset:
            lefts = pad_sequences(oov_dataset[w][0],
                                  max_len=ctx_len,
                                  pad=pad,
                                  pre=True)
            rights = pad_sequences(oov_dataset[w][1],
                                   max_len=ctx_len,
                                   pad=pad,
                                   pre=False)
            oov_dataset[w] = np.concatenate((lefts, rights), axis=1)

        # --- Diagnostics. ---
        print(f"Train size: {len(train_dataset.keys())}")
        print(f"Valid size: {len(valid_dataset.keys())}")
        print(f"OOV size: {len(oov_words)}")

        print(
            f"Train >0 ctxts size: {len([w for w in train_dataset.keys() if len(train_dataset[w]) > 0])}"
        )
        print(
            f"Valid >0 ctxts size: {len([w for w in valid_dataset.keys() if len(valid_dataset[w]) > 0])}"
        )
        print(
            f"OOV >0 ctxts size: {len([w for w in oov_words if len(oov_dataset[w]) > 0])}"
        )

        # Histogram of context counts per word (buckets 0..9).
        train_ctxt_lens = [len(train_dataset[w]) for w in train_dataset.keys()]
        valid_ctxt_lens = [len(valid_dataset[w]) for w in valid_dataset.keys()]
        oov_ctxt_lens = [len(oov_dataset[w]) for w in oov_words]

        print(
            f"Number of Train with ctxts size = index {[train_ctxt_lens.count(i) for i in range(10)]}"
        )
        print(
            f"Number of Valid with ctxts size = index {[valid_ctxt_lens.count(i) for i in range(10)]}"
        )
        print(
            f"Number of OOV with ctxts size = index {[oov_ctxt_lens.count(i) for i in range(10)]}"
        )

        # Ten most frequent OOV words (argsort of negated counts = descending).
        oov_word_counts = [word_count[w] for w in oov_words]
        inds = np.argsort(-np.array(oov_word_counts))[:10]

        print(
            f"Number of OOV words with count = index {[oov_word_counts.count(i) for i in range(10)]}"
        )
        print(f"Most frequent OOV words: {[oov_words[i] for i in inds]} "
              f"frequencies: {[oov_word_counts[i] for i in inds]}")

        # --- Expose the built datasets on the instance. ---
        self.dictionary = dictionary
        self.train_dataset = train_dataset
        self.train_words = list(train_dataset.keys())
        self.valid_dataset = valid_dataset
        self.valid_words = list(valid_dataset.keys())
        self.oov_dataset = oov_dataset
        self.oov_words = list(oov_dataset.keys())
        self.w2v = w2v
        self.ctx_len = ctx_len
        # Filled lazily elsewhere (contexts-count -> words lookup, presumably).
        self.train_k2words = {}
        self.valid_k2words = {}
Beispiel #11
0
    # Log the run configuration, then load/split/preprocess the chosen
    # dataset and dispatch to the selected algorithm.
    # (Fragment: the enclosing def is outside this view, and the else
    # branch continues past the last line shown.)
    logging.info('batch size: {}'.format(args.batch_size))
    if args.algo == 'FL':
        logging.info('epochs: {}'.format(args.train_epochs))
    logging.info('*' * 52)

    if args.model == 'LSTM':
        # load shakespeare data (client: 715)
        d_train = load_data('shakespeare_train.h5', args)
        d_test = load_data('shakespeare_test.h5', args)

        # split data into two parts: support client / test client
        support_train_str, support_test_str, test_train_str, test_test_str = split_data(
            d_train, d_test, args)

        # preprocess data and construct a dictionary for whole data
        # (the same Dictionary instance is shared so both splits index
        # into one vocabulary)
        corpus = Dictionary()
        support_train, support_test = lstm_data_process(
            support_train_str, support_test_str, corpus, args)
        test_train, test_test = lstm_data_process(test_train_str,
                                                  test_test_str, corpus, args)
        args.ntokens = len(corpus)
        args.corpus = corpus

        # Run the algorithm selected by args.algo ('FL' or 'FR').
        if args.algo == 'FL':
            FL_LSTM(support_train, support_test, test_train, test_test, args)
        elif args.algo == 'FR':
            FR_LSTM(support_train, support_test, test_train, test_test, args)
    else:
        # load MNIST data (client: 3383 / train: 341873 / test: 40832)
        d_train = load_data('fed_emnist_digitsonly_train.h5', args)
        d_test = load_data('fed_emnist_digitsonly_test.h5', args)