class ReviewDataset:
    def __init__(self):
        self.data = []
        self.dictionary = Dictionary()
        self.max_sent_len = 0

        # Read the positive reviews
        with open(POSITIVE_REVIEWS_FILE, encoding='utf-8') as f:
            positive_reviews = f.readlines()
        for review in positive_reviews:
            review = normalize_string(review)
            review_words = word_tokenize(review)
            self.dictionary.add_words(review_words)
            self.data.append((review, 1))
            self.max_sent_len = max(self.max_sent_len, 2 + len(review_words))

        # Read the negative reviews
        with open(NEGAGIVE_REVIEWS_FILE, encoding='utf-8') as f:
            negative_reviews = f.readlines()
        for review in negative_reviews:
            review = normalize_string(review)
            review_words = word_tokenize(review)
            self.dictionary.add_words(review_words)
            self.data.append((review, 0))
            self.max_sent_len = max(self.max_sent_len, 2 + len(review_words))

        # Split the original dataset into train/test
        random.shuffle(self.data)
        split_index = int(0.9 * len(self.data))
        self.train = AugmentedList(self.data[:split_index])
        self.test = AugmentedList(self.data[split_index:])

    def next_batch(self, batch_size, mode=TRAIN_MODE):
        review_lengths, reviews, targets = [], [], []
        data = self.train if mode == TRAIN_MODE else self.test
        batch = data.next_items(batch_size)
        for (review, target) in batch:
            review_length = len(word_tokenize(normalize_string(review)))
            review = indexes_from_sentence(review, self.dictionary, self.max_sent_len)
            target = one_hot_encoding(2, target)
            reviews.append(review)
            targets.append(target)
            review_lengths.append(review_length)
        return review_lengths, reviews, targets
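# Usage sketch (editorial addition, not part of the original source): a minimal
# example of consuming ReviewDataset.next_batch for mini-batch training. It
# assumes the module-level constants (POSITIVE_REVIEWS_FILE, TRAIN_MODE, ...)
# and helpers (Dictionary, AugmentedList, normalize_string, word_tokenize,
# indexes_from_sentence, one_hot_encoding) referenced above are defined.
if __name__ == '__main__':
    dataset = ReviewDataset()
    lengths, reviews, targets = dataset.next_batch(batch_size=32, mode=TRAIN_MODE)
    # Each review is presumably a list of word indices sized to max_sent_len,
    # and each target a one-hot vector of length 2 (0 = negative, 1 = positive).
    print(len(reviews), lengths[0], targets[0])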
def __init__(self, video_path, translation_path, spatial_transform=None,
             temporal_transform=None, sample_duration=4,
             get_loader=get_default_video_loader):
    self.dictionary = Dictionary()
    self.data = make_dataset(video_path, translation_path, sample_duration,
                             self.dictionary)
    self.spatial_transform = spatial_transform
    self.temporal_transform = temporal_transform
    self.loader = get_loader()
def __init__(self, path, maxlen, vocab_size=11000, lowercase=False, dictionary=None):
    self.dictionary = Dictionary()
    self.maxlen = maxlen
    if maxlen == -1:
        self.maxlen = np.inf
    self.lowercase = lowercase
    self.vocab_size = vocab_size
    self.train_path = path
    self.text = []
    self.hiddens = []
    self.labels = []
    # make the vocabulary from training set
    if dictionary is None:
        self.make_vocab()
    else:
        self.dictionary = dictionary
    self.train = self.tokenize(self.train_path)
    self.Item = namedtuple('Item', ['text', 'hidden', 'label'])
def tag_tweets(ngrams, tweet_id):
    tweet = Dictionary()
    tweet.add("tweet_db_id", tweet_id)
    prev_is_software = False
    for i in range(len(ngrams), 0, -1):
        for word in ngrams[i]:
            if prev_is_software:
                if check_version(word):
                    tweet.add("version", word)
                prev_is_software = False
            # Look for 'Get x free'
            # This doesn't always work, eg 'get your free ...' / 'get it free'
            # TODO: Also look for 'Get x on' etc
            # Also look for 'Download x now' etc
            elif re.match(r"^[Gg][Ee][Tt][\w.\s]*[Ff][Rr][Ee][Ee]$", word):
                software = word.replace(re.findall(re.compile(r"^[Gg][Ee][Tt]"), word)[0], "").strip()
                software = software.replace(re.findall(re.compile(r"[Ff][Rr][Ee][Ee]$"), word)[0], "").strip()
                if not sql.isSoftware(software):
                    try:
                        if check_bing(software, bing):
                            # Add newly-found software names to list, add to dictionary at end
                            new_software.add(software, tweet)
                            possible_tags.append(tweet_id)
                            # sql.insertSoftware(software)  # This task now done at end
                    except ServerError, e:
                        print e
                        raise IncompleteTaggingError()
                tweet.add("price", "free")
            # REQUIRES REFACTORING
            elif re.match(r"^[Gg][Ee][Tt][\w.\s]*[Nn][Oo][Ww]$", word):
                software = word.replace(re.findall(re.compile(r"^[Gg][Ee][Tt]"), word)[0], "").strip()
                software = software.replace(re.findall(re.compile(r"[Nn][Oo][Ww]$"), word)[0], "").strip()
                if not sql.isSoftware(software):
                    try:
                        if check_bing(software, bing):
                            # Add newly-found software names to list, add to dictionary at end
                            new_software.add(software, tweet)
                            possible_tags.append(tweet_id)
                    except ServerError, e:
                        print e
                        raise IncompleteTaggingError()
class TweetTagger(object):
    def __init__(self, sql=None, mongo=None, **kwargs):
        super(TweetTagger, self).__init__()
        if not sql:
            self._sql = SQLConnector(host=kwargs['host'], port=kwargs['port'],
                                     user=kwargs['user'], passwd=kwargs['password'],
                                     db=kwargs['db'])
        else:
            self._sql = sql
        if not mongo:
            self._mongo = MongoConnector(host=kwargs['H'], port=kwargs['mongoport'],
                                         db=kwargs['db'])
        else:
            self._mongo = mongo
        self._keyword = None
        self._bing = BingSearch()
        self._binged = Dictionary()

    def _tag(self, tweet):
        tweet_id = str(tweet[0])
        original = tweet[1].decode('utf-8', 'ignore')
        text = original.lower().replace('#', '').strip()
        #text = "download 60 hundred pounds 72 million $800 billion pounds holiday havoc v2 in itunes for free 99"
        urls = find_url(original)
        for url in urls:
            text = text.replace(url.lower(), "").strip()
        word_freqs = word_frequencies(text)
        #print word_freqs
        versions = find_version(text)
        words = regexp_tokenize(text, pattern=r'\w+([.,]\w+)*|\S+')
        prices = find_price(words)
        five_gram = self._create_ngram(tokenized=words, gram_length=5)
        tagged_tweet = self._ngram_tagger(five_gram, tweet_id)
        tagged_tweet.add('sentiment', tweet[2])
        tagged_tweet.add('tweet', original)
        tagged_tweet.add('url', urls)
        tagged_tweet.add('version', versions)
        tagged_tweet.add('price', prices)
        if tagged_tweet.contains('software_name'):
            query = {'software_name': tagged_tweet.get('software_name')}
            words = {}
            for w in word_freqs:
                words['words.' + w] = word_freqs[w]
            #print query, words
            self._mongo.update_freqs(query, words)
        return tagged_tweet

    def _create_ngram(self, tokenized, gram_length):
        pos_ = pos(tokenized)
        #print pos_
        gram = None
        while not gram:
            # In case tweet length less than gram_length
            gram = ngrams(pos_, gram_length)
            gram_length -= 1
        return gram

    def _ngram_tagger(self, ngram, tweet_id):
        tags = Dictionary()
        tags.add('tweet_db_id', tweet_id)
        if self._keyword:
            keyword = self._keyword
            if self._sql.isSoftware(keyword):
                entry = self._sql.getSoftware()
                tags.add('software_name', keyword)
                tags.add('software_id', str(entry[0]))
            elif self._sql.isCompany(keyword):
                entry = self._sql.getCompany()
                tags.add('company_name', keyword)
                tags.add('company_id', str(entry[0]))
            elif self._sql.isOS(keyword):
                entry = self._sql.getOS()
                tags.add('os_name', keyword)
                tags.add('os_id', str(entry[0]))
        for tagged_words in ngram:
            self._tagger(tagged_words, tags)
        print 'lol'
        print '2', tags
        return tags

    def _tagger(self, gram, tags):
        words = []
        tags_ = []
        phrase = ""
        pos_soft = ""
        possible_software = False
        # Compile regular expressions outside of for loop
        # for efficiency purposes
        free_price = re.compile(r'^free$', re.IGNORECASE)
        check_is = re.compile(r'^is$|^for$', re.IGNORECASE)
        check_get = re.compile(r'^download$|^get$', re.IGNORECASE)
        check_on = re.compile(r'^on$|^for$', re.IGNORECASE)
        for tagged_word in gram:
            word = tagged_word[0]
            tag = tagged_word[1]
            phrase += word + " "
            #print word, tag
            try:
                if tagIsNoun(tag):
                    if self._sql.isSoftware(word):
                        entry = self._sql.getSoftware()
                        try:
                            prev_tag = tags_.pop()
                            tags_.append(prev_tag)
                            if not tagIsDeterminantOrPreposition(prev_tag):
                                tags.add('software_name', word)
                                tags.add('software_id', str(entry[0]))
                        except:
                            possible_software = True
                    elif self._sql.isCompany(word):
                        entry = self._sql.getCompany()
                        try:
                            prev_tag = tags_.pop()
                            tags_.append(prev_tag)
                            if not tagIsDeterminantOrPreposition(prev_tag):
                                raise  # Add to tags
                        except:
                            tags.add('company_name', word)
                            tags.add('company_id', str(entry[0]))
                    elif self._sql.isProgLang(word):
                        entry = self._sql.getProgLang()
                        try:
                            prev_tag = tags_.pop()
                            tags_.append(prev_tag)
                            if not tagIsDeterminantOrPreposition(prev_tag):
                                raise  # Add to tags
                        except:
                            tags.add('programming_language_name', word)
                            tags.add('programming_language_id', str(entry[0]))
                    if self._sql.isOS(word):
                        entry = self._sql.getOS()
                        try:
                            prev_tag = tags_.pop()
                            tags_.append(prev_tag)
                            prev = words.pop()
                            words.append(prev)
                            if not tagIsDeterminantOrPreposition(prev_tag) or re.match(check_on, prev):
                                tags.add('os_name', word)
                                tags.add('os_id', str(entry[0]))
                        except:
                            possible_software = True
            except ProgrammingError:
                pass
            if possible_software:
                if tagIsNoun(tag):
                    pos_soft += word + " "
                    if word == gram[len(gram)-1][0]:
                        # If 'word' is last word in n-gram
                        pos_soft = ""
                else:
                    prev = words.pop()
                    words.append(prev)
                    if not re.match(check_get, prev):
                        if check_version(word):
                            tags.add('version', word)
                        possible_software = False
            if re.match(free_price, word):
                try:
                    prev = words.pop()
                    words.append(prev)
                    if re.match(check_is, prev):
                        tags.add('price', word)
                    else:
                        prev = tags_.pop()
                        tags_.append(prev)
                        if tagIsNoun(prev):
                            tags.add('price', word)
                except:
                    # This is first word in phrase
                    pass
            elif re.match(check_get, word):
                possible_software = True
            # Back in main part of loop
            words.append(word)
            tags_.append(tag)
        # End of for loop
        phrase = phrase.strip()
        if len(pos_soft) > 0:
            pos_soft = pos_soft.strip()
            if not tags.get('software_name'):
                if self._binged.contains(pos_soft):
                    if self._binged.get(pos_soft):
                        tags.add('software_name', pos_soft)
                else:
                    try:
                        bool_ = check_bing(pos_soft, self._bing)
                        self._binged[pos_soft] = bool_
                        if bool_:
                            # Insert into dictionary db?
                            tags.add('software_name', pos_soft)
                    except ServerError, e:
                        print e
                        raise IncompleteTaggingError()
        print '1', tags
    if not best_acc or acc > best_acc:
        # Dev accuracy improved: checkpoint the model and reset early stopping.
        with open(MODEL_PATH % (dev_loss, acc), 'wb') as f:
            torch.save(model, f)
        best_acc = acc
        stop_counter = 0
    else:
        # No improvement: decay the learning rate and count towards early stopping.
        for param_group in optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * 0.2
        if EARLY_STOP != 0:
            stop_counter += 1
    return stop_counter


dictionary = Dictionary(path=DICT_PATH)
n_token = len(dictionary)
best_dev_loss = None
best_acc = None
model = Classifier({
    'dropout': DROPOUT,
    'n_tokens': n_token,
    'n_layers': N_LAYERS,
    'hidden_dim': HIDDEN_DIM,
    'embed_dim': EMBED_DIM,
    'pooling': POOLING,
    'dictionary': dictionary,
    'pretrained_wordvec': PRETRAINED_WORDVEC,
import jieba
import jieba.posseg  # a separate POS-tagging module needs to be loaded
import pandas as pd
# from nltk.tokenize import word_tokenize

P2P = 'total'
# ********************************************************************* #
datapath = '../../data/' + P2P + '/' + P2P + '.csv'
outpath = '../../data/' + P2P + '/data(%s).json'
dictpath = '../../data/' + P2P + '/mydict(%s).json'
debug_flag = False
stop = False
# ********************************************************************* #
mydict = Dictionary()
mydict.add_word('<pad>')
# mydict.add_word('<unk>')

stopping_word = open('../../data/stopping_word', 'r', encoding='utf-8').readlines()
for i in range(len(stopping_word)):
    stopping_word[i] = stopping_word[i].strip()

reviews = pd.read_csv(datapath, index_col=0, header=0, encoding='utf-8')
labels = list(reviews['reviewEvaluation'])
reviews = list(reviews['reviewContent'])
# reviews = open(datapath).readlines()
n_reviews = len(reviews)
print('%d reviews will be loaded...' % n_reviews)
if debug_flag:
path = 'wili'
print(os.listdir(path))

# Init random seed to get reproducible results
seed = 1111
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Any results you write to the current directory are saved as output.
x_train_full = open(path + "/x_train.txt").read().splitlines()
y_train_full = open(path + "/y_train.txt").read().splitlines()
print('Example:')
print('LANG =', y_train_full[0])
print('TEXT =', x_train_full[0])

char_vocab = Dictionary()
pad_token = '<pad>'  # reserve index 0 for padding
unk_token = '<unk>'  # reserve index 1 for unknown token
pad_index = char_vocab.add_token(pad_token)
unk_index = char_vocab.add_token(unk_token)

# join all the training sentences in a single string
# and obtain the list of different characters with set
chars = set(''.join(x_train_full))
for char in sorted(chars):
    char_vocab.add_token(char)
print("Vocabulary:", len(char_vocab), "UTF characters")

lang_vocab = Dictionary()
# use python set to obtain the list of languages without repetitions
languages = set(y_train_full)
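# Sketch (editorial addition): the code above relies only on Dictionary exposing
# add_token() -> index and __len__(). A minimal token-to-index mapping with that
# interface might look as follows; the project's actual Dictionary class is not
# shown here and may differ.
class _MinimalDictionary:
    def __init__(self):
        self.token2idx = {}
        self.idx2token = []

    def add_token(self, token):
        # Return the existing index, or append the token and return its new index.
        if token not in self.token2idx:
            self.token2idx[token] = len(self.idx2token)
            self.idx2token.append(token)
        return self.token2idx[token]

    def __len__(self):
        return len(self.idx2token)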
from io import BytesIO

from PIL import Image
from utils import Dictionary, get_words, load_model
import base64
import difflib
import numpy as np
import torch.nn.functional as F
import torch

net_recog = load_model('../recog_params.pkl')
ug_dict = Dictionary('../../../data/ug_words.txt')


def img2str(pic):
    # pic: numpy array
    figfile = BytesIO()
    pic = Image.fromarray(pic).convert('RGBA').save(figfile, format='PNG')
    figfile.seek(0, 0)
    figdata_png = base64.b64encode(figfile.getvalue()).decode('ascii')
    return figdata_png


def compute(test_id='test'):
    pic_path = 'static/' + test_id + '.png'
    truth_path = 'static/' + test_id + '.txt'
    base_image, pic_with_box, word_pics = get_words(pic_path)
    word_pics = np.stack(word_pics)
    pred = F.softmax(net_recog(torch.from_numpy(word_pics)), dim=1)
    _, idxes = torch.max(pred, dim=1)
    res = []
print(args)
start_time = time.time()
corpus_output_name = args.save + "corpus_index"
dictionary_output_name = args.save + "dictionary_index"
if not os.path.exists(args.save):
    os.makedirs(args.save)

f_out = open(corpus_output_name, 'w')
my_open = open
byte_mode = False
dict_c = Dictionary(byte_mode)

total_num_w = 0
with my_open(args.data, 'r') as f_in:
    for line_idx, line in enumerate(f_in):
        # Each input line holds three tab-separated fields: the spaCy-tokenized
        # sentence, the GPT-2 token indices, and the token-alignment mapping.
        sent_spacy, gpt2_idx, gpt2_mapping = line.rstrip().split('\t')
        w_ind_list = []
        for w in sent_spacy.split():
            # Look the word up in the dictionary, adding it if unseen.
            w_ind = dict_c.dict_check_add(w)
            w_ind_list.append(w_ind)
            total_num_w += 1
        dict_c.append_eos(w_ind_list)
        f_out.write(' '.join([str(x) for x in w_ind_list]) + '\t' + gpt2_idx + '\t' + gpt2_mapping + '\n')
        if line_idx % 1000000 == 0:
            print(line_idx)
            sys.stdout.flush()
        lines.append(indices)

    print("Number of sentences dropped from {}: {} out of {} total".
          format(path, dropped, linecount))
    return lines


def __index__(self, i):
    return self.Item(self.text[i], self.hiddens[i], self.labels[i])


model_args, idx2word, autoencoder, gan_gen, gan_disc \
    = load_models(args.load_path)
# print(idx2word)
word2idx = {word: index for index, word in idx2word.items()}
dic = Dictionary()
dic.word2idx = word2idx
dic.idx2word = idx2word
corpus = EncodeCopus(args.inf, maxlen=-1, dictionary=dic)
batches = corpus.get_batches()
autoencoder.cuda()
autoencoder.eval()

with open(args.inf + '.corpus', 'wb') as b:
    hiddens = []
    for index, (source, target, length) in enumerate(batches):
        # print(source, length)
        hidden = autoencoder.encode(Variable(source), length, None)
        # print(hidden)
def __init__(self, corpus_dir, w2v, dictionary=None, w2v_lbound=16,
             w2v_ubound=2**16, corpus_lbound=2, ctx_len=12, pad=0,
             is_wikitext=False, is_chimera=False, is_jnlpba=False):
    if dictionary is None:
        dictionary = Dictionary(w2v.vector_size)

    if is_wikitext:
        corpus = [
            fi.lower().split()
            for fi in (corpus_dir / 'wiki.train.tokens').open().readlines()
        ]
        corpus += [
            fi.lower().split()
            for fi in (corpus_dir / 'wiki.valid.tokens').open().readlines()
        ]
        corpus += [
            fi.lower().split()
            for fi in (corpus_dir / 'wiki.test.tokens').open().readlines()
        ]
        corpus = np.array(corpus)
    elif is_chimera:
        corpus = []
        with (corpus_dir / 'dataset.txt').open(encoding='latin1') as f:
            lines = f.readlines()[1:]
            for i in range(0, len(lines), 2):
                fields = lines[i].rstrip('\n').split('\t')
                nonce = fields[1].lower()
                sents = fields[3].lower().split('@@')
                pivot_comp = lines[i + 1].split('\t')[5].lower().split('_')
                corpus += [
                    sent.replace(nonce, pivot_comp[0 if i % 2 == 0 else 1]).split()
                    for i, sent in enumerate(sents)
                ]
        corpus = np.unique(corpus)
    elif is_jnlpba:
        ps = ['train/Genia4ERtask2.iob2', 'test/Genia4EReval2.iob2']
        corpus = []
        sent = []
        for p in ps:
            for w in (corpus_dir / p).open().readlines():
                if w.startswith("###MEDLINE:"):
                    if sent:
                        corpus += [sent]
                    sent = []
                    continue
                w = w.strip().lower()
                if w != '':
                    w = w.split()
                    w = w[0]
                    sent += [w]
        corpus += [sent]
        corpus = np.array(corpus)

    print(f"Corpus shape: {corpus.shape}")

    word_count = defaultdict(int)
    oov_words = []
    oov_dataset = {}
    for sent in corpus:
        for w in sent:
            word_count[w] += 1
            dictionary.add_word(w, w2v)
            if w not in oov_dataset and w not in dictionary.word2idx:
                if w in string.punctuation:
                    continue
                oov_words.append(w)
                oov_dataset[w] = [[], []]

    words = []
    for w in dictionary.word2idx:
        if w != '<unk>' and w2v_ubound > w2v.wv.vocab[w].count > w2v_lbound \
                and word_count[w] > corpus_lbound:
            words.append(w)
    print(f"Number of valid words: {len(words)}")

    train_dataset = {}
    valid_dataset = {}
    for w, prob in zip(words, np.random.random(len(words))):
        if prob < 0.9:
            train_dataset[w] = [[], []]
        else:
            valid_dataset[w] = [[], []]

    for sent in corpus:
        words_valid = []
        words_train = []
        words_oov = []
        for idx, w in enumerate(sent):
            if w in valid_dataset:
                words_valid += [[w, idx]]
            elif w in train_dataset:
                words_train += [[w, idx]]
            elif w in oov_dataset:
                words_oov += [[w, idx]]
        if len(words_valid) > 0 or len(words_train) > 0 or len(words_oov) > 0:
            sent_word_ids = dictionary.sent2idx(sent)
            if len(words_valid) > 0:
                for w, idx in words_valid:
                    if np.count_nonzero(sent_word_ids[idx - ctx_len:idx + 1 + ctx_len]) > ctx_len:
                        valid_dataset[w][0] += [sent_word_ids[idx - ctx_len:idx]]
                        valid_dataset[w][1] += [sent_word_ids[idx + 1:idx + 1 + ctx_len]]
            if len(words_train) > 0:
                for w, idx in words_train:
                    if np.count_nonzero(sent_word_ids[idx - ctx_len:idx + 1 + ctx_len]) > ctx_len:
                        train_dataset[w][0] += [sent_word_ids[idx - ctx_len:idx]]
                        train_dataset[w][1] += [sent_word_ids[idx + 1:idx + 1 + ctx_len]]
            if len(words_oov) > 0:
                for w, idx in words_oov:
                    if np.count_nonzero(sent_word_ids[idx - ctx_len:idx + 1 + ctx_len]) > ctx_len:
                        oov_dataset[w][0] += [sent_word_ids[idx - ctx_len:idx]]
                        oov_dataset[w][1] += [sent_word_ids[idx + 1:idx + 1 + ctx_len]]

    for w in valid_dataset:
        lefts = pad_sequences(valid_dataset[w][0], max_len=ctx_len, pad=pad, pre=True)
        rights = pad_sequences(valid_dataset[w][1], max_len=ctx_len, pad=pad, pre=False)
        valid_dataset[w] = np.concatenate((lefts, rights), axis=1)
    for w in train_dataset:
        lefts = pad_sequences(train_dataset[w][0], max_len=ctx_len, pad=pad, pre=True)
        rights = pad_sequences(train_dataset[w][1], max_len=ctx_len, pad=pad, pre=False)
        train_dataset[w] = np.concatenate((lefts, rights), axis=1)
    for w in oov_dataset:
        lefts = pad_sequences(oov_dataset[w][0], max_len=ctx_len, pad=pad, pre=True)
        rights = pad_sequences(oov_dataset[w][1], max_len=ctx_len, pad=pad, pre=False)
        oov_dataset[w] = np.concatenate((lefts, rights), axis=1)

    print(f"Train size: {len(train_dataset.keys())}")
    print(f"Valid size: {len(valid_dataset.keys())}")
    print(f"OOV size: {len(oov_words)}")
    print(f"Train >0 ctxts size: {len([w for w in train_dataset.keys() if len(train_dataset[w]) > 0])}")
    print(f"Valid >0 ctxts size: {len([w for w in valid_dataset.keys() if len(valid_dataset[w]) > 0])}")
    print(f"OOV >0 ctxts size: {len([w for w in oov_words if len(oov_dataset[w]) > 0])}")

    train_ctxt_lens = [len(train_dataset[w]) for w in train_dataset.keys()]
    valid_ctxt_lens = [len(valid_dataset[w]) for w in valid_dataset.keys()]
    oov_ctxt_lens = [len(oov_dataset[w]) for w in oov_words]
    print(f"Number of Train with ctxts size = index {[train_ctxt_lens.count(i) for i in range(10)]}")
    print(f"Number of Valid with ctxts size = index {[valid_ctxt_lens.count(i) for i in range(10)]}")
    print(f"Number of OOV with ctxts size = index {[oov_ctxt_lens.count(i) for i in range(10)]}")

    oov_word_counts = [word_count[w] for w in oov_words]
    inds = np.argsort(-np.array(oov_word_counts))[:10]
    print(f"Number of OOV words with count = index {[oov_word_counts.count(i) for i in range(10)]}")
    print(f"Most frequent OOV words: {[oov_words[i] for i in inds]} "
          f"frequencies: {[oov_word_counts[i] for i in inds]}")

    self.dictionary = dictionary
    self.train_dataset = train_dataset
    self.train_words = list(train_dataset.keys())
    self.valid_dataset = valid_dataset
    self.valid_words = list(valid_dataset.keys())
    self.oov_dataset = oov_dataset
    self.oov_words = list(oov_dataset.keys())
    self.w2v = w2v
    self.ctx_len = ctx_len
    self.train_k2words = {}
    self.valid_k2words = {}
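# Usage note (editorial addition): after __init__ finishes, train_dataset,
# valid_dataset and oov_dataset each map a word to an array of shape
# (num_contexts, 2 * ctx_len): pre-padded left-context ids concatenated with
# post-padded right-context ids. A hypothetical caller (class name is a
# placeholder, since this fragment does not show it) might do:
#
#     corpus = ContextCorpus(Path('data/wikitext-2'), w2v_model, is_wikitext=True)
#     word = corpus.train_words[0]
#     contexts = corpus.train_dataset[word]  # shape (n, 2 * corpus.ctx_len)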
logging.info('batch size: {}'.format(args.batch_size))
if args.algo == 'FL':
    logging.info('epochs: {}'.format(args.train_epochs))
logging.info('*' * 52)

if args.model == 'LSTM':
    # load shakespeare data (client: 715)
    d_train = load_data('shakespeare_train.h5', args)
    d_test = load_data('shakespeare_test.h5', args)
    # split data into two parts: support client / test client
    support_train_str, support_test_str, test_train_str, test_test_str = split_data(
        d_train, d_test, args)
    # preprocess data and construct a dictionary for whole data
    corpus = Dictionary()
    support_train, support_test = lstm_data_process(
        support_train_str, support_test_str, corpus, args)
    test_train, test_test = lstm_data_process(
        test_train_str, test_test_str, corpus, args)
    args.ntokens = len(corpus)
    args.corpus = corpus
    if args.algo == 'FL':
        FL_LSTM(support_train, support_test, test_train, test_test, args)
    elif args.algo == 'FR':
        FR_LSTM(support_train, support_test, test_train, test_test, args)
else:
    # load MNIST data (client: 3383 / train: 341873 / test: 40832)
    d_train = load_data('fed_emnist_digitsonly_train.h5', args)
    d_test = load_data('fed_emnist_digitsonly_test.h5', args)