def __init__(self):
        self.data = []
        self.dictionary = Dictionary()
        self.max_sent_len = 0

        # Read the positive reviews
        with open(POSITIVE_REVIEWS_FILE, encoding='utf-8') as f:
            positive_reviews = f.readlines()
        for review in positive_reviews:
            review = normalize_string(review)
            review_words = word_tokenize(review)
            self.dictionary.add_words(review_words)
            self.data.append((review, 1))
            self.max_sent_len = max(self.max_sent_len, 2 + len(review_words))

        # Read the negative reviews
        with open(NEGAGIVE_REVIEWS_FILE, encoding='utf-8') as f:
            negative_reviews = f.readlines()
        for review in negative_reviews:
            review = normalize_string(review)
            review_words = word_tokenize(review)
            self.dictionary.add_words(review_words)
            self.data.append((review, 0))
            self.max_sent_len = max(self.max_sent_len, 2 + len(review_words))

        # Split the original dataset into train/test
        random.shuffle(self.data)
        split_index = int(0.9 * len(self.data))
        self.train = AugmentedList(self.data[:split_index])
        self.test = AugmentedList(self.data[split_index:])
Example #2
class ReviewDataset:
    def __init__(self):
        self.data = []
        self.dictionary = Dictionary()
        self.max_sent_len = 0

        # Read the positive reviews
        with open(POSITIVE_REVIEWS_FILE, encoding='utf-8') as f:
            positive_reviews = f.readlines()
        for review in positive_reviews:
            review = normalize_string(review)
            review_words = word_tokenize(review)
            self.dictionary.add_words(review_words)
            self.data.append((review, 1))
            self.max_sent_len = max(self.max_sent_len, 2 + len(review_words))

        # Read the negative reviews
        with open(NEGAGIVE_REVIEWS_FILE, encoding='utf-8') as f:
            negative_reviews = f.readlines()
        for review in negative_reviews:
            review = normalize_string(review)
            review_words = word_tokenize(review)
            self.dictionary.add_words(review_words)
            self.data.append((review, 0))
            self.max_sent_len = max(self.max_sent_len, 2 + len(review_words))

        # Split the original dataset into train/test
        random.shuffle(self.data)
        split_index = int(0.9 * len(self.data))
        self.train = AugmentedList(self.data[:split_index])
        self.test = AugmentedList(self.data[split_index:])

    def next_batch(self, batch_size, mode=TRAIN_MODE):
        review_lengths, reviews, targets = [], [], []
        data = self.train if mode == TRAIN_MODE else self.test
        batch = data.next_items(batch_size)
        for (review, target) in batch:
            review_length = len(word_tokenize(normalize_string(review)))
            review = indexes_from_sentence(review, self.dictionary,
                                           self.max_sent_len)
            target = one_hot_encoding(2, target)
            reviews.append(review)
            targets.append(target)
            review_lengths.append(review_length)
        return review_lengths, reviews, targets
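The ReviewDataset example leans on several helpers that are not shown: Dictionary, AugmentedList, normalize_string, word_tokenize, indexes_from_sentence and one_hot_encoding. Below is a minimal sketch of what those pieces could look like, assuming Dictionary maps words to integer ids, AugmentedList serves consecutive batches, and normalize_string/word_tokenize come from the same codebase (or NLTK); the real project may implement them differently.

class Dictionary:
    def __init__(self):
        # reserve ids for padding and unknown words (an assumption of this sketch)
        self.word2idx = {'<pad>': 0, '<unk>': 1}
        self.idx2word = ['<pad>', '<unk>']

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = len(self.idx2word)
            self.idx2word.append(word)
        return self.word2idx[word]

    def add_words(self, words):
        for w in words:
            self.add_word(w)


class AugmentedList:
    """List wrapper that hands out consecutive batches, wrapping around at the end."""
    def __init__(self, items):
        self.items, self.cursor = items, 0

    def next_items(self, n):
        batch = [self.items[(self.cursor + i) % len(self.items)] for i in range(n)]
        self.cursor = (self.cursor + n) % len(self.items)
        return batch


def indexes_from_sentence(sentence, dictionary, max_len):
    # map tokens to ids, then pad/truncate to a fixed length
    ids = [dictionary.word2idx.get(w, dictionary.word2idx['<unk>'])
           for w in word_tokenize(normalize_string(sentence))]
    ids = ids[:max_len]
    return ids + [dictionary.word2idx['<pad>']] * (max_len - len(ids))


def one_hot_encoding(n_classes, label):
    # e.g. one_hot_encoding(2, 1) -> [0, 1]
    vec = [0] * n_classes
    vec[label] = 1
    return vec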
Example #3
    def __init__(self,
                 video_path,
                 translation_path,
                 spatial_transform=None,
                 temporal_transform=None,
                 sample_duration=4,
                 get_loader=get_default_video_loader):
        self.dictionary = Dictionary()
        self.data = make_dataset(video_path, translation_path, sample_duration,
                                 self.dictionary)

        self.spatial_transform = spatial_transform
        self.temporal_transform = temporal_transform
        self.loader = get_loader()
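The snippet stops before the item accessor. For a clip dataset of this shape, __getitem__ typically applies the temporal transform to the frame indices, loads the clip, and then applies the spatial transform frame by frame. A hedged sketch follows; the field names on the entries of self.data ('video', 'frame_indices', 'target') are assumptions, not taken from the original code.

    def __getitem__(self, index):
        # Hedged sketch: the field names of the data entries are assumed.
        item = self.data[index]
        frame_indices = item['frame_indices']
        if self.temporal_transform is not None:
            frame_indices = self.temporal_transform(frame_indices)
        clip = self.loader(item['video'], frame_indices)
        if self.spatial_transform is not None:
            clip = [self.spatial_transform(img) for img in clip]
        return clip, item['target']

    def __len__(self):
        return len(self.data)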
Example #4
 def __init__(self, sql=None, mongo=None, **kwargs):
     super(TweetTagger, self).__init__()
     if not sql:
         self._sql = SQLConnector(host=kwargs['host'],
                                  port=kwargs['port'],
                                  user=kwargs['user'],
                                  passwd=kwargs['password'],
                                  db=kwargs['db'])
     else:
         self._sql = sql
     if not mongo:
         self._mongo = MongoConnector(host=kwargs['H'], port=kwargs['mongoport'], db=kwargs['db'])
     else:
         self._mongo = mongo
     self._keyword = None
     self._bing = BingSearch()
     self._binged = Dictionary()
Example #5
    def __init__(self, path, maxlen, vocab_size=11000, lowercase=False, dictionary=None):
        self.dictionary = Dictionary()
        self.maxlen = maxlen
        if maxlen == -1:
            self.maxlen = np.inf
        self.lowercase = lowercase
        self.vocab_size = vocab_size
        self.train_path = path
        self.text = []
        self.hiddens = []
        self.labels = []
        # make the vocabulary from training set
        if dictionary is None:
            self.make_vocab()
        else:
            self.dictionary = dictionary

        self.train = self.tokenize(self.train_path)
        self.Item = namedtuple('Item', ['text', 'hidden', 'label'])
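make_vocab and tokenize are referenced here but not shown. Assuming the Dictionary used in this example exposes add_word and word2idx, they could be sketched roughly as below; the real implementations may differ (for instance in how <pad>/<unk> are handled or how over-long sentences are dropped).

    def make_vocab(self):
        # Hedged sketch: keep the vocab_size most frequent words from the training file.
        from collections import Counter
        counts = Counter()
        with open(self.train_path, encoding='utf-8') as f:
            for line in f:
                if self.lowercase:
                    line = line.lower()
                counts.update(line.split())
        self.dictionary.add_word('<pad>')
        self.dictionary.add_word('<unk>')
        for word, _ in counts.most_common(self.vocab_size):
            self.dictionary.add_word(word)

    def tokenize(self, path):
        # Hedged sketch: convert each line to word ids, skipping sentences longer than maxlen.
        unk = self.dictionary.word2idx['<unk>']
        lines = []
        with open(path, encoding='utf-8') as f:
            for line in f:
                if self.lowercase:
                    line = line.lower()
                words = line.split()
                if len(words) > self.maxlen:
                    continue
                lines.append([self.dictionary.word2idx.get(w, unk) for w in words])
        return lines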
Example #6
def tag_tweets(ngrams, tweet_id):
    tweet = Dictionary()
    tweet.add("tweet_db_id", tweet_id)
    prev_is_software = False
    for i in range(len(ngrams), 0, -1):
        for word in ngrams[i]:
            if prev_is_software:
                if check_version(word):
                    tweet.add("version", word)
                prev_is_software = False
            # Look for 'Get x free'
            # This doesn't always work, eg 'get your free ...' / 'get it free'
            # TODO: Also look for 'Get x on' etc
            # Also look for 'Download x now' etc
            elif re.match(r"^[Gg][Ee][Tt][\w.\s]*[Ff][Rr][Ee][Ee]$", word):
                software = word.replace(re.findall(re.compile(r"^[Gg][Ee][Tt]"), word)[0], "").strip()
                software = software.replace(re.findall(re.compile(r"[Ff][Rr][Ee][Ee]$"), word)[0], "").strip()
                if not sql.isSoftware(software):
                    try:
                        if check_bing(software, bing):
                            # Add newly-found software names to list, add to dictionary at end
                            new_software.add(software, tweet)
                            possible_tags.append(tweet_id)
                            # sql.insertSoftware(software) # This task now done at end
                    except ServerError as e:
                        print(e)
                        raise IncompleteTaggingError()
                tweet.add("price", "free")

            # REQUIRES REFACTORING
            elif re.match(r"^[Gg][Ee][Tt][\w.\s]*[Nn][Oo][Ww]$", word):
                software = word.replace(re.findall(re.compile(r"^[Gg][Ee][Tt]"), word)[0], "").strip()
                software = software.replace(re.findall(re.compile(r"[Nn][Oo][Ww]$"), word)[0], "").strip()
                if not sql.isSoftware(software):
                    try:
                        if check_bing(software, bing):
                            # Add newly-found software names to list, add to dictionary at end
                            new_software.add(software, tweet)
                            possible_tags.append(tweet_id)
                    except ServerError as e:
                        print(e)
                        raise IncompleteTaggingError()
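check_version (also used in Example #8) decides whether a token looks like a version string. A hedged sketch of such a predicate, assuming simple dotted-number versions like "2.0" or "v3.1.4"; the real helper may be stricter or looser:

import re

def check_version(word):
    # Hedged sketch: accept tokens such as "2.0", "10.6.8" or "v3.1.4".
    return bool(re.match(r'^v?\d+(\.\d+)+$', word))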
Example #7
    def _ngram_tagger(self, ngram, tweet_id):
        tags = Dictionary()
        tags.add('tweet_db_id', tweet_id)
        if self._keyword:
            keyword = self._keyword
            if self._sql.isSoftware(keyword):
                entry = self._sql.getSoftware()
                tags.add('software_name', keyword)
                tags.add('software_id', str(entry[0]))
            elif self._sql.isCompany(keyword):
                entry = self._sql.getCompany()
                tags.add('company_name', keyword)
                tags.add('company_id', str(entry[0]))
            elif self._sql.isOS(keyword):
                entry = self._sql.getOS()
                tags.add('os_name', keyword)
                tags.add('os_id', str(entry[0]))

        for tagged_words in ngram:
            self._tagger(tagged_words, tags)
        print('lol')
        print('2', tags)
        return tags
Example #8
class TweetTagger(object):
    def __init__(self, sql=None, mongo=None, **kwargs):
        super(TweetTagger, self).__init__()
        if not sql:
            self._sql = SQLConnector(host=kwargs['host'],
                                     port=kwargs['port'],
                                     user=kwargs['user'],
                                     passwd=kwargs['password'],
                                     db=kwargs['db'])
        else:
            self._sql = sql
        if not mongo:
            self._mongo = MongoConnector(host=kwargs['H'], port=kwargs['mongoport'], db=kwargs['db'])
        else:
            self._mongo = mongo
        self._keyword = None
        self._bing = BingSearch()
        self._binged = Dictionary()

    def _tag(self, tweet):
        tweet_id = str(tweet[0])
        original = tweet[1].decode('utf-8', 'ignore')
        text = original.lower().replace('#','').strip()
        #text = "download 60 hundred pounds 72 million $800 billion pounds holiday havoc v2 in itunes for free 99"

        urls = find_url(original)
        for url in urls:
            text = text.replace(url.lower(), "").strip()

        word_freqs = word_frequencies(text)
        #print word_freqs

        versions = find_version(text)

        words = regexp_tokenize(text, pattern=r'\w+([.,]\w+)*|\S+')
        prices = find_price(words)

        five_gram = self._create_ngram(tokenized=words, gram_length=5)

        tagged_tweet = self._ngram_tagger(five_gram, tweet_id)
        tagged_tweet.add('sentiment', tweet[2])
        tagged_tweet.add('tweet', original)
        tagged_tweet.add('url', urls)
        tagged_tweet.add('version', versions)
        tagged_tweet.add('price', prices)

        if tagged_tweet.contains('software_name'):
            query = {'software_name':tagged_tweet.get('software_name')}
            words = {}
            for w in word_freqs:
                words['words.'+w] = word_freqs[w]
            #print query, words
            self._mongo.update_freqs(query,words)

        return tagged_tweet

    def _create_ngram(self, tokenized, gram_length):
        pos_ = pos(tokenized)
        #print pos_
        gram = None
        while not gram: # In case tweet length less than gram_length
            gram = ngrams(pos_, gram_length)
            gram_length -= 1
        return gram

    def _ngram_tagger(self, ngram, tweet_id):
        tags = Dictionary()
        tags.add('tweet_db_id', tweet_id)
        if self._keyword:
            keyword = self._keyword
            if self._sql.isSoftware(keyword):
                entry = self._sql.getSoftware()
                tags.add('software_name', keyword)
                tags.add('software_id', str(entry[0]))
            elif self._sql.isCompany(keyword):
                entry = self._sql.getCompany()
                tags.add('company_name', keyword)
                tags.add('company_id', str(entry[0]))
            elif self._sql.isOS(keyword):
                entry = self._sql.getOS()
                tags.add('os_name', keyword)
                tags.add('os_id', str(entry[0]))

        for tagged_words in ngram:
            self._tagger(tagged_words, tags)
        print('lol')
        print('2', tags)
        return tags

    def _tagger(self, gram, tags):
        words = []
        tags_ = []
        phrase = ""
        pos_soft = ""
        possible_software = False
        # Compile regular expressions outside of for loop
        # for efficiency purposes
        free_price = re.compile(r'^free$', re.IGNORECASE)
        check_is = re.compile(r'^is$|^for$', re.IGNORECASE)
        check_get = re.compile(r'^download$|^get$', re.IGNORECASE)
        check_on = re.compile(r'^on$|^for$', re.IGNORECASE)
        for tagged_word in gram:
            word = tagged_word[0]
            tag = tagged_word[1]
            phrase += word + " "
            #print word, tag
            try:
                if tagIsNoun(tag):
                    if self._sql.isSoftware(word):
                        entry = self._sql.getSoftware()
                        try:
                            prev_tag = tags_.pop()
                            tags_.append(prev_tag)
                            if not tagIsDeterminantOrPreposition(prev_tag):
                                tags.add('software_name',word)
                                tags.add('software_id', str(entry[0]))
                        except:
                            possible_software = True
                    elif self._sql.isCompany(word):
                        entry = self._sql.getCompany()
                        try:
                            prev_tag = tags_.pop()
                            tags_.append(prev_tag)
                            if not tagIsDeterminantOrPreposition(prev_tag):
                                raise # Add to tags
                        except:
                            tags.add('company_name',word)
                            tags.add('company_id', str(entry[0]))
                    elif self._sql.isProgLang(word):
                        entry = self._sql.getProgLang()
                        try:
                            prev_tag = tags_.pop()
                            tags_.append(prev_tag)
                            if not tagIsDeterminantOrPreposition(prev_tag):
                                raise # Add to tags
                        except:
                            tags.add('programming_language_name', word)
                            tags.add('programming_language_id', str(entry[0]))

                if self._sql.isOS(word):
                    entry = self._sql.getOS()
                    try:
                        prev_tag = tags_.pop()
                        tags_.append(prev_tag)
                        prev = words.pop()
                        words.append(prev)
                        if not tagIsDeterminantOrPreposition(prev_tag) or re.match(check_on, prev):
                            tags.add('os_name', word)
                            tags.add('os_id', str(entry[0]))
                    except:
                        possible_software = True
            except ProgrammingError:
                pass

            if possible_software:
                if tagIsNoun(tag):
                    pos_soft += word + " "
                    if word == gram[len(gram)-1][0]: # If 'word' is last word in n-gram
                        pos_soft = ""
                else:
                    prev = words.pop()
                    words.append(prev)
                    if not re.match(check_get, prev):
                        if check_version(word):
                            tags.add('version', word)
                    possible_software = False
            if re.match(free_price, word):
                try:
                    prev = words.pop()
                    words.append(prev)
                    if re.match(check_is, prev):
                        tags.add('price', word)
                    else:
                        prev = tags_.pop()
                        tags_.append(prev)
                        if tagIsNoun(prev):
                            tags.add('price', word)
                except:
                    # This is first word in phrase
                    pass
            elif re.match(check_get, word):
                possible_software = True

            # Back in main part of loop
            words.append(word)
            tags_.append(tag)

        # End of for loop
        phrase = phrase.strip()
        if len(pos_soft) > 0:
            pos_soft = pos_soft.strip()
            if not tags.get('software_name'):
                if self._binged.contains(pos_soft):
                    if self._binged.get(pos_soft):
                        tags.add('software_name', pos_soft)
                else:
                    try:
                        bool_ = check_bing(pos_soft, self._bing)
                        self._binged[pos_soft] = bool_
                        if bool_:
                            # Insert into dictionary db?
                            tags.add('software_name', pos_soft)
                    except ServerError as e:
                        print(e)
                        raise IncompleteTaggingError()
        print('1', tags)
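A hedged usage sketch showing how the tagger above might be driven end to end. The constructor keywords come from __init__ above; the getTweets accessor on the SQL connector and the (id, text, sentiment) row layout are assumptions made purely for illustration.

# Hypothetical driver; getTweets and the row layout are assumed, not part of the code above.
tagger = TweetTagger(host='localhost', port=3306, user='scraper',
                     password='secret', db='tweets',
                     H='localhost', mongoport=27017)

for row in tagger._sql.getTweets():   # assumed to yield (id, utf-8 text, sentiment) tuples
    tagged = tagger._tag(row)
    print(tagged.get('software_name'), tagged.get('price'))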
Example #9
    if not best_acc or acc > best_acc:
        with open(MODEL_PATH % (dev_loss, acc), 'wb') as f:
            torch.save(model, f)
        best_acc = acc
        stop_counter = 0
    else:
        for param_group in optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * 0.2
        if EARLY_STOP != 0:
            stop_counter += 1

    return stop_counter


dictionary = Dictionary(path=DICT_PATH)

n_token = len(dictionary)

best_dev_loss = None
best_acc = None

model = Classifier({
    'dropout': DROPOUT,
    'n_tokens': n_token,
    'n_layers': N_LAYERS,
    'hidden_dim': HIDDEN_DIM,
    'embed_dim': EMBED_DIM,
    'pooling': POOLING,
    'dictionary': dictionary,
    'pretrained_wordvec': PRETRAINED_WORDVEC,
Example #10
import jieba
import jieba.posseg  # the part-of-speech tagging module has to be loaded separately

import pandas as pd
# from nltk.tokenize import word_tokenize

P2P = 'total'
# ********************************************************************* #
datapath = '../../data/' + P2P + '/' + P2P + '.csv'
outpath = '../../data/' + P2P + '/data(%s).json'
dictpath = '../../data/' + P2P + '/mydict(%s).json'
debug_flag = False
stop = False
# ********************************************************************* #

mydict = Dictionary()
mydict.add_word('<pad>')
# mydict.add_word('<unk>')
with open('../../data/stopping_word', 'r', encoding='utf-8') as f:
    stopping_word = [w.strip() for w in f.readlines()]

reviews = pd.read_csv(datapath, index_col=0, header=0, encoding='utf-8')
labels = list(reviews['reviewEvaluation'])
reviews = list(reviews['reviewContent'])
# reviews = open(datapath).readlines()
n_reviews = len(reviews)
print('%d reviews will be loaded...' % n_reviews)

if debug_flag:
Example #11
    path = 'wili'
    print(os.listdir(path))

    # Init random seed to get reproducible results
    seed = 1111
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Any results you write to the current directory are saved as output.
    x_train_full = open(path + "/x_train.txt").read().splitlines()
    y_train_full = open(path + "/y_train.txt").read().splitlines()
    print('Example:')
    print('LANG =', y_train_full[0])
    print('TEXT =', x_train_full[0])
    char_vocab = Dictionary()
    pad_token = '<pad>'  # reserve index 0 for padding
    unk_token = '<unk>'  # reserve index 1 for unknown token
    pad_index = char_vocab.add_token(pad_token)
    unk_index = char_vocab.add_token(unk_token)

    # join all the training sentences in a single string
    # and obtain the list of different characters with set
    chars = set(''.join(x_train_full))
    for char in sorted(chars):
        char_vocab.add_token(char)
    print("Vocabulary:", len(char_vocab), "UTF characters")

    lang_vocab = Dictionary()
    # use python set to obtain the list of languages without repetitions
    languages = set(y_train_full)
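Once both vocabularies are filled, each text is usually mapped to a fixed-length sequence of character indices before batching. A hedged sketch, assuming this Dictionary keeps its lookup table in a token2idx dict (the attribute name is an assumption about the implementation):

    def encode_chars(text, max_len=256):
        # Hedged sketch: truncate/pad to max_len using the reserved pad/unk indices.
        ids = [char_vocab.token2idx.get(ch, unk_index) for ch in text[:max_len]]
        return ids + [pad_index] * (max_len - len(ids))

    x_train_ids = [encode_chars(t) for t in x_train_full]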
Example #12
from io import BytesIO
from PIL import Image
from utils import Dictionary, get_words, load_model
import base64
import difflib
import numpy as np
import torch.nn.functional as F
import torch

net_recog = load_model('../recog_params.pkl')
ug_dict = Dictionary('../../../data/ug_words.txt')


def img2str(pic):
    # pic: numpy array
    figfile = BytesIO()
    Image.fromarray(pic).convert('RGBA').save(figfile, format='PNG')
    figfile.seek(0, 0)
    figdata_png = base64.b64encode(figfile.getvalue()).decode('ascii')
    return figdata_png


def compute(test_id='test'):
    pic_path = 'static/' + test_id + '.png'
    truth_path = 'static/' + test_id + '.txt'
    base_image, pic_with_box, word_pics = get_words(pic_path)
    word_pics = np.stack(word_pics)
    pred = F.softmax(net_recog(torch.from_numpy(word_pics)), dim=1)
    _, idxes = torch.max(pred, dim=1)

    res = []
Example #13
print(args)

start_time = time.time()

corpus_output_name = args.save + "corpus_index"
dictionary_output_name = args.save + "dictionary_index"

if not os.path.exists(args.save):
    os.makedirs(args.save)

f_out = open(corpus_output_name, 'w')

my_open = open
byte_mode = False
        
dict_c = Dictionary(byte_mode)

total_num_w = 0
with my_open(args.data, 'r') as f_in:
    for line_idx, line in enumerate(f_in):
        sent_spacy, gpt2_idx, gpt2_mapping = line.rstrip().split('\t')
        w_ind_list = []
        for w in sent_spacy.split():
            w_ind = dict_c.dict_check_add(w)
            w_ind_list.append(w_ind)
            total_num_w += 1
        dict_c.append_eos(w_ind_list)
        f_out.write(' '.join([str(x) for x in w_ind_list]) + '\t' + gpt2_idx + '\t' + gpt2_mapping + '\n')
        if line_idx % 1000000 == 0:
            print(line_idx)
            sys.stdout.flush()
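The Dictionary used here needs only two operations: dict_check_add, which returns the id of a word (adding it if unseen), and append_eos, which appends an end-of-sentence id. A hedged sketch of that interface, not necessarily the project's real implementation:

class Dictionary:
    def __init__(self, byte_mode=False):
        self.byte_mode = byte_mode
        self.w2idx = {'<eos>': 0}
        self.idx2w = ['<eos>']

    def dict_check_add(self, word):
        # return the word's id, adding it to the vocabulary on first sight
        if self.byte_mode:
            word = word.encode('utf-8')
        if word not in self.w2idx:
            self.w2idx[word] = len(self.idx2w)
            self.idx2w.append(word)
        return self.w2idx[word]

    def append_eos(self, w_ind_list):
        # terminate a sentence with the reserved end-of-sentence id
        w_ind_list.append(self.w2idx['<eos>'])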
Example #14
                lines.append(indices)

        print("Number of sentences dropped from {}: {} out of {} total".
              format(path, dropped, linecount))
        return lines

    def __getitem__(self, i):
        return self.Item(self.text[i], self.hiddens[i], self.labels[i])

model_args, idx2word, autoencoder, gan_gen, gan_disc \
        = load_models(args.load_path)
# print(idx2word)

word2idx = {word : index for index, word in idx2word.items()}

dic = Dictionary()
dic.word2idx = word2idx
dic.idx2word = idx2word

corpus = EncodeCopus(args.inf, maxlen=-1, dictionary=dic)

batches = corpus.get_batches()
autoencoder.cuda()
autoencoder.eval()

with open(args.inf + '.corpus', 'wb') as b:
    hiddens = []
    for index, (source, target, length) in enumerate(batches):
        # print(source, length)
        hidden = autoencoder.encode(Variable(source), length, None)
        # print(hidden)
Example #15
    def __init__(self,
                 corpus_dir,
                 w2v,
                 dictionary=None,
                 w2v_lbound=16,
                 w2v_ubound=2**16,
                 corpus_lbound=2,
                 ctx_len=12,
                 pad=0,
                 is_wikitext=False,
                 is_chimera=False,
                 is_jnlpba=False):
        if dictionary is None:
            dictionary = Dictionary(w2v.vector_size)

        if is_wikitext:
            corpus = [
                fi.lower().split()
                for fi in (corpus_dir /
                           'wiki.train.tokens').open().readlines()
            ]
            corpus += [
                fi.lower().split()
                for fi in (corpus_dir /
                           'wiki.valid.tokens').open().readlines()
            ]
            corpus += [
                fi.lower().split()
                for fi in (corpus_dir / 'wiki.test.tokens').open().readlines()
            ]
            corpus = np.array(corpus)
        elif is_chimera:
            corpus = []
            with (corpus_dir / 'dataset.txt').open(encoding='latin1') as f:
                lines = f.readlines()[1:]
                for i in range(0, len(lines), 2):
                    fields = lines[i].rstrip('\n').split('\t')
                    nonce = fields[1].lower()
                    sents = fields[3].lower().split('@@')
                    pivot_comp = lines[i + 1].split('\t')[5].lower().split('_')
                    corpus += [
                        sent.replace(nonce, pivot_comp[0 if i %
                                                       2 == 0 else 1]).split()
                        for i, sent in enumerate(sents)
                    ]
            corpus = np.unique(corpus)
        elif is_jnlpba:
            ps = ['train/Genia4ERtask2.iob2', 'test/Genia4EReval2.iob2']
            corpus = []
            sent = []
            for p in ps:
                for w in (corpus_dir / p).open().readlines():
                    if w.startswith("###MEDLINE:"):
                        if sent:
                            corpus += [sent]
                        sent = []
                        continue

                    w = w.strip().lower()
                    if w != '':
                        w = w.split()
                        w = w[0]
                        sent += [w]
                corpus += [sent]
            corpus = np.array(corpus)
        print(f"Corpus shape: {corpus.shape}")

        word_count = defaultdict(int)
        oov_words = []
        oov_dataset = {}
        for sent in corpus:
            for w in sent:
                word_count[w] += 1
                dictionary.add_word(w, w2v)
                if w not in oov_dataset and w not in dictionary.word2idx:
                    if w in string.punctuation:
                        continue
                    oov_words.append(w)
                    oov_dataset[w] = [[], []]

        words = []
        for w in dictionary.word2idx:
            if w != '<unk>' and w2v_ubound > w2v.wv.vocab[
                    w].count > w2v_lbound and word_count[w] > corpus_lbound:
                words.append(w)
        print(f"Number of valid words: {len(words)}")

        train_dataset = {}
        valid_dataset = {}
        for w, prob in zip(words, np.random.random(len(words))):
            if prob < 0.9:
                train_dataset[w] = [[], []]
            else:
                valid_dataset[w] = [[], []]

        for sent in corpus:
            words_valid = []
            words_train = []
            words_oov = []

            for idx, w in enumerate(sent):
                if w in valid_dataset:
                    words_valid += [[w, idx]]
                elif w in train_dataset:
                    words_train += [[w, idx]]
                elif w in oov_dataset:
                    words_oov += [[w, idx]]

            if len(words_valid) > 0 or len(words_train) > 0 or len(
                    words_oov) > 0:
                sent_word_ids = dictionary.sent2idx(sent)

                if len(words_valid) > 0:
                    for w, idx in words_valid:
                        if np.count_nonzero(
                                sent_word_ids[idx - ctx_len:idx + 1 +
                                              ctx_len]) > ctx_len:
                            valid_dataset[w][0] += [
                                sent_word_ids[idx - ctx_len:idx]
                            ]
                            valid_dataset[w][1] += [
                                sent_word_ids[idx + 1:idx + 1 + ctx_len]
                            ]

                if len(words_train) > 0:
                    for w, idx in words_train:
                        if np.count_nonzero(
                                sent_word_ids[idx - ctx_len:idx + 1 +
                                              ctx_len]) > ctx_len:
                            train_dataset[w][0] += [
                                sent_word_ids[idx - ctx_len:idx]
                            ]
                            train_dataset[w][1] += [
                                sent_word_ids[idx + 1:idx + 1 + ctx_len]
                            ]

                if len(words_oov) > 0:
                    for w, idx in words_oov:
                        if np.count_nonzero(
                                sent_word_ids[idx - ctx_len:idx + 1 +
                                              ctx_len]) > ctx_len:
                            oov_dataset[w][0] += [
                                sent_word_ids[idx - ctx_len:idx]
                            ]
                            oov_dataset[w][1] += [
                                sent_word_ids[idx + 1:idx + 1 + ctx_len]
                            ]

        for w in valid_dataset:
            lefts = pad_sequences(valid_dataset[w][0],
                                  max_len=ctx_len,
                                  pad=pad,
                                  pre=True)
            rights = pad_sequences(valid_dataset[w][1],
                                   max_len=ctx_len,
                                   pad=pad,
                                   pre=False)
            valid_dataset[w] = np.concatenate((lefts, rights), axis=1)

        for w in train_dataset:
            lefts = pad_sequences(train_dataset[w][0],
                                  max_len=ctx_len,
                                  pad=pad,
                                  pre=True)
            rights = pad_sequences(train_dataset[w][1],
                                   max_len=ctx_len,
                                   pad=pad,
                                   pre=False)
            train_dataset[w] = np.concatenate((lefts, rights), axis=1)

        for w in oov_dataset:
            lefts = pad_sequences(oov_dataset[w][0],
                                  max_len=ctx_len,
                                  pad=pad,
                                  pre=True)
            rights = pad_sequences(oov_dataset[w][1],
                                   max_len=ctx_len,
                                   pad=pad,
                                   pre=False)
            oov_dataset[w] = np.concatenate((lefts, rights), axis=1)

        print(f"Train size: {len(train_dataset.keys())}")
        print(f"Valid size: {len(valid_dataset.keys())}")
        print(f"OOV size: {len(oov_words)}")

        print(
            f"Train >0 ctxts size: {len([w for w in train_dataset.keys() if len(train_dataset[w]) > 0])}"
        )
        print(
            f"Valid >0 ctxts size: {len([w for w in valid_dataset.keys() if len(valid_dataset[w]) > 0])}"
        )
        print(
            f"OOV >0 ctxts size: {len([w for w in oov_words if len(oov_dataset[w]) > 0])}"
        )

        train_ctxt_lens = [len(train_dataset[w]) for w in train_dataset.keys()]
        valid_ctxt_lens = [len(valid_dataset[w]) for w in valid_dataset.keys()]
        oov_ctxt_lens = [len(oov_dataset[w]) for w in oov_words]

        print(
            f"Number of Train with ctxts size = index {[train_ctxt_lens.count(i) for i in range(10)]}"
        )
        print(
            f"Number of Valid with ctxts size = index {[valid_ctxt_lens.count(i) for i in range(10)]}"
        )
        print(
            f"Number of OOV with ctxts size = index {[oov_ctxt_lens.count(i) for i in range(10)]}"
        )

        oov_word_counts = [word_count[w] for w in oov_words]
        inds = np.argsort(-np.array(oov_word_counts))[:10]

        print(
            f"Number of OOV words with count = index {[oov_word_counts.count(i) for i in range(10)]}"
        )
        print(f"Most frequent OOV words: {[oov_words[i] for i in inds]} "
              f"frequencies: {[oov_word_counts[i] for i in inds]}")

        self.dictionary = dictionary
        self.train_dataset = train_dataset
        self.train_words = list(train_dataset.keys())
        self.valid_dataset = valid_dataset
        self.valid_words = list(valid_dataset.keys())
        self.oov_dataset = oov_dataset
        self.oov_words = list(oov_dataset.keys())
        self.w2v = w2v
        self.ctx_len = ctx_len
        self.train_k2words = {}
        self.valid_k2words = {}
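pad_sequences is called above with max_len, pad and pre arguments to left- or right-pad the context windows. A hedged sketch matching that call signature; the real helper may differ in how it truncates:

import numpy as np

def pad_sequences(seqs, max_len, pad=0, pre=True):
    # Hedged sketch: pad (or truncate) every sequence to max_len; pre=True pads
    # and truncates on the left, pre=False on the right.
    out = np.full((len(seqs), max_len), pad, dtype=np.int64)
    for i, seq in enumerate(seqs):
        seq = list(seq)[-max_len:] if pre else list(seq)[:max_len]
        if pre:
            out[i, max_len - len(seq):] = seq
        else:
            out[i, :len(seq)] = seq
    return out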
Example #16
    logging.info('batch size: {}'.format(args.batch_size))
    if args.algo == 'FL':
        logging.info('epochs: {}'.format(args.train_epochs))
    logging.info('*' * 52)

    if args.model == 'LSTM':
        # load shakespeare data (client: 715)
        d_train = load_data('shakespeare_train.h5', args)
        d_test = load_data('shakespeare_test.h5', args)

        # split data into two parts: support client / test client
        support_train_str, support_test_str, test_train_str, test_test_str = split_data(
            d_train, d_test, args)

        # preprocess data and construct a dictionary for whole data
        corpus = Dictionary()
        support_train, support_test = lstm_data_process(
            support_train_str, support_test_str, corpus, args)
        test_train, test_test = lstm_data_process(test_train_str,
                                                  test_test_str, corpus, args)
        args.ntokens = len(corpus)
        args.corpus = corpus

        if args.algo == 'FL':
            FL_LSTM(support_train, support_test, test_train, test_test, args)
        elif args.algo == 'FR':
            FR_LSTM(support_train, support_test, test_train, test_test, args)
    else:
        # load MNIST data (client: 3383 / train: 341873 / test: 40832)
        d_train = load_data('fed_emnist_digitsonly_train.h5', args)
        d_test = load_data('fed_emnist_digitsonly_test.h5', args)