def __init__(self, name, datasets, portion_percent=1.0, sort_key=None):
    super().__init__(name, datasets, portion_percent, sort_key)

    input_vocab = Counter()
    special_tokens = []
    for dataset in self.datasets:
        input_vocab += dataset.input_vocab.freq_dict
        for token in dataset.input_vocab.special_tokens:
            if token not in special_tokens:
                special_tokens.append(token)

    self.input_vocab = Vocab(input_vocab, special_tokens)

    output_vocab = Counter()
    special_tokens = []
    for dataset in self.datasets:
        output_vocab += dataset.output_vocab.freq_dict
        special_tokens.extend(dataset.output_vocab.special_tokens)

    self.output_vocab = Vocab(output_vocab, special_tokens)

    log.info('build dataset: {}'.format(name))
    log.info(' trainset size: {}'.format(len(self.trainset)))
    log.info(' testset size: {}'.format(len(self.testset)))
    log.info(' input_vocab size: {}'.format(len(self.input_vocab)))
    log.info(' output_vocab size: {}'.format(len(self.output_vocab)))
def load_movie_sentiment_dataset(config,
                                 dataset_path='../data/dataset/sentiment-analysis-movie-reviews/',
                                 max_sample_size=None):
    output_vocab = Counter()
    input_vocab = Counter()

    def load_data(set_='train'):
        skipped = 0
        samples = []
        for i, line in enumerate(tqdm(
                open('{}/{}.tsv'.format(dataset_path, set_)).readlines())):
            try:
                #print(line.split('\t'))
                pid, sid, line, label = line.strip().split('\t')
                samples.append(
                    Sample(
                        id='{}.{}.{}.{}'.format(pid, sid, i, label),
                        sequence=line,
                        label=label,
                    )
                )
            except KeyboardInterrupt:
                raise KeyboardInterrupt
            except:
                skipped += 1
                log.exception(dataset_path)

        print('skipped {} samples'.format(skipped))
        return samples

    samples_list = load_data()

    samples = defaultdict(list)
    train_samples, test_samples = {}, {}
    for s in samples_list:
        samples[s.label].append(s)

    for label in samples.keys():
        pivot = int(len(samples[label]) * config.CONFIG.split_ratio)
        train_samples[label] = samples[label][:pivot]
        test_samples[label] = samples[label][pivot:]

    samples = flatten_dictvalues(samples)

    output_vocab.update([s.label for s in samples])
    for s in samples:
        input_vocab.update(s.sequence)

    pprint([(k, output_vocab[k]) for k in sorted(output_vocab.keys())])

    return ClasswiseDataset(config.HPCONFIG.dataset_name,
                            (train_samples, test_samples),
                            Vocab(input_vocab, freq_threshold=10),
                            Vocab(output_vocab, special_tokens=[], freq_threshold=0))
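if __name__ == '__main__':
    # Illustrative usage sketch only (not part of the original source). It assumes the
    # project's `config` module is importable and shaped as the loader above expects,
    # i.e. it exposes CONFIG.split_ratio and HPCONFIG.dataset_name.
    import config
    dataset = load_movie_sentiment_dataset(config)
    print('built dataset: {}'.format(config.HPCONFIG.dataset_name))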
def load_filmreviews_data(config,
                          filename=('../dataset/filmreviews/reviews.subword_nmt.csv',
                                    '../dataset/filmreviews/ratings.csv'),
                          max_sample_size=None):
    samples = []
    skipped = 0
    input_vocab = Counter()
    output_vocab = Counter()

    try:
        log.info('processing file: {}'.format(filename))
        text_file, label_file = [open(f).readlines() for f in filename]
        for i, (s, l) in tqdm(enumerate(zip(text_file, label_file)),
                              desc='processing {}'.format(filename)):
            s, l = s.strip(), l.strip()

            # binarize the numeric rating into a sentiment label
            label = float(l.lower())
            if label >= 2.75:
                label = 'positive'
            else:
                label = 'negative'

            samples.append(Sample(i, s.split(), label))

            if max_sample_size and len(samples) > max_sample_size:
                break
    except:
        skipped += 1
        log.exception('failed processing {}'.format(filename))

    print('skipped {} samples'.format(skipped))

    if max_sample_size:
        samples = samples[:max_sample_size]

    log.info('building input_vocabulary...')
    for sample in samples:
        input_vocab.update(sample.sequence)
        output_vocab.update([sample.label])

    pivot = int(len(samples) * config.CONFIG.split_ratio)
    train_samples, test_samples = samples[:pivot], samples[pivot:]
    train_samples = sorted(train_samples, key=lambda x: len(x.sequence), reverse=True)
    test_samples = sorted(test_samples, key=lambda x: len(x.sequence), reverse=True)

    return Dataset(filename,
                   (train_samples, test_samples),
                   Vocab(input_vocab, special_tokens=VOCAB),
                   Vocab(output_vocab))
def load_task6_data(max_sample_size=None):
    task_name, train_samples, train_input_vocab, train_output_vocab = load_task_data(
        task=6, type_='train')
    task_name, test_samples, test_input_vocab, test_output_vocab = load_task_data(
        task=6, type_='test')

    input_vocab = train_input_vocab + test_input_vocab
    output_vocab = train_output_vocab + test_output_vocab

    return Dataset(task_name,
                   (train_samples, test_samples),
                   Vocab(input_vocab, special_tokens=VOCAB),
                   Vocab(output_vocab))
def load_task1_task6_data(max_sample_size=None):
    trainset, testset = [], []
    input_vocab, output_vocab = Counter(), Counter()
    for i in [1, 6]:
        task_name, train_samples, train_input_vocab, train_output_vocab = load_task_data(
            task=i, type_='train')
        task_name, test_samples, test_input_vocab, test_output_vocab = load_task_data(
            task=i, type_='test')

        trainset += train_samples
        testset += test_samples

        input_vocab += train_input_vocab + test_input_vocab
        output_vocab += train_output_vocab + test_output_vocab

    return Dataset(task_name,
                   (trainset, testset),
                   Vocab(input_vocab, special_tokens=VOCAB),
                   Vocab(output_vocab))
def load_data(config, max_sample_size=None, char_level=True):
    dataset = {}
    #filename, samples, vocab = load_tawiki_data(config, char_level=char_level)
    filename, samples, vocab = load_tawiki_bpe_data(config)
    vocab = Vocab(vocab, special_tokens=VOCAB)

    pivot = int(config.CONFIG.split_ratio * len(samples))
    train_samples, test_samples = samples[:pivot], samples[pivot:]

    dataset[filename] = Dataset(filename,
                                (train_samples, test_samples),
                                vocab,
                                vocab)

    return DatasetList('ta-lm', dataset.values())
def load_data(config, max_sample_size=None):
    dataset = {}
    for i in config.HPCONFIG.tasks:
        filename, train_samples, train_input_vocab, train_output_vocab = load_task_data(
            config, task=i, type_='train', max_sample_size=max_sample_size)
        filename, test_samples, test_input_vocab, test_output_vocab = load_task_data(
            config, task=i, type_='test', max_sample_size=max_sample_size)

        task_name = re.search(r'qa\d+_(.*)_.*.txt', filename)
        if task_name:
            task_name = task_name.group(1)

        input_vocab = train_input_vocab + test_input_vocab
        output_vocab = train_output_vocab + test_output_vocab

        dataset[task_name] = Dataset(task_name,
                                     (train_samples, test_samples),
                                     Vocab(input_vocab, special_tokens=VOCAB),
                                     Vocab(output_vocab))

    return DatasetList('babi', dataset.values())
def load_data(config, max_sample_size=None):
    dataset = {}
    filename, train_samples, vocab = load_tawiki_data(config)
    vocab = Vocab(vocab, special_tokens=VOCAB)

    pivot = int(config.CONFIG.split_ratio * len(train_samples))
    dataset[filename] = Dataset(filename,
                                (train_samples[:pivot], train_samples[pivot:]),
                                vocab,
                                vocab)

    return DatasetList('ta-lm', dataset.values())
def load_data(config,
              filename='../dataset/lm_lengthsorted.txt',
              max_sample_size=None):
    samples = []
    skipped = 0

    input_vocab = Counter()
    output_vocab = Counter()

    try:
        log.info('processing file: {}'.format(filename))
        text_file = open(filename).readlines()[:config.HPCONFIG.max_samples]
        for i, l in tqdm(enumerate(text_file),
                         desc='processing {}'.format(filename)):
            sentence = l.strip().split()
            if len(sentence) > 3:
                # last token is the label, the rest is the input sequence
                samples.append(Sample(i, sentence[:-1], sentence[-1]))

            if max_sample_size and len(samples) > max_sample_size:
                break
    except:
        skipped += 1
        log.exception('failed processing {}'.format(filename))

    print('skipped {} samples'.format(skipped))

    samples = sorted(samples, key=lambda x: len(x.sequence), reverse=True)
    if max_sample_size:
        samples = samples[:max_sample_size]

    log.info('building input_vocabulary...')
    for sample in samples:
        input_vocab.update(sample.sequence + [sample.label])

    pivot = int(len(samples) * config.CONFIG.split_ratio)
    train_samples, test_samples = samples[:pivot], samples[pivot:]

    vocab = Vocab(input_vocab, special_tokens=VOCAB)
    return Dataset(filename,
                   (train_samples, test_samples),
                   input_vocab=vocab,
                   output_vocab=vocab)
def load_news_dataset(config,
                      dataset_path='../data/dataset/news/data.csv',
                      max_sample_size=None):
    output_vocab = Counter()

    def load_all_data():
        skipped = 0
        samples = []
        for i, line in enumerate(tqdm(open(dataset_path).readlines())):
            try:
                _, line, label, *__ = line.split('|')
                samples.append(
                    Sample(
                        id='{}.{}'.format(label, i),
                        sequence=line,
                        label=label,
                    )
                )
            except KeyboardInterrupt:
                raise KeyboardInterrupt
            except:
                skipped += 1
                log.exception(dataset_path)

        print('skipped {} samples'.format(skipped))
        return samples

    samples_list = load_all_data()

    samples = defaultdict(list)
    train_samples, test_samples = {}, {}
    for s in samples_list:
        samples[s.label].append(s)

    for label in samples.keys():
        pivot = int(len(samples[label]) * config.CONFIG.split_ratio)
        train_samples[label] = samples[label][:pivot]
        test_samples[label] = samples[label][pivot:]

    samples = flatten_dictvalues(samples)

    output_vocab.update([s.label for s in samples])
    pprint([(k, output_vocab[k]) for k in sorted(output_vocab.keys())])

    return ClasswiseDataset(config.HPCONFIG.dataset_name,
                            (train_samples, test_samples),
                            Vocab(output_vocab, special_tokens=[], freq_threshold=0))
def load_data(config,
              filename='../dataset/lm_lengthsorted.txt',
              max_sample_size=None):
    samples = []
    skipped = 0

    input_vocab = Counter()
    output_vocab = Counter()

    try:
        log.info('processing file: {}'.format(filename))
        text_file = open(filename).readlines()[:config.HPCONFIG.max_samples]
        for i, l in tqdm(enumerate(text_file),
                         desc='processing {}'.format(filename)):
            orig_sentence = l.strip().split()
            if len(orig_sentence) > 20:
                continue

            #print('===========')
            # regroup BPE subwords: pieces ending with '@@' continue the current token
            grouped_token_sentence = []
            token = []
            token.append(orig_sentence[0])
            j = 1
            while j < len(orig_sentence):
                if orig_sentence[j]:
                    token.append(orig_sentence[j])
                    #print(orig_sentence[j])
                    if orig_sentence[j].endswith('@@'):
                        #print('endswith @@')
                        pass
                    else:
                        #print('not endswith @@')
                        if token:
                            grouped_token_sentence.append(token)
                            #print(token)
                            #print(grouped_token_sentence)
                            token = []
                j += 1

            if token:
                grouped_token_sentence.append(token)
            #print(grouped_token_sentence)

            sentence = grouped_token_sentence
            if len(sentence) < 3:
                continue

            for center_word_pos, center_word in enumerate(sentence):
                # positive pairs: contexts inside the window
                for w in range(-config.HPCONFIG.window_size, config.HPCONFIG.window_size + 1):
                    context_word_pos = center_word_pos + w
                    # make sure we do not jump out of the sentence
                    if (context_word_pos < 0
                            or context_word_pos >= len(sentence)
                            or center_word_pos == context_word_pos):
                        continue

                    samples.append(
                        Sample('{}.{}'.format(i, center_word_pos),
                               orig_sentence,
                               sentence,
                               (center_word, sentence[context_word_pos]),
                               True,
                               max([len(t) for t in (center_word, sentence[context_word_pos])])  # will be used in batchop for padding
                        ))

                # negative pairs: contexts outside the window
                for w in range(0, config.HPCONFIG.window_size - 1):
                    context_word_pos = center_word_pos - w
                    # make sure we do not jump out of the sentence
                    if (context_word_pos < 0
                            or context_word_pos >= len(sentence)
                            or center_word_pos == context_word_pos):
                        continue

                    samples.append(
                        Sample('{}.{}'.format(i, center_word_pos),
                               orig_sentence,
                               sentence,
                               (center_word, sentence[context_word_pos]),
                               False,
                               max([len(t) for t in (center_word, sentence[context_word_pos])])))

                for w in range(config.HPCONFIG.window_size + 1, len(sentence)):
                    context_word_pos = center_word_pos + w
                    # make sure we do not jump out of the sentence
                    if (context_word_pos < 0
                            or context_word_pos >= len(sentence)
                            or center_word_pos == context_word_pos):
                        continue

                    samples.append(
                        Sample('{}.{}'.format(i, center_word_pos),
                               orig_sentence,
                               sentence,
                               (center_word, sentence[context_word_pos]),
                               False,
                               max([len(t) for t in (center_word, sentence[context_word_pos])])))

            if max_sample_size and len(samples) > max_sample_size:
                break
    except:
        skipped += 1
        log.exception('{}'.format(l))

    print('skipped {} samples'.format(skipped))

    log.info('building input_vocabulary...')
    for sample in tqdm(samples):
        for tokens in sample.sentence:
            input_vocab.update(tokens)
        output_vocab.update([sample.existence])

    #pivot = int(len(samples) * config.CONFIG.split_ratio)
    #train_samples, test_samples = samples[:pivot], samples[pivot:]
    train_samples, test_samples = samples, []

    input_vocab = Vocab(input_vocab, special_tokens=VOCAB, freq_threshold=50)
    output_vocab = Vocab(output_vocab)
    return Dataset(filename,
                   (train_samples, test_samples),
                   input_vocab=input_vocab,
                   output_vocab=output_vocab)
flush = False
if flush:
    log.info('flushing...')
    ids = tuple((Sample._fields.index('squad_id'), ))
    dataset, vocabulary = load_squad_data('dataset/train-v1.1.json', ids)
    pickle.dump([dataset, dict(vocabulary)], open('train.squad', 'wb'))
else:
    dataset, _vocabulary = pickle.load(open('train.squad', 'rb'))
    vocabulary = defaultdict(int)
    vocabulary.update(_vocabulary)

log.info('dataset size: {}'.format(len(dataset)))
log.info('dataset[:10]: {}'.format(pformat(dataset[0])))
log.info('vocabulary: {}'.format(len(vocabulary)))

VOCAB = Vocab(vocabulary, VOCAB)

if 'train' in sys.argv:
    labelled_samples = [d for d in dataset[:10000] if len(d.a_positions) < 2]  #[:100]
    pivot = int(Config().split_ratio * len(labelled_samples))
    random.shuffle(labelled_samples)
    train_set, test_set = labelled_samples[:pivot], labelled_samples[pivot:]

    train_set = sorted(train_set, key=lambda x: -len(x.context))
    test_set = sorted(test_set, key=lambda x: -len(x.context))

    exp_image = experiment(VOCAB, dataset, datapoints=[train_set, test_set])
vocabulary.update(_vocabulary)
labels.update(_labels)

log.info('trainset size: {}'.format(len(trainset)))
log.info('trainset[:10]: {}'.format(pformat(trainset[0])))
pprint(labels)

"""
log.info('vocabulary: {}'.format(
    pformat(
        sorted(
            vocabulary.items(), key=lambda x: x[1], reverse=True)
    )))
"""
log.info(pformat(labels))

VOCAB = Vocab(vocabulary, VOCAB)
LABELS = Vocab(labels, tokens=LABELS)
pprint(LABELS.index2word)

try:
    model = BiLSTMModel(config, 'macnet', len(VOCAB), len(LABELS))
    if config.CONFIG.cuda:
        model = model.cuda()

    model.load_state_dict(
        torch.load('{}/weights/{}.{}'.format(ROOT_DIR, SELF_NAME, 'pth')))
    log.info('loaded the old image for the model')
except:
    log.exception('failed to load the model')

model.eval()
print('**** the model', model, model.training)
def load_data_for_skipgram(config,
                           filename='../dataset/lm_lengthsorted.txt',
                           max_sample_size=None):
    samples = []
    skipped = 0

    input_vocab = Counter()
    output_vocab = Counter()

    try:
        log.info('processing file: {}'.format(filename))
        text_file = open(filename).readlines()[:config.HPCONFIG.max_samples]
        for i, l in tqdm(enumerate(text_file),
                         desc='processing {}'.format(filename)):
            sentence = l.strip().split()

            window_size = 2
            idx_pairs = []
            # for each sentence
            # NOTE: assumes a `word2idx` mapping is available in the enclosing scope
            indices = [word2idx[word] for word in sentence]
            # for each word, treated as center word
            for center_word_pos in range(len(indices)):
                # for each window position
                for w in range(-window_size, window_size + 1):
                    context_word_pos = center_word_pos + w
                    # make sure we do not jump out of the sentence
                    if (context_word_pos < 0
                            or context_word_pos >= len(indices)
                            or center_word_pos == context_word_pos):
                        continue
                    context_word_idx = indices[context_word_pos]
                    idx_pairs.append((indices[center_word_pos], context_word_idx))

            if len(sentence) > 3:
                samples.append(Sample(i, sentence[:-1], sentence[-1]))

            if max_sample_size and len(samples) > max_sample_size:
                break
    except:
        skipped += 1
        log.exception('failed processing {}'.format(filename))

    print('skipped {} samples'.format(skipped))

    samples = sorted(samples, key=lambda x: len(x.sequence), reverse=True)
    if max_sample_size:
        samples = samples[:max_sample_size]

    log.info('building input_vocabulary...')
    for sample in samples:
        input_vocab.update(sample.sequence + [sample.label])

    pivot = int(len(samples) * config.CONFIG.split_ratio)
    train_samples, test_samples = samples[:pivot], samples[pivot:]

    vocab = Vocab(input_vocab, special_tokens=VOCAB)
    return Dataset(filename,
                   (train_samples, test_samples),
                   input_vocab=vocab,
                   output_vocab=vocab)
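if __name__ == '__main__':
    # Illustrative sketch only (not part of the original source): demonstrates the
    # (center, context) pairing that load_data_for_skipgram builds above, with a fixed
    # window size standing in for config.HPCONFIG.window_size.
    def window_pairs(tokens, window_size=2):
        pairs = []
        for center_pos, center in enumerate(tokens):
            for offset in range(-window_size, window_size + 1):
                context_pos = center_pos + offset
                # skip positions outside the sentence or landing on the center word
                if context_pos < 0 or context_pos >= len(tokens) or context_pos == center_pos:
                    continue
                pairs.append((center, tokens[context_pos]))
        return pairs

    print(window_pairs('the cat sat on the mat'.split()))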
def load_data(config, dirname='../dataset/', max_sample_size=None):
    samples = []
    skipped = 0

    input_vocab = Counter()
    gender_vocab = Counter()

    #########################################################
    # Read names
    #########################################################
    def read_data(filename='names.csv'):
        data = open(filename).readlines()
        samples = []
        for datum in data:
            name = datum.split(',')[1]
            name = ''.join(name.split())
            samples.append(remove_punct_symbols(name))

        return samples

    def read_dirs(dirs=['boy', 'girl']):
        samples = []
        for d in dirs:
            for filename in os.listdir('{}/{}'.format(dirname, d)):
                s = read_data('{}/{}/{}'.format(dirname, d, filename))
                s = [(d, n) for n in s]
                samples.extend(s)

        return list(set(samples))

    raw_samples = read_dirs()
    log.info('read {} names'.format(len(raw_samples)))

    #########################################################
    # Read tamil words
    #########################################################
    def read_words(filename=config.HPCONFIG.lm_dataset_path):
        samples = []
        for line in tqdm(
                open(filename).readlines()[:config.HPCONFIG.lm_samples_count],
                'reading lm file for words'):
            s = line.split()
            s = [('neutral', n) for n in s]
            samples.extend(s)

        return list(set(samples))

    pretrain_samples = read_words()

    #########################################################
    # build vocab
    #########################################################
    all_samples = raw_samples + pretrain_samples
    log.info('building input_vocabulary...')
    for gender, name in tqdm(all_samples, desc='building vocab'):
        name = remove_punct_symbols(name)
        name = tamil.utf8.get_letters(name.strip())
        if len(name):
            input_vocab.update(name)
            gender_vocab.update([gender])

    vocab = Vocab(input_vocab, special_tokens=VOCAB, freq_threshold=50)

    print(gender_vocab)
    gender_vocab = Vocab(gender_vocab, special_tokens=[])

    if config.CONFIG.write_vocab_to_file:
        vocab.write_to_file(config.ROOT_DIR + '/input_vocab.csv')
        gender_vocab.write_to_file(config.ROOT_DIR + '/gender_vocab.csv')

    def build_samples(raw_samples):
        nonlocal skipped
        samples = []
        for i, (gender, name) in enumerate(tqdm(raw_samples, desc='processing names')):
            try:
                #name = remove_punct_symbols(name)
                name = tamil.utf8.get_letters(name.strip())
                if len(name) < 2:
                    continue

                log.debug('===')
                log.debug(pformat(name))

                # build templates that expose only two letters of the name at a time
                for a, b in zip(range(len(name)), range(1, len(name) - 1)):
                    template = list(NULL_CHAR * len(name))
                    template[a] = name[a]
                    template[b] = name[b]
                    samples.append(
                        Sample('{}.{}'.format(gender, i), gender, template, name))

                if max_sample_size and len(samples) > max_sample_size:
                    break
            except:
                skipped += 1
                log.exception('{}'.format(name))

        return samples

    pretrain_samples = build_samples(pretrain_samples)
    samples = build_samples(raw_samples)

    print('skipped {} samples'.format(skipped))

    pivot = int(len(samples) * config.CONFIG.split_ratio)
    train_samples, test_samples = samples[:pivot], samples[pivot:]
    #train_samples, test_samples = samples, []

    #train_samples = sorted(train_samples, key=lambda x: len(x.sequence), reverse=True)
    return NameDataset('names',
                       (train_samples, test_samples),
                       pretrain_samples=pretrain_samples,
                       input_vocab=vocab,
                       gender_vocab=gender_vocab)
    dataset, vocabulary = load_squad_data('dataset/train-v1.1.json', ids)
    pickle.dump([dataset, dict(vocabulary)], open('train.squad', 'wb'))
else:
    dataset, _vocabulary = pickle.load(open('train.squad', 'rb'))
    vocabulary = defaultdict(int)
    vocabulary.update(_vocabulary)

log.info('dataset size: {}'.format(len(dataset)))
log.info('dataset[:10]: {}'.format(pformat(dataset[0])))
log.info('vocabulary: {}'.format(
    pformat(
        sorted(
            vocabulary.items(), key=lambda x: x[1], reverse=True)
    )))

VOCAB = Vocab(vocabulary, VOCAB, freq_threshold=100)
pprint(VOCAB.word2index)

if 'train' in sys.argv:
    labelled_samples = [d for d in dataset if len(d.a) > 0]  #[:100]
    pivot = int(Config().split_ratio * len(labelled_samples))
    random.shuffle(labelled_samples)
    train_set, test_set = labelled_samples[:pivot], labelled_samples[pivot:]

    train_set = sorted(train_set, key=lambda x: -len(x.a + x.story))
    test_set = sorted(test_set, key=lambda x: -len(x.a + x.story))

    exp_image = experiment(VOCAB, dataset, datapoints=[train_set, test_set])

if 'predict' in sys.argv:
    model = BiLSTMDecoderModel(Config(), len(VOCAB), len(LABELS))
    if Config().cuda:
        model = model.cuda()

    model.load_state_dict(torch.load('{}.{}'.format(SELF_NAME, 'pth')))
def load_data(config,
              filename='../dataset/lm_lengthsorted.txt',
              max_sample_size=None):
    samples = []
    skipped = 0

    input_vocab = Counter()
    output_vocab = Counter()
    bloom_filter = Counter()
    try:
        log.info('processing file: {}'.format(filename))
        text_file = open(filename).readlines()

        log.info('building input_vocabulary...')
        sentences = set()
        for i, l in tqdm(enumerate(text_file[:config.HPCONFIG.max_samples]),
                         desc='processing {}'.format(filename)):
            sentence = remove_punct_symbols(l)
            sentence = sentence.strip().split()
            if len(sentence):
                input_vocab.update(sentence)
                sentences.add(tuple(sentence))

        # scale the frequency threshold by the fraction of the corpus actually read
        freq_threshold = (config.HPCONFIG.freq_threshold
                          * (float(config.HPCONFIG.max_samples) / len(text_file)))
        log.info('freq_threshold: {}'.format(freq_threshold))
        vocab = Vocab(input_vocab,
                      special_tokens=VOCAB,
                      freq_threshold=int(freq_threshold))

        if config.CONFIG.write_vocab_to_file:
            vocab.write_to_file(config.ROOT_DIR + '/vocab.csv')

        for i, sentence in tqdm(enumerate(sentences),
                                desc='processing sentences'):
            if len(sentence) < 2:
                continue

            unk_ratio = float(count_UNKS(sentence, vocab)) / len(sentence)

            log.debug('===')
            log.debug(pformat(sentence))
            sentence = [t if vocab[t] != vocab['UNK'] else 'UNK' for t in sentence]
            log.debug(pformat(sentence))

            if unk_ratio > 0.7:
                log.debug('unk ratio is heavy: {}'.format(unk_ratio))
                continue

            for center_word_pos, center_word in enumerate(sentence):
                for w in range(-config.HPCONFIG.window_size,
                               config.HPCONFIG.window_size + 1):
                    context_word_pos = center_word_pos + w
                    # make sure we do not jump out of the sentence
                    if (context_word_pos < 0
                            or context_word_pos >= len(sentence)
                            or center_word_pos == context_word_pos):
                        continue

                    pair = (center_word, sentence[context_word_pos])
                    if pair[0] != 'UNK' and pair[1] != 'UNK':
                        # note: this membership check is currently a no-op; the counter
                        # only tracks pair frequencies for the optional dump below
                        if pair not in bloom_filter:
                            pass
                        samples.append(
                            Sample('{}.{}'.format(i, center_word_pos),
                                   #sentence,
                                   center_word,
                                   sentence[context_word_pos]))
                        bloom_filter.update([pair])

            if max_sample_size and len(samples) > max_sample_size:
                break
    except:
        skipped += 1
        log.exception('failed processing {}'.format(filename))

    print('skipped {} samples'.format(skipped))

    if config.CONFIG.dump_bloom_filter:
        with open('word_pair.csv', 'w') as F:
            for k, v in bloom_filter.items():
                F.write('|'.join(list(k) + [str(v)]) + '\n')

    #pivot = int(len(samples) * config.CONFIG.split_ratio)
    #train_samples, test_samples = samples[:pivot], samples[pivot:]
    train_samples, test_samples = samples, []

    return Dataset(filename,
                   (train_samples, test_samples),
                   input_vocab=vocab,
                   output_vocab=vocab)