import os

from collections import Counter
from pprint import pformat

import tamil                      # open-tamil, for UTF-8 Tamil letter handling
from tqdm import tqdm

# NOTE: project-local names (log, Vocab, VOCAB, NULL_CHAR, Sample, Dataset,
# NameDataset, remove_punct_symbols, count_UNKS, config) are assumed to be
# defined or imported elsewhere in this repo.


# Loader for the gendered-names dataset: reads boy/girl name files plus a
# Tamil LM word list, builds letter and gender vocabularies, and turns each
# name into masked two-letter templates.
def load_data(config, dirname='../dataset/', max_sample_size=None):
    samples = []
    skipped = 0

    input_vocab = Counter()
    gender_vocab = Counter()

    #########################################################
    # Read names
    #########################################################
    def read_data(filename='names.csv'):
        data = open(filename).readlines()
        samples = []
        for datum in data:
            name = datum.split(',')[1]
            name = ''.join(name.split())
            samples.append(remove_punct_symbols(name))
        return samples

    def read_dirs(dirs=['boy', 'girl']):
        samples = []
        for d in dirs:
            for filename in os.listdir('{}/{}'.format(dirname, d)):
                s = read_data('{}/{}/{}'.format(dirname, d, filename))
                s = [(d, n) for n in s]
                samples.extend(s)
        return list(set(samples))

    raw_samples = read_dirs()
    log.info('read {} names'.format(len(raw_samples)))

    #########################################################
    # Read tamil words
    #########################################################
    def read_words(filename=config.HPCONFIG.lm_dataset_path):
        samples = []
        for line in tqdm(
                open(filename).readlines()[:config.HPCONFIG.lm_samples_count],
                'reading lm file for words'):
            s = line.split()
            s = [('neutral', n) for n in s]
            samples.extend(s)
        return list(set(samples))

    pretrain_samples = read_words()

    #########################################################
    # build vocab
    #########################################################
    all_samples = raw_samples + pretrain_samples
    log.info('building input_vocabulary...')
    for gender, name in tqdm(all_samples, desc='building vocab'):
        name = remove_punct_symbols(name)
        name = tamil.utf8.get_letters(name.strip())
        if len(name):
            input_vocab.update(name)
            gender_vocab.update([gender])

    vocab = Vocab(input_vocab, special_tokens=VOCAB, freq_threshold=50)
    print(gender_vocab)
    gender_vocab = Vocab(gender_vocab, special_tokens=[])

    if config.CONFIG.write_vocab_to_file:
        vocab.write_to_file(config.ROOT_DIR + '/input_vocab.csv')
        gender_vocab.write_to_file(config.ROOT_DIR + '/gender_vocab.csv')

    def build_samples(raw_samples):
        # 'skipped' is the counter defined in the enclosing load_data()
        nonlocal skipped
        samples = []
        for i, (gender, name) in enumerate(tqdm(raw_samples,
                                                desc='processing names')):
            try:
                #name = remove_punct_symbols(name)
                name = tamil.utf8.get_letters(name.strip())
                if len(name) < 2:
                    continue

                log.debug('===')
                log.debug(pformat(name))

                # Walk adjacent letter positions (a, a+1); the zip with
                # range(1, len(name) - 1) stops one pair short of the end.
                for a, b in zip(range(len(name)), range(1, len(name) - 1)):
                    # Start from an all-NULL_CHAR template of the same length
                    # and expose only the two letters at positions a and b.
                    template = list(NULL_CHAR * len(name))
                    template[a] = name[a]
                    template[b] = name[b]
                    samples.append(
                        Sample('{}.{}'.format(gender, i),
                               gender, template, name))

                if max_sample_size and len(samples) > max_sample_size:
                    break
            except:
                skipped += 1
                log.exception('{}'.format(name))

        return samples

    pretrain_samples = build_samples(pretrain_samples)
    samples = build_samples(raw_samples)
    print('skipped {} samples'.format(skipped))

    pivot = int(len(samples) * config.CONFIG.split_ratio)
    train_samples, test_samples = samples[:pivot], samples[pivot:]
    #train_samples, test_samples = samples, []
    #train_samples = sorted(train_samples, key=lambda x: len(x.sequence), reverse=True)

    return NameDataset('names',
                       (train_samples, test_samples),
                       pretrain_samples=pretrain_samples,
                       input_vocab=vocab,
                       gender_vocab=gender_vocab)
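# Illustration only (not part of the loader): a minimal sketch of the
# two-letter template scheme used in build_samples() above, assuming
# NULL_CHAR is a single placeholder character ('_' here) and using plain
# ASCII letters instead of tamil.utf8.get_letters(). Each sample keeps
# exactly two adjacent letters of the name visible and masks the rest.
def _demo_templates(name, null_char='_'):
    letters = list(name)
    templates = []
    for a, b in zip(range(len(letters)), range(1, len(letters) - 1)):
        template = list(null_char * len(letters))
        template[a] = letters[a]
        template[b] = letters[b]
        templates.append(''.join(template))
    return templates

# >>> _demo_templates('kavya')
# ['ka___', '_av__', '__vy_']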
# Loader for the language-model corpus: reads sentences, builds a word
# vocabulary, and emits (center word, context word) pairs from a sliding
# window, skip-gram style.
def load_data(config, filename='../dataset/lm_lengthsorted.txt', max_sample_size=None):
    samples = []
    skipped = 0

    input_vocab = Counter()
    output_vocab = Counter()
    bloom_filter = Counter()
    try:
        log.info('processing file: {}'.format(filename))
        text_file = open(filename).readlines()

        log.info('building input_vocabulary...')
        sentences = set()
        for i, l in tqdm(enumerate(text_file[:config.HPCONFIG.max_samples]),
                         desc='processing {}'.format(filename)):
            sentence = remove_punct_symbols(l)
            sentence = sentence.strip().split()
            if len(sentence):
                input_vocab.update(sentence)
                sentences.add(tuple(sentence))

        # Scale the configured frequency threshold by the fraction of the
        # corpus actually read.
        freq_threshold = (config.HPCONFIG.freq_threshold
                          * (float(config.HPCONFIG.max_samples) / len(text_file)))
        log.info('freq_threshold: {}'.format(freq_threshold))
        vocab = Vocab(input_vocab,
                      special_tokens=VOCAB,
                      freq_threshold=int(freq_threshold))

        if config.CONFIG.write_vocab_to_file:
            vocab.write_to_file(config.ROOT_DIR + '/vocab.csv')

        for i, sentence in tqdm(enumerate(sentences), desc='processing sentences'):
            if len(sentence) < 2:
                continue

            unk_ratio = float(count_UNKS(sentence, vocab)) / len(sentence)

            log.debug('===')
            log.debug(pformat(sentence))

            # Replace out-of-vocabulary words with the literal 'UNK' token.
            sentence = [word if vocab[word] != vocab['UNK'] else 'UNK'
                        for word in sentence]
            log.debug(pformat(sentence))

            if unk_ratio > 0.7:
                log.debug('unk ratio is heavy: {}'.format(unk_ratio))
                continue

            for center_word_pos, center_word in enumerate(sentence):
                for w in range(-config.HPCONFIG.window_size,
                               config.HPCONFIG.window_size + 1):
                    context_word_pos = center_word_pos + w

                    # make sure we do not jump outside the sentence
                    if (context_word_pos < 0
                            or context_word_pos >= len(sentence)
                            or center_word_pos == context_word_pos):
                        continue

                    pair = (center_word, sentence[context_word_pos])
                    if pair[0] != 'UNK' and pair[1] != 'UNK':
                        if pair not in bloom_filter:
                            # placeholder: deduplication is not enforced;
                            # every pair is kept and counted below
                            pass
                        samples.append(
                            Sample('{}.{}'.format(i, center_word_pos),
                                   #sentence,
                                   center_word,
                                   sentence[context_word_pos]))
                        bloom_filter.update([pair])

            if max_sample_size and len(samples) > max_sample_size:
                break

    except:
        skipped += 1
        log.exception('{}'.format(l))

    print('skipped {} samples'.format(skipped))

    if config.CONFIG.dump_bloom_filter:
        with open('word_pair.csv', 'w') as F:
            for k, v in bloom_filter.items():
                F.write('|'.join(list(k) + [str(v)]) + '\n')

    #pivot = int(len(samples) * config.CONFIG.split_ratio)
    #train_samples, test_samples = samples[:pivot], samples[pivot:]
    train_samples, test_samples = samples, []

    return Dataset(filename,
                   (train_samples, test_samples),
                   input_vocab=vocab,
                   output_vocab=vocab)
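# Illustration only (not part of the loader): a minimal sketch of the
# sliding-window pair extraction performed above, for a toy sentence and a
# hypothetical window_size of 2. Every (center, context) pair inside the
# window becomes one training sample.
def _demo_window_pairs(sentence, window_size=2):
    pairs = []
    for center_pos, center in enumerate(sentence):
        for offset in range(-window_size, window_size + 1):
            context_pos = center_pos + offset
            if (context_pos < 0
                    or context_pos >= len(sentence)
                    or context_pos == center_pos):
                continue
            pairs.append((center, sentence[context_pos]))
    return pairs

# >>> _demo_window_pairs(['the', 'little', 'brown', 'fox'])
# [('the', 'little'), ('the', 'brown'),
#  ('little', 'the'), ('little', 'brown'), ('little', 'fox'),
#  ('brown', 'the'), ('brown', 'little'), ('brown', 'fox'),
#  ('fox', 'little'), ('fox', 'brown')]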