Example #1
    def __init__(self, name, datasets, portion_percent=1.0, sort_key=None):
        super().__init__(name, datasets, portion_percent, sort_key)

        input_vocab = Counter()
        special_tokens = []
        for dataset in self.datasets:
            input_vocab += dataset.input_vocab.freq_dict

            for token in dataset.input_vocab.special_tokens:
                if token not in special_tokens:
                    special_tokens.append(token)

        self.input_vocab = Vocab(input_vocab, special_tokens)

        output_vocab = Counter()
        special_tokens = []
        for dataset in self.datasets:
            output_vocab += dataset.output_vocab.freq_dict
            special_tokens.extend(dataset.output_vocab.special_tokens)

        self.output_vocab = Vocab(output_vocab, special_tokens)

        log.info('build dataset: {}'.format(name))
        log.info(' trainset size: {}'.format(len(self.trainset)))
        log.info(' testset size: {}'.format(len(self.testset)))
        log.info(' input_vocab size: {}'.format(len(self.input_vocab)))
        log.info(' output_vocab size: {}'.format(len(self.output_vocab)))
Example #2
def load_movie_sentiment_dataset(config,
                                 dataset_path='../data/dataset/sentiment-analysis-movie-reviews/',
                                 max_sample_size=None):
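    # Loads '<dataset_path>/train.tsv' with tab-separated (pid, sid, text, label)
    # fields, splits the samples of each label into train/test portions, and
    # builds the input and label vocabularies.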

    output_vocab = Counter()
    input_vocab = Counter()
    
    def load_data(set_='train'):
        skipped = 0
        samples = []

        for i, line in enumerate(tqdm(
                open(
                    '{}/{}.tsv'.format(dataset_path, set_)
                ).readlines())):
            
            try:
                #print(line.split('\t'))
                pid, sid, line, label = line.strip().split('\t')
                samples.append(
                    Sample(
                        id       = '{}.{}.{}.{}'.format(pid, sid, i, label),
                        sequence = line,
                        label    = label,
                    )
                )

            except KeyboardInterrupt:
                raise KeyboardInterrupt
            except:
                skipped += 1
                log.exception(dataset_path)
                            
        print('skipped {} samples'.format(skipped))
        return samples

    samples_list = load_data()
    samples = defaultdict(list)
    train_samples, test_samples = {}, {}
    
    for s in samples_list:
        samples[s.label].append(s)

    for label in samples.keys():
        pivot = int( len(samples[label]) * config.CONFIG.split_ratio )
        train_samples [label] = samples[label][:pivot]
        test_samples  [label] = samples[label][pivot:]

    samples = flatten_dictvalues(samples)
        
    output_vocab.update([s.label for s in samples])

    for s in samples:
        input_vocab.update(s.sequence)  # sequence is a raw string, so this counts characters

    pprint([(k, output_vocab[k]) for k in sorted(output_vocab.keys())])

    return ClasswiseDataset(config.HPCONFIG.dataset_name,
                            (train_samples, test_samples),
                            Vocab(input_vocab, freq_threshold=10),
                            Vocab(output_vocab, special_tokens=[], freq_threshold=0))
Example #3
def load_filmreviews_data(config,
                          filename=('../dataset/filmreviews/reviews.subword_nmt.csv',
                                    '../dataset/filmreviews/ratings.csv'),
                          max_sample_size=None):
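    # Reads parallel files of subword-tokenized review text and numeric ratings,
    # maps each rating to 'positive' (>= 2.75) or 'negative', and returns a
    # length-sorted train/test split with word and label vocabularies.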
    
    samples = []
    skipped = 0

    input_vocab = Counter()
    output_vocab = Counter()
    
    try:
        log.info('processing file: {}'.format(filename))
        text_file, label_file = [open(f).readlines() for f in filename]
        for i, (s, l) in tqdm(enumerate(zip(text_file, label_file)),
                            desc='processing {}'.format(filename)):

            s, l = s.strip(), l.strip()
            label = float(l)
            if label >= 2.75:
                label = 'positive'
            else:
                label = 'negative'
            samples.append(
                Sample(i,
                       s.strip().split(),
                       label
                )
            )
            
            
            if max_sample_size and len(samples) > max_sample_size:
                break

    except:
        skipped += 1
        log.exception('failed while processing {}'.format(filename))

    print('skipped {} samples'.format(skipped))
    

    if max_sample_size:
        samples = samples[:max_sample_size]

    log.info('building input_vocabulary...')
    for sample in samples:
        input_vocab.update(sample.sequence)            
        output_vocab.update([sample.label])

    pivot = int(len(samples) * config.CONFIG.split_ratio)
    train_samples, test_samples = samples[:pivot], samples[pivot:]
    train_samples = sorted(train_samples, key=lambda x: len(x.sequence), reverse=True)
    test_samples  = sorted(test_samples, key=lambda x: len(x.sequence), reverse=True)
    return Dataset(filename,
                   (train_samples, test_samples),
                   Vocab(input_vocab, special_tokens=VOCAB),
                   Vocab(output_vocab))
Example #4
def load_task6_data(max_sample_size=None):
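    # Loads the task 6 train and test splits via load_task_data and merges their
    # input and output vocabularies into a single Dataset.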
    task_name, train_samples, train_input_vocab, train_output_vocab = load_task_data(
        task=6, type_='train')
    task_name, test_samples, test_input_vocab, test_output_vocab = load_task_data(
        task=6, type_='test')

    input_vocab = train_input_vocab + test_input_vocab
    output_vocab = train_output_vocab + test_output_vocab

    return Dataset(task_name, (train_samples, test_samples),
                   Vocab(input_vocab, special_tokens=VOCAB),
                   Vocab(output_vocab))
Example #5
def load_task1_task6_data(max_sample_size=None):
    trainset, testset = [], []
    input_vocab, output_vocab = Counter(), Counter()
    for i in [1, 6]:
        task_name, train_samples, train_input_vocab, train_output_vocab = load_task_data(
            task=i, type_='train')
        task_name, test_samples, test_input_vocab, test_output_vocab = load_task_data(
            task=i, type_='test')

        trainset += train_samples
        testset += test_samples
        input_vocab += train_input_vocab + test_input_vocab
        output_vocab += train_output_vocab + test_output_vocab

    # note: task_name here is the name returned for the last task in the loop (task 6)
    return Dataset(task_name, (trainset, testset),
                   Vocab(input_vocab, special_tokens=VOCAB),
                   Vocab(output_vocab))
Example #6
def load_data(config, max_sample_size=None, char_level=True):
    dataset = {}
    #filename, samples, vocab = load_tawiki_data(config, char_level=char_level)
    filename, samples, vocab = load_tawiki_bpe_data(config)
    vocab = Vocab(vocab, special_tokens=VOCAB)
    pivot = int( config.CONFIG.split_ratio * len(samples))
    train_samples, test_samples = samples[:pivot], samples[pivot:]
    dataset[filename] = Dataset(filename, (train_samples, test_samples), vocab, vocab)

    return DatasetList('ta-lm', dataset.values())
Example #7
def load_data(config, max_sample_size=None):
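    # Builds one Dataset per bAbI task listed in config.HPCONFIG.tasks, extracting
    # each task name from its 'qa<N>_<name>_<split>.txt' filename.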
    dataset = {}
    for i in config.HPCONFIG.tasks:
        filename, train_samples, train_input_vocab, train_output_vocab = load_task_data(
            config, task=i, type_='train', max_sample_size=max_sample_size)
        filename, test_samples, test_input_vocab, test_output_vocab = load_task_data(
            config, task=i, type_='test', max_sample_size=max_sample_size)
        task_name = re.search(r'qa\d+_(.*)_.*\.txt', filename)

        if task_name:
            task_name = task_name.group(1)

        input_vocab = train_input_vocab + test_input_vocab
        output_vocab = train_output_vocab + test_output_vocab
        dataset[task_name] = Dataset(task_name, (train_samples, test_samples),
                                     Vocab(input_vocab, special_tokens=VOCAB),
                                     Vocab(output_vocab))

    return DatasetList('babi', dataset.values())
Example #8
def load_data(config, max_sample_size=None):
    dataset = {}
    filename, train_samples, vocab = load_tawiki_data(config)
    vocab = Vocab(vocab, special_tokens=VOCAB)

    pivot = int(config.CONFIG.split_ratio * len(train_samples))
    
    dataset[filename] = Dataset(filename, (train_samples[:pivot], train_samples[pivot:]), vocab, vocab)

    return DatasetList('ta-lm', dataset.values())
Example #9
def load_data(config,
               filename='../dataset/lm_lengthsorted.txt',
               max_sample_size=None):
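    # Next-word prediction data: each corpus line with more than three tokens
    # becomes a Sample whose sequence is every token but the last and whose label
    # is the last token; a single shared vocabulary serves as input and output.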
    
    samples = []
    skipped = 0

    input_vocab = Counter()
    output_vocab = Counter()
    
    try:
        log.info('processing file: {}'.format(filename))
        text_file = open(filename).readlines()[:config.HPCONFIG.max_samples]
        for i, l in tqdm(enumerate(text_file),
                            desc='processing {}'.format(filename)):

            sentence = l.strip().split()

            if len(sentence) > 3:
                samples.append(
                    Sample(i,
                           sentence[:-1],
                           sentence[-1]
                    )
                )
            
            if max_sample_size and len(samples) > max_sample_size:
                break

    except:
        skipped += 1
        log.exception('failed while processing {}'.format(filename))

    print('skipped {} samples'.format(skipped))
    
    samples = sorted(samples, key=lambda x: len(x.sequence), reverse=True)
    if max_sample_size:
        samples = samples[:max_sample_size]

    log.info('building input_vocabulary...')
    for sample in samples:
        input_vocab.update(sample.sequence + [sample.label])            

    pivot = int(len(samples) * config.CONFIG.split_ratio)
    train_samples, test_samples = samples[:pivot], samples[pivot:]
    vocab = Vocab(input_vocab, special_tokens=VOCAB)
    return Dataset(filename,
                   (train_samples, test_samples),
                   input_vocab = vocab,
                   output_vocab = vocab)
Example #10
def load_news_dataset(config,
                      dataset_path='../data/dataset/news/data.csv',
                      max_sample_size=None):
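    # Reads a '|'-separated news file of (id, text, label, ...) records and splits
    # the samples of each label into train/test portions; unlike
    # load_movie_sentiment_dataset, only a label vocabulary is built here.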

    output_vocab = Counter()
    
    def load_all_data():
        skipped = 0
        samples = []

        for i, line in enumerate(tqdm(open(dataset_path).readlines())):
            try:
                _, line, label, *__ = line.split('|')
                samples.append(
                    Sample(
                        id       = '{}.{}'.format(label, i),
                        sequence = line,
                        label    = label,
                    )
                )

            except KeyboardInterrupt:
                raise KeyboardInterrupt
            except:
                skipped += 1
                log.exception(dataset_path)
                            
        print('skipped {} samples'.format(skipped))
        return samples

    samples_list = load_all_data()
    samples = defaultdict(list)
    train_samples, test_samples = {}, {}
    
    for s in samples_list:
        samples[s.label].append(s)

    for label in samples.keys():
        pivot = int( len(samples[label]) * config.CONFIG.split_ratio )
        train_samples [label] = samples[label][:pivot]
        test_samples  [label] = samples[label][pivot:]

    samples = flatten_dictvalues(samples)
        
    output_vocab.update( [s.label for s in samples]  )
    pprint([(k, output_vocab[k]) for k in sorted(output_vocab.keys())])

    return ClasswiseDataset(config.HPCONFIG.dataset_name,
                   (train_samples, test_samples),
                   Vocab(output_vocab, special_tokens=[], freq_threshold=0))
Example #11
def load_data(config,
              filename='../dataset/lm_lengthsorted.txt',
              max_sample_size=None):
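    # Builds word2vec-style (center, context) Samples from a subword-nmt tokenized
    # corpus: '@@' pieces are regrouped into whole tokens, and each Sample's
    # 'existence' field records whether the context token came from inside the
    # context window (True) or from one of the additional sampling loops (False).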

    samples = []
    skipped = 0

    input_vocab = Counter()
    output_vocab = Counter()

    try:
        log.info('processing file: {}'.format(filename))
        text_file = open(filename).readlines()[:config.HPCONFIG.max_samples]
        for i, l in tqdm(enumerate(text_file),
                         desc='processing {}'.format(filename)):

            orig_sentence = l.strip().split()
            if len(orig_sentence) > 20:
                continue

            # regroup subword-nmt pieces: a piece ending in '@@' is continued by
            # the next piece; a piece without '@@' closes the current group
            grouped_token_sentence = []
            token = []
            for piece in orig_sentence:
                if piece:
                    token.append(piece)
                if not piece.endswith('@@'):
                    if token:
                        grouped_token_sentence.append(token)
                    token = []

            if token:
                grouped_token_sentence.append(token)

            sentence = grouped_token_sentence
            if len(sentence) < 3:
                continue

            for center_word_pos, center_word in enumerate(sentence):
                for w in range(-config.HPCONFIG.window_size,
                               config.HPCONFIG.window_size + 1):
                    context_word_pos = center_word_pos + w
                    # make sure we stay inside the sentence
                    if (context_word_pos < 0
                            or context_word_pos >= len(sentence)
                            or center_word_pos == context_word_pos):
                        continue

                    samples.append(
                        Sample(
                            '{}.{}'.format(i, center_word_pos),
                            orig_sentence,
                            sentence,
                            (center_word, sentence[context_word_pos]),
                            True,
                            max([
                                len(t) for t in (center_word,
                                                 sentence[context_word_pos])
                            ])  #will be used in batchop for padding
                        ))

                for w in range(0, config.HPCONFIG.window_size - 1):
                    context_word_pos = center_word_pos - w
                    # make sure we stay inside the sentence
                    if (context_word_pos < 0
                            or context_word_pos >= len(sentence)
                            or center_word_pos == context_word_pos):
                        continue

                    samples.append(
                        Sample(
                            '{}.{}'.format(i, center_word_pos), orig_sentence,
                            sentence,
                            (center_word, sentence[context_word_pos]), False,
                            max([
                                len(t) for t in (center_word,
                                                 sentence[context_word_pos])
                            ])))

                for w in range(config.HPCONFIG.window_size + 1, len(sentence)):
                    context_word_pos = center_word_pos + w
                    # make sure we stay inside the sentence
                    if (context_word_pos < 0
                            or context_word_pos >= len(sentence)
                            or center_word_pos == context_word_pos):
                        continue

                    samples.append(
                        Sample(
                            '{}.{}'.format(i, center_word_pos), orig_sentence,
                            sentence,
                            (center_word, sentence[context_word_pos]), False,
                            max([
                                len(t) for t in (center_word,
                                                 sentence[context_word_pos])
                            ])))

            if max_sample_size and len(samples) > max_sample_size:
                break

    except:
        skipped += 1
        log.exception('{}'.format(l))

    print('skipped {} samples'.format(skipped))

    log.info('building input_vocabulary...')
    for sample in tqdm(samples):
        for tokens in sample.sentence:
            input_vocab.update(tokens)

        output_vocab.update([sample.existence])

    #pivot = int(len(samples) * config.CONFIG.split_ratio)
    #train_samples, test_samples = samples[:pivot], samples[pivot:]
    train_samples, test_samples = samples, []
    input_vocab = Vocab(input_vocab, special_tokens=VOCAB, freq_threshold=50)
    output_vocab = Vocab(output_vocab)
    return Dataset(filename, (train_samples, test_samples),
                   input_vocab=input_vocab,
                   output_vocab=output_vocab)
Example #12
    flush = False
    if flush:
        log.info('flushing...')
        ids = tuple((Sample._fields.index('squad_id'), ))
        dataset, vocabulary = load_squad_data('dataset/train-v1.1.json', ids)
        pickle.dump([dataset, dict(vocabulary)], open('train.squad', 'wb'))
    else:
        dataset, _vocabulary = pickle.load(open('train.squad', 'rb'))
        vocabulary = defaultdict(int)
        vocabulary.update(_vocabulary)

    log.info('dataset size: {}'.format(len(dataset)))
    log.info('dataset[:10]: {}'.format(pformat(dataset[0])))
    log.info('vocabulary: {}'.format(len(vocabulary)))

    VOCAB = Vocab(vocabulary, VOCAB)
    if 'train' in sys.argv:
        labelled_samples = [
            d for d in dataset[:10000] if len(d.a_positions) < 2
        ]  #[:100]
        pivot = int(Config().split_ratio * len(labelled_samples))
        random.shuffle(labelled_samples)
        train_set, test_set = labelled_samples[:pivot], labelled_samples[pivot:]

        train_set = sorted(train_set, key=lambda x: -len(x.context))
        test_set = sorted(test_set, key=lambda x: -len(x.context))
        exp_image = experiment(VOCAB,
                               dataset,
                               datapoints=[train_set, test_set])
Example #13
        vocabulary.update(_vocabulary)
        labels.update(_labels)

    log.info('trainset size: {}'.format(len(trainset)))
    log.info('trainset[:10]: {}'.format(pformat(trainset[0])))

    pprint(labels)
    """
    log.info('vocabulary: {}'.format(
        pformat(
            sorted(
                vocabulary.items(), key=lambda x: x[1], reverse=True)
        )))
    """

    log.info(pformat(labels))
    VOCAB = Vocab(vocabulary, VOCAB)
    LABELS = Vocab(labels, tokens=LABELS)
    pprint(LABELS.index2word)

    try:
        model = BiLSTMModel(config, 'macnet', len(VOCAB), len(LABELS))
        if config.CONFIG.cuda: model = model.cuda()
        model.load_state_dict(
            torch.load('{}/weights/{}.{}'.format(ROOT_DIR, SELF_NAME, 'pth')))
        log.info('loaded the old image for the model')
    except:
        log.exception('failed to load the model')

    model.eval()
    print('**** the model', model, model.training)
Example #14
def load_data_for_skipgram(config,
               filename='../dataset/lm_lengthsorted.txt',
               max_sample_size=None):
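    # Builds next-word Samples (sequence = all tokens but the last, label = the
    # last token) and, per sentence, skip-gram (center, context) index pairs;
    # idx_pairs is not used after the loop below.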
    
    samples = []
    skipped = 0

    input_vocab = Counter()
    output_vocab = Counter()
    
    try:
        log.info('processing file: {}'.format(filename))
        text_file = open(filename).readlines()[:config.HPCONFIG.max_samples]
        for i, l in tqdm(enumerate(text_file),
                            desc='processing {}'.format(filename)):

            sentence = l.strip().split()
            window_size = 2
            idx_pairs = []
            # for each sentence; word2idx is assumed to be a pre-built
            # word -> index mapping (it is not defined in this snippet)
            indices = [word2idx[word] for word in sentence]
            # for each word, treated as the center word
            for center_word_pos in range(len(indices)):
                # for each window position
                for w in range(-window_size, window_size + 1):
                    context_word_pos = center_word_pos + w
                    # make sure we stay inside the sentence
                    if (context_word_pos < 0
                        or context_word_pos >= len(indices)
                        or center_word_pos == context_word_pos):
                        continue
                    context_word_idx = indices[context_word_pos]
                    idx_pairs.append((indices[center_word_pos], context_word_idx))

            if len(sentence) > 3:
                samples.append(
                    Sample(i,
                           sentence[:-1],
                           sentence[-1]
                    )
                )
            
            if max_sample_size and len(samples) > max_sample_size:
                break

    except:
        skipped += 1
        log.exception('failed while processing {}'.format(filename))

    print('skipped {} samples'.format(skipped))
    
    samples = sorted(samples, key=lambda x: len(x.sequence), reverse=True)
    if max_sample_size:
        samples = samples[:max_sample_size]

    log.info('building input_vocabulary...')
    for sample in samples:
        input_vocab.update(sample.sequence + [sample.label])            

    pivot = int(len(samples) * config.CONFIG.split_ratio)
    train_samples, test_samples = samples[:pivot], samples[pivot:]
    vocab = Vocab(input_vocab, special_tokens=VOCAB)
    return Dataset(filename,
                   (train_samples, test_samples),
                   input_vocab = vocab,
                   output_vocab = vocab)
Example #15
def load_data(config, dirname='../dataset/', max_sample_size=None):
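    # Loads boy/girl name files plus 'neutral' words from the LM corpus, builds a
    # letter-level input vocabulary and a gender vocabulary, and turns each name
    # into fill-in-the-blank templates with two visible letters per sample.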

    samples = []
    skipped = 0

    input_vocab = Counter()
    gender_vocab = Counter()

    #########################################################
    # Read names
    #########################################################
    def read_data(filename='names.csv'):
        data = open(filename).readlines()
        samples = []
        for datum in data:
            name = datum.split(',')[1]
            name = ''.join(name.split())
            samples.append(remove_punct_symbols(name))

        return samples

    def read_dirs(dirs=['boy', 'girl']):
        samples = []
        for d in dirs:
            for filename in os.listdir('{}/{}'.format(dirname, d)):
                s = read_data('{}/{}/{}'.format(dirname, d, filename))
                s = [(d, n) for n in s]
                samples.extend(s)

        return list(set(samples))

    raw_samples = read_dirs()
    log.info('read {} names'.format(len(raw_samples)))

    #########################################################
    # Read tamil words
    #########################################################
    def read_words(filename=config.HPCONFIG.lm_dataset_path):
        samples = []
        for line in tqdm(
                open(filename).readlines()[:config.HPCONFIG.lm_samples_count],
                'reading lm file for words'):
            s = line.split()
            s = [('neutral', n) for n in s]
            samples.extend(s)

        return list(set(samples))

    pretrain_samples = read_words()

    #########################################################
    # build vocab
    #########################################################
    all_samples = raw_samples + pretrain_samples
    log.info('building input_vocabulary...')

    for gender, name in tqdm(all_samples, desc='building vocab'):
        name = remove_punct_symbols(name)
        name = tamil.utf8.get_letters(name.strip())
        if len(name):
            input_vocab.update(name)
            gender_vocab.update([gender])

    vocab = Vocab(input_vocab, special_tokens=VOCAB, freq_threshold=50)

    print(gender_vocab)
    gender_vocab = Vocab(gender_vocab, special_tokens=[])

    if config.CONFIG.write_vocab_to_file:
        vocab.write_to_file(config.ROOT_DIR + '/input_vocab.csv')
        gender_vocab.write_to_file(config.ROOT_DIR + '/gender_vocab.csv')

    def build_samples(raw_samples):
        nonlocal skipped
        samples = []
        for i, (gender,
                name) in enumerate(tqdm(raw_samples, desc='processing names')):
            try:

                #name = remove_punct_symbols(name)
                name = tamil.utf8.get_letters(name.strip())

                if len(name) < 2:
                    continue

                log.debug('===')
                log.debug(pformat(name))

                for a, b in zip(range(len(name)), range(1, len(name) - 1)):
                    template = list(NULL_CHAR * len(name))
                    template[a] = name[a]
                    template[b] = name[b]
                    samples.append(
                        Sample('{}.{}'.format(gender, i), gender, template,
                               name))

                if max_sample_size and len(samples) > max_sample_size:
                    break

            except:
                skipped += 1
                log.exception('{}'.format(name))

        return samples

    pretrain_samples = build_samples(pretrain_samples)
    samples = build_samples(raw_samples)
    print('skipped {} samples'.format(skipped))

    pivot = int(len(samples) * config.CONFIG.split_ratio)
    train_samples, test_samples = samples[:pivot], samples[pivot:]
    #train_samples, test_samples = samples, []

    #train_samples = sorted(train_samples, key=lambda x: len(x.sequence), reverse=True)

    return NameDataset('names', (train_samples, test_samples),
                       pretrain_samples=pretrain_samples,
                       input_vocab=vocab,
                       gender_vocab=gender_vocab)
Example #16
    dataset, vocabulary = load_squad_data('dataset/train-v1.1.json', ids)
    pickle.dump([dataset, dict(vocabulary)], open('train.squad', 'wb'))
else:
    dataset, _vocabulary = pickle.load(open('train.squad', 'rb'))
    vocabulary = defaultdict(int)
    vocabulary.update(_vocabulary)

log.info('dataset size: {}'.format(len(dataset)))
log.info('dataset[:10]: {}'.format(pformat(dataset[0])))
log.info('vocabulary: {}'.format(
    pformat(
        sorted(
            vocabulary.items(), key=lambda x: x[1], reverse=True)
    )))

VOCAB = Vocab(vocabulary, VOCAB, freq_threshold=100)
pprint(VOCAB.word2index)
if 'train' in sys.argv:
    labelled_samples = [d for d in dataset if len(d.a) > 0]  #[:100]
    pivot = int(Config().split_ratio * len(labelled_samples))
    random.shuffle(labelled_samples)
    train_set, test_set = labelled_samples[:pivot], labelled_samples[pivot:]

    train_set = sorted(train_set, key=lambda x: -len(x.a + x.story))
    test_set = sorted(test_set, key=lambda x: -len(x.a + x.story))
    exp_image = experiment(VOCAB, dataset, datapoints=[train_set, test_set])

if 'predict' in sys.argv:
    model = BiLSTMDecoderModel(Config(), len(VOCAB), len(LABELS))
    if Config().cuda: model = model.cuda()
    model.load_state_dict(torch.load('{}.{}'.format(SELF_NAME, 'pth')))
Example #17
def load_data(config,
              filename='../dataset/lm_lengthsorted.txt',
              max_sample_size=None):
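    # Builds de-duplicated (center, context) word pairs within a fixed window,
    # replacing rare words with 'UNK' and skipping sentences that are mostly UNK;
    # the bloom_filter Counter tracks already-seen pairs and can be dumped to CSV.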
    
    samples = []
    skipped = 0

    input_vocab = Counter()
    output_vocab = Counter()
    bloom_filter = Counter()
    try:
        log.info('processing file: {}'.format(filename))
        text_file = open(filename).readlines()
        
        log.info('building input_vocabulary...')
        sentences = set()
        for i, l in tqdm(enumerate(text_file[:config.HPCONFIG.max_samples]),
                            desc='processing {}'.format(filename)):

            sentence = remove_punct_symbols(l)
            sentence = sentence.strip().split()
            if len(sentence):
                input_vocab.update(sentence)
                sentences.add(tuple(sentence))

                
        freq_threshold = (config.HPCONFIG.freq_threshold * (float(config.HPCONFIG.max_samples)
                                                            /len(text_file)))
        log.info('freq_threshold: {}'.format(freq_threshold))
        vocab = Vocab(input_vocab,
                      special_tokens = VOCAB,
                      freq_threshold = int(freq_threshold))

        if config.CONFIG.write_vocab_to_file:
            vocab.write_to_file(config.ROOT_DIR + '/vocab.csv')
        
        for i, sentence in tqdm(enumerate(sentences),
                         desc='processing sentences'):

            if len(sentence) < 2:
                continue
            
            unk_ratio = float(count_UNKS(sentence, vocab))/len(sentence)

            log.debug('===')
            log.debug(pformat(sentence))
            
            sentence = [w if vocab[w] != vocab['UNK'] else 'UNK' for w in sentence]
            log.debug(pformat(sentence))

            if unk_ratio > 0.7:
                log.debug('unk ratio is heavy: {}'.format(unk_ratio))
                continue
                
            for center_word_pos, center_word in enumerate(sentence):
                for w in range(-config.HPCONFIG.window_size,
                                config.HPCONFIG.window_size + 1):
                    context_word_pos = center_word_pos + w
                    # make sure we stay inside the sentence
                    if (context_word_pos < 0
                        or context_word_pos >= len(sentence)
                        or center_word_pos == context_word_pos):
                        continue

                    pair = (center_word, sentence[context_word_pos])
                    if pair[0] != 'UNK' and pair[1] != 'UNK':
                        if pair not in bloom_filter:
                            samples.append(
                                Sample('{}.{}'.format(i, center_word_pos),
                                       #sentence,
                                       center_word,
                                       sentence[context_word_pos]
                                )
                            )
                        bloom_filter.update([pair])
                        
            if max_sample_size and len(samples) > max_sample_size:
                break

    except:
        skipped += 1
        log.exception('{}'.format(l))

    print('skipped {} samples'.format(skipped))

    if config.CONFIG.dump_bloom_filter:
        with open('word_pair.csv', 'w') as F:
            for k,v in bloom_filter.items():
                F.write('|'.join(list(k) + [str(v)]) + '\n')
                    
    #pivot = int(len(samples) * config.CONFIG.split_ratio)
    #train_samples, test_samples = samples[:pivot], samples[pivot:]
    train_samples, test_samples = samples, []

    return Dataset(filename,
                   (train_samples, test_samples),
                   input_vocab = vocab,
                   output_vocab = vocab)