def create_vocab(filename):
    char_vocab = defaultdict(lambda: 0)
    for line in utils.read_all_lines(filename):
        for word in line.split(' '):
            for char in word:
                char_vocab[char] += 1
    char_vocab = sorted(char_vocab.items(), key=lambda x: -x[1])
    utils.write_all_lines(config.vocab_file,
                          ['{}:{}'.format(w, n) for w, n in char_vocab])
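Every example below relies on the same small utils module. A minimal sketch of the two helpers it is assumed to provide (read and write UTF-8 text files line by line; the real module may do more) could look like this:

def read_all_lines(filename):
    # Yield each line of a UTF-8 text file with the trailing newline stripped.
    with open(filename, encoding='utf-8') as f:
        for line in f:
            yield line.rstrip('\n')


def write_all_lines(filename, lines):
    # Write one string per line to a UTF-8 text file.
    with open(filename, 'w', encoding='utf-8') as f:
        for line in lines:
            f.write(line + '\n')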
Example #2
def create_question_vocab(filename):
    vocab = defaultdict(lambda: 0)
    for line in utils.read_all_lines(filename):
        sample = json.loads(line)
        question = sample['segmented_question']
        for word in question:
            vocab[word] += 1
    vocab = sorted(vocab.items(), key=lambda x: -x[1])
    utils.write_all_lines(config.question_vocab_file,
                          ['{}:{}'.format(w, c) for w, c in vocab])
Example #3
    def __init__(self):
        self.w2i, self.i2w, self.wi2n = load_vocab(config.word_vocab_file, config.word_vocab_size)
        self.c2i, self.i2c, self.ci2n = load_vocab(config.char_vocab_file, config.char_vocab_size)
        self.words = list(self.w2i.keys())
        self.chars = list(self.c2i.keys())
        self.char_weights = [self.ci2n[id] for id in range(len(self.chars))]
        self.norm_char_weights = self.char_weights / np.sum(self.char_weights)
        self.train_set = load_qa(config.train_file, config.answer_limit)
        self.dev_set = load_qa(config.dev_file, config.answer_limit)
        self.stopwords = set(utils.read_all_lines(config.stopwords_file))
Example #4
    def __init__(self, opt):
        self.types = {}
        self.train_set, self.dev_set, self.test_set = [], [], []
        for train_file, test_file, tid, type in babi.enumerate_dataset(
                opt.babi_en_folder):
            train_set = babi.parse_stories(utils.read_all_lines(train_file))
            test_set = babi.parse_stories(utils.read_all_lines(test_file))
            train_size = len(train_set) * 9 // 10
            train_set, dev_set = train_set[:train_size], train_set[train_size:]
            self.train_set += self.add_type(train_set, tid)
            self.dev_set += self.add_type(dev_set, tid)
            self.test_set += self.add_type(test_set, tid)
            self.types[tid] = type
        data = self.train_set + self.dev_set + self.test_set
        vocab = sorted(
            reduce(lambda x, y: x | y,
                   (set(list(chain.from_iterable(s)) + q + a)
                    for s, q, a, _ in data)))
        word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
        word_idx['<PAD>'] = 0
        max_story_size = max(map(len, (s for s, _, _, _ in data)))
        #mean_story_size = int(np.mean([ len(s) for s, _, _ in data]))
        sentence_size = max(
            map(len, chain.from_iterable(s for s, _, _, _ in data)))
        query_size = max(map(len, (q for _, q, _, _ in data)))
        memory_size = min(opt.memory_size, max_story_size)
        print(
            f'memory size: {memory_size}, sentence size: {sentence_size}, query size: {query_size}'
        )
        # Add time words/indexes
        for i in range(memory_size):
            word_idx['time{}'.format(i + 1)] = len(word_idx)

        vocab_size = len(word_idx)
        sentence_size = max(query_size, sentence_size)  # for the position
        sentence_size += 1  # +1 for time words

        self.sentence_size = sentence_size
        self.vocab_size = vocab_size
        self.word_idx = word_idx
        self.memory_size = memory_size
        self.i2w = {k: v for v, k in self.word_idx.items()}
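Once word_idx and sentence_size are built as above, each tokenized sentence can be turned into a fixed-length index vector. The padding helper below is a hedged sketch, not part of the original code; it assumes unknown tokens fall back to the <PAD> index 0:

def vectorize_sentence(sentence, word_idx, sentence_size):
    # Map tokens to indices (0 is <PAD> in the example above) and right-pad
    # the result to the fixed sentence length.
    ids = [word_idx.get(w, 0) for w in sentence]
    return ids + [0] * (sentence_size - len(ids))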
Example #5
def create_answer_vocab(filename):
    vocab = defaultdict(lambda: 0)
    for line in utils.read_all_lines(filename):
        sample = json.loads(line)
        for doc in sample['documents']:
            for answer in doc['segmented_paragraphs']:
                for word in answer:
                    vocab[word] += 1
    vocab = sorted(vocab.items(), key=lambda x: -x[1])
    utils.write_all_lines(config.answer_vocab_file,
                          ['{}:{}'.format(w, c) for w, c in vocab])
def prepare_dataset_with_question_answers(source, target):
    lines = []
    for line in utils.read_all_lines(source):
        sample = json.loads(line)
        question = sample['question']
        for answer in sample['answers']:
            if len(answer) > len(question) * 2 and len(answer) >= 20:
                lines.append(answer)
                lines.append(question)
                lines.append('<P>')
    utils.write_all_lines(target, lines)
Example #7
def load_qa(filename, answer_limit=0):
    q = None
    qas = []
    for line in utils.read_all_lines(filename):
        if q is not None:
            answer = line.split(' ')
            if answer_limit == 0 or len(answer) <= answer_limit:
                qas.append((q.split(' '), answer))
            q = None
        else:
            q = line
    return qas
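This loader assumes the input file alternates question and answer lines. A hypothetical round trip, using invented sentences, a made-up file name, and the write_all_lines helper sketched earlier:

def demo_load_qa():
    # Write one question line followed by one answer line, then read the
    # pair back as (question_tokens, answer_tokens).
    utils.write_all_lines('qa_demo.txt', ['what is the capital of france',
                                          'paris is the capital of france'])
    return load_qa('qa_demo.txt')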
Example #8
def load_vocab(filename, count):
    w2i = {NULL: NULL_ID, OOV: OOV_ID, SOS: SOS_ID, EOS: EOS_ID}
    i2c = {NULL_ID: 0, SOS_ID: 0, EOS_ID: 0}
    all_entries = list(utils.read_all_lines(filename))
    count -= len(w2i)
    count = min(count, len(all_entries))
    for line in all_entries[:count]:
        word, freq = line.rsplit(':', 1)
        id = len(w2i)
        w2i[word] = id
        i2c[id] = int(freq)
    i2w = {k: v for v, k in w2i.items()}
    i2c[OOV_ID] = len(all_entries) - count
    return w2i, i2w, i2c
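A typical use of the returned w2i mapping is encoding tokens to ids with an out-of-vocabulary fallback. The helper below is hypothetical, not part of the example; it assumes the same OOV_ID constant:

def encode(tokens, w2i):
    # Look up each token, falling back to the shared OOV id for unknown words.
    return [w2i.get(token, OOV_ID) for token in tokens]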
Example #9
def load_qa(filename, answer_limit=0):
    lines = []
    r = []
    for line in utils.read_all_lines(filename):
        if line == '<P>':
            passage = lines[0].split(' ')
            if len(''.join(passage)) <= config.max_passage_len:
                questions = [q.split(' ') for q in lines[1:] if len(q.replace(' ', '')) <= config.max_question_len]
                if questions:
                    r.append((passage, questions))
            lines.clear()
        else:
            lines.append(line)
    return r
def prepare_dataset_with_document(source, target):
    lines = []
    for line in utils.read_all_lines(source):
        sample = json.loads(line)
        documents = [doc for doc in sample['documents'] if doc['is_selected']]
        questions = [doc['title'] for doc in documents]
        para_indices = [doc['most_related_para'] for doc in documents]
        answers = [
            doc['paragraphs'][k] for doc, k in zip(documents, para_indices)
        ]
        for q, a in zip(questions, answers):
            lines.append(rip_marks(a))
            lines.append(rip_marks(q))
            lines.append('<P>')
    utils.write_all_lines(target, lines)
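rip_marks is called in several of these snippets but never shown. A plausible stand-in, whose behavior is only guessed (collapse whitespace and trim the ends; the real helper may also strip punctuation):

import re

def rip_marks(text):
    # Guessed behavior: normalize runs of whitespace to single spaces.
    return re.sub(r'\s+', ' ', text).strip()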
Example #11
def load_vocab(filename, count):
    w2i = {
        config.NULL: config.NULL_ID,
        config.OOV: config.OOV_ID,
        config.SOS: config.SOS_ID,
        config.EOS: config.EOS_ID
    }
    count -= len(w2i)
    i2c = {}
    all_entries = list(utils.read_all_lines(filename))
    for line in all_entries[:count]:
        word, freq = line.rsplit(':', 1)
        id = len(w2i)
        w2i[word] = id
        i2c[id] = freq
    i2w = {k: v for v, k in w2i.items()}
    i2c[config.OOV_ID] = len(all_entries) - count
    return w2i, i2w, i2c
Example #12
def load_qa(filename):
    lines = []
    r = []
    skipped = 0
    for line in utils.read_all_lines(filename):
        if line == '<P>':
            passage = lines[0].replace(' ', '')
            if config.min_limit <= len(passage) <= config.max_limit:
                questions = [process_question(q) for q in lines[1:]]
                if questions:
                    r.append((passage, questions))
            else:
                skipped += 1
            lines.clear()
        else:
            lines.append(line)
    print('skipped {} records in {}'.format(skipped, filename))
    return r
Example #13
def analyze(source_path, test_class):
    all_lines = read_all_lines(source_path)
    paths = prefix_string_each(constants.DB_PATH, all_lines)
    paths = paths[int(len(paths)*0.7):len(paths)]  # Get only 30% of data for testing
    correct_count = 0
    for path in paths:
        files = read_all_files(path, '*.png')
        neutral = files[0]
        apex = files[len(files)-1]
        classification = classify(apex, neutral)
        classification_class = classification["class"]
        print(neutral)
        print(apex)
        print(classification_class)
        print('-------------------')
        if classification_class == test_class:
            correct_count += 1
    print('Total count: ', len(paths))
    print('Correct count for ' + test_class + ' is: ' + str(correct_count))
    print('% correct: '+str(correct_count/len(paths)))
Example #14
def prepare_dataset_with_document(source, target):
    lines = []
    for line in utils.read_all_lines(source):
        sample = json.loads(line)
        documents = sample['documents']
        questions = [sample['segmented_question']] + [doc['segmented_title'] for doc in documents]
        question_words = set(questions[0]) - stop_words
        questions = [' '.join(question) for question in questions]
        for doc in documents:
            for passage in doc['segmented_paragraphs']:
                passage_words = set(passage) - stop_words
                common = question_words & passage_words
                passage = rip_marks(' '.join(passage))
                if (len(common) / len(question_words) > 0.3
                        and len(passage) > 2 * len(questions[0])):
                    lines.append(passage)
                    lines += list(set(questions))
                    lines.append('<P>')
    utils.write_all_lines(target, lines)
Example #15
def prepare_dataset_with_document(source, target):
    aqs = []
    all = 0
    for line in utils.read_all_lines(source):
        sample = json.loads(line)
        question = sample['segmented_question']
        question_words = set(question) - stop_words
        for doc in sample['documents']:
            for answer in doc['segmented_paragraphs']:
                answer_words = set(answer) - stop_words
                common = question_words & answer_words
                if len(common) / len(question_words) > 0.3:
                    a = rip_marks(str.join(' ', answer))
                    q = rip_marks(str.join(' ', question))
                    if len(a) > 2 * len(q):
                        aqs.append((a, q))
                all += 1
    print('{}: {}/{} preprocessed'.format(source, len(aqs), all))
    #utils.save_json(target, [{'q': q, 'a': a} for a,q in aqs])
    utils.write_all_lines(target, ['{}\n{}\n'.format(q, a) for a, q in aqs])
    return aqs
Example #16
    def __init__(self, opt):
        self.output_file = opt.summary_file
        self.lines = list(utils.read_all_lines(self.output_file))
Example #17
import utils

lines = list(utils.read_all_lines('./eval.csv'))[100:200]

source = []
target = []

for line in lines:
    line = line.split('$')
    if len(source) == len(target):
        source.append(line)
    else:
        target.append(line)

assert len(source) == len(target)

lines = []
for s, t in zip(source, target):
    assert len(s) == len(t)
    line = ''
    for x, y in zip(s, t):
        if y.startswith('S-'):
            line += f'<{y[2:]}>{x}</{y[2:]}>'
        elif y.startswith('B-'):
            line += f'<{y[2:]}>{x}'
        elif y.startswith('E-'):
            line += f'{x}</{y[2:]}>'
        else:
            line += x
    lines.append(line)
Example #18
import utils
import re

lines = []
for line in utils.read_all_lines('eval.processed.txt'):
    source = []
    target = []

    def process_others(start, end):
        for c in line[start:end]:
            source.append(c)
            target.append('O')

    def append_source(span):
        part = line[span[0]:span[1]]
        for c in part:
            source.append(c)

    def append_target(span, source_span):
        slen = source_span[1] - source_span[0]
        tag = line[span[0]:span[1]].upper()
        global target
        if slen == 1:
            target.append('S-' + tag)
        else:
            target += ['S-' + tag] + ['M-' + tag] * (slen - 2) + ['E-' + tag]

    def join(tp):
        return '$'.join(tp)

    last_pos = 0
Example #19
import json
import os
import utils
import config
import re
import data
from collections import defaultdict

stop_words = set(utils.read_all_lines(config.stopwords_file))


def create_question_vocab(filename):
    vocab = defaultdict(lambda: 0)
    for line in utils.read_all_lines(filename):
        sample = json.loads(line)
        question = sample['segmented_question']
        for word in question:
            vocab[word] += 1
    vocab = sorted(vocab.items(), key=lambda x: -x[1])
    utils.write_all_lines(config.question_vocab_file,
                          ['{}:{}'.format(w, c) for w, c in vocab])


def create_answer_vocab(filename):
    vocab = defaultdict(lambda: 0)
    for line in utils.read_all_lines(filename):
        sample = json.loads(line)
        for doc in sample['documents']:
            for answer in doc['segmented_paragraphs']:
                for word in answer:
                    vocab[word] += 1