def create_vocab(filename):
    # Count character frequencies over the whole corpus.
    char_vocab = defaultdict(lambda: 0)
    for line in utils.read_all_lines(filename):
        for word in line.split(' '):
            for char in word:
                char_vocab[char] += 1
    # Sort by descending frequency and write one "char:count" entry per line.
    char_vocab = sorted(char_vocab.items(), key=lambda x: -x[1])
    utils.write_all_lines(config.vocab_file,
                          ['{}:{}'.format(w, n) for w, n in char_vocab])
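# Illustrative sketch (not part of the original code): the same counting and
# sorting logic as create_vocab above, run on an in-memory corpus so the
# expected "char:count" output format is visible without utils/config.
from collections import defaultdict

def _sketch_char_vocab(corpus_lines):
    counts = defaultdict(int)
    for line in corpus_lines:
        for word in line.split(' '):
            for char in word:
                counts[char] += 1
    return ['{}:{}'.format(c, n) for c, n in sorted(counts.items(), key=lambda x: -x[1])]

# _sketch_char_vocab(['ab a', 'b a'])  ->  ['a:3', 'b:2']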
def create_question_vocab(filename):
    # Build a word-frequency vocabulary from the segmented questions.
    vocab = defaultdict(lambda: 0)
    for line in utils.read_all_lines(filename):
        sample = json.loads(line)
        question = sample['segmented_question']
        for word in question:
            vocab[word] += 1
    vocab = sorted(vocab.items(), key=lambda x: -x[1])
    utils.write_all_lines(config.question_vocab_file,
                          ['{}:{}'.format(w, c) for w, c in vocab])
def __init__(self):
    # Word- and character-level vocabularies: token->id, id->token, id->count.
    self.w2i, self.i2w, self.wi2n = load_vocab(config.word_vocab_file, config.word_vocab_size)
    self.c2i, self.i2c, self.ci2n = load_vocab(config.char_vocab_file, config.char_vocab_size)
    self.words = list(self.w2i.keys())
    self.chars = list(self.c2i.keys())
    # Character frequencies, normalized to a sampling distribution.
    self.char_weights = [self.ci2n[id] for id in range(len(self.chars))]
    self.norm_char_weights = np.array(self.char_weights) / np.sum(self.char_weights)
    self.train_set = load_qa(config.train_file, config.answer_limit)
    self.dev_set = load_qa(config.dev_file, config.answer_limit)
    self.stopwords = set(utils.read_all_lines(config.stopwords_file))
def __init__(self, opt):
    self.types = {}
    self.train_set, self.dev_set, self.test_set = [], [], []
    # Load every bAbI task, keeping 90% of each training file for training
    # and the remaining 10% as a development split.
    for train_file, test_file, tid, type in babi.enumerate_dataset(opt.babi_en_folder):
        train_set = babi.parse_stories(utils.read_all_lines(train_file))
        test_set = babi.parse_stories(utils.read_all_lines(test_file))
        train_size = len(train_set) * 9 // 10
        train_set, dev_set = train_set[:train_size], train_set[train_size:]
        self.train_set += self.add_type(train_set, tid)
        self.dev_set += self.add_type(dev_set, tid)
        self.test_set += self.add_type(test_set, tid)
        self.types[tid] = type
    data = self.train_set + self.dev_set + self.test_set
    # Collect the vocabulary over stories, questions and answers.
    vocab = sorted(
        reduce(lambda x, y: x | y,
               (set(list(chain.from_iterable(s)) + q + a) for s, q, a, _ in data)))
    word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
    word_idx['<PAD>'] = 0
    max_story_size = max(map(len, (s for s, _, _, _ in data)))
    #mean_story_size = int(np.mean([len(s) for s, _, _ in data]))
    sentence_size = max(map(len, chain.from_iterable(s for s, _, _, _ in data)))
    query_size = max(map(len, (q for _, q, _, _ in data)))
    memory_size = min(opt.memory_size, max_story_size)
    print(f'memory size: {memory_size}, sentence size: {sentence_size}, query size: {query_size}')
    # Add time words/indexes
    for i in range(memory_size):
        word_idx['time{}'.format(i + 1)] = len(word_idx)
    vocab_size = len(word_idx)
    sentence_size = max(query_size, sentence_size)  # for the position
    sentence_size += 1  # +1 for time words
    self.sentence_size = sentence_size
    self.vocab_size = vocab_size
    self.word_idx = word_idx
    self.memory_size = memory_size
    self.i2w = {k: v for v, k in self.word_idx.items()}
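# Illustrative sketch (assumption): data items are (story, question, answer,
# task_id) tuples where the story is a list of token lists, as the generator
# expressions above imply.  Shows how the vocabulary and time words are built.
from functools import reduce
from itertools import chain

data = [([['mary', 'went', 'home'], ['john', 'slept']], ['where', 'is', 'mary'], ['home'], 1)]
vocab = sorted(reduce(lambda x, y: x | y,
                      (set(list(chain.from_iterable(s)) + q + a) for s, q, a, _ in data)))
word_idx = {w: i + 1 for i, w in enumerate(vocab)}
word_idx['<PAD>'] = 0
memory_size = min(50, max(len(s) for s, _, _, _ in data))  # 50 is an assumed opt.memory_size
for i in range(memory_size):
    word_idx['time{}'.format(i + 1)] = len(word_idx)
# 7 distinct words here, so 'time1' and 'time2' receive ids 8 and 9.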
def create_answer_vocab(filename):
    # Build a word-frequency vocabulary from all segmented document paragraphs.
    vocab = defaultdict(lambda: 0)
    for line in utils.read_all_lines(filename):
        sample = json.loads(line)
        for doc in sample['documents']:
            for answer in doc['segmented_paragraphs']:
                for word in answer:
                    vocab[word] += 1
    vocab = sorted(vocab.items(), key=lambda x: -x[1])
    utils.write_all_lines(config.answer_vocab_file,
                          ['{}:{}'.format(w, c) for w, c in vocab])
def prepare_dataset_with_question_answers(source, target):
    # Keep answers that are substantially longer than their question and write
    # answer/question pairs separated by '<P>' markers.
    lines = []
    for line in utils.read_all_lines(source):
        sample = json.loads(line)
        question = sample['question']
        for answer in sample['answers']:
            if len(answer) > len(question) * 2 and len(answer) >= 20:
                lines.append(answer)
                lines.append(question)
                lines.append('<P>')
    utils.write_all_lines(target, lines)
def load_qa(filename, answer_limit=0):
    # The file alternates question and answer lines; pair them up and
    # optionally drop answers longer than answer_limit tokens.
    q = None
    qas = []
    for line in utils.read_all_lines(filename):
        if q is not None:
            answer = line.split(' ')
            if answer_limit == 0 or len(answer) <= answer_limit:
                qas.append((q.split(' '), answer))
            q = None
        else:
            q = line
    return qas
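# Illustrative sketch (assumption: the file alternates question and answer
# lines, as load_qa above implies).  Same pairing logic on an in-memory list
# so the expected layout is explicit.
def _sketch_load_qa(lines, answer_limit=0):
    q, qas = None, []
    for line in lines:
        if q is None:
            q = line
        else:
            answer = line.split(' ')
            if answer_limit == 0 or len(answer) <= answer_limit:
                qas.append((q.split(' '), answer))
            q = None
    return qas

# _sketch_load_qa(['what is x', 'x is y', 'who is z', 'z is w'], answer_limit=3)
# -> [(['what', 'is', 'x'], ['x', 'is', 'y']), (['who', 'is', 'z'], ['z', 'is', 'w'])]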
def load_vocab(filename, count):
    # Reserve ids for the special tokens, then take the `count` most frequent
    # entries from a "word:count" vocabulary file.
    w2i = {NULL: NULL_ID, OOV: OOV_ID, SOS: SOS_ID, EOS: EOS_ID}
    i2c = {NULL_ID: 0, SOS_ID: 0, EOS_ID: 0}
    all_entries = list(utils.read_all_lines(filename))
    count -= len(w2i)
    count = min(count, len(all_entries))
    for line in all_entries[:count]:
        word, freq = line.rsplit(':', 1)
        id = len(w2i)
        w2i[word] = id
        i2c[id] = int(freq)
    i2w = {k: v for v, k in w2i.items()}
    # Remaining entries are folded into the OOV count.
    i2c[OOV_ID] = len(all_entries) - count
    return w2i, i2w, i2c
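# Illustrative sketch of the id assignment in load_vocab above, using literal
# special tokens and ids (these values are assumptions, not taken from the
# original config) and an in-memory "word:count" list such as create_vocab writes.
def _sketch_load_vocab(entries, count, specials=('<null>', '<oov>', '<sos>', '<eos>')):
    w2i = {w: i for i, w in enumerate(specials)}
    i2c = {i: 0 for i in range(len(specials))}
    count = min(count - len(w2i), len(entries))
    for line in entries[:count]:
        word, freq = line.rsplit(':', 1)   # rsplit keeps ':' characters inside the word
        w2i[word] = len(w2i)
        i2c[w2i[word]] = int(freq)
    i2w = {i: w for w, i in w2i.items()}
    i2c[1] = len(entries) - count          # leftover entries counted as OOV
    return w2i, i2w, i2c

# _sketch_load_vocab(['the:120', 'a:80', 'of:75'], count=6)
# keeps 'the' and 'a' (ids 4 and 5) and folds the remaining 1 entry into OOV.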
def load_qa(filename, answer_limit=0):
    # Records are separated by '<P>' lines: the first line of each record is
    # the passage, the remaining lines are its questions.
    lines = []
    r = []
    for line in utils.read_all_lines(filename):
        if line == '<P>':
            passage = lines[0].split(' ')
            if len(''.join(passage)) <= config.max_passage_len:
                questions = [q.split(' ') for q in lines[1:]
                             if len(q.replace(' ', '')) <= config.max_question_len]
                if questions:
                    r.append((passage, questions))
            lines.clear()
        else:
            lines.append(line)
    return r
def prepare_dataset_with_document(source, target):
    # Use the most related paragraph of each selected document as the answer
    # and the document title as the question.
    lines = []
    for line in utils.read_all_lines(source):
        sample = json.loads(line)
        documents = [doc for doc in sample['documents'] if doc['is_selected']]
        questions = [doc['title'] for doc in documents]
        para_indices = [doc['most_related_para'] for doc in documents]
        answers = [doc['paragraphs'][k] for doc, k in zip(documents, para_indices)]
        for q, a in zip(questions, answers):
            lines.append(rip_marks(a))
            lines.append(rip_marks(q))
            lines.append('<P>')
    utils.write_all_lines(target, lines)
def load_vocab(filename, count):
    # Reserve ids for the special tokens, then take the `count` most frequent
    # entries from a "word:count" vocabulary file.
    w2i = {
        config.NULL: config.NULL_ID,
        config.OOV: config.OOV_ID,
        config.SOS: config.SOS_ID,
        config.EOS: config.EOS_ID
    }
    count -= len(w2i)
    i2c = {}
    all_entries = list(utils.read_all_lines(filename))
    for line in all_entries[:count]:
        word, freq = line.rsplit(':', 1)
        id = len(w2i)
        w2i[word] = id
        i2c[id] = int(freq)
    i2w = {k: v for v, k in w2i.items()}
    # Remaining entries are folded into the OOV count.
    i2c[config.OOV_ID] = len(all_entries) - count
    return w2i, i2w, i2c
def load_qa(filename):
    # Records are separated by '<P>' lines: first the passage, then its
    # questions.  Passages outside the configured length range are skipped.
    lines = []
    r = []
    skipped = 0
    for line in utils.read_all_lines(filename):
        if line == '<P>':
            passage = lines[0].replace(' ', '')
            if config.min_limit <= len(passage) <= config.max_limit:
                questions = [process_question(q) for q in lines[1:]]
                if questions:
                    r.append((passage, questions))
            else:
                skipped += 1
            lines.clear()
        else:
            lines.append(line)
    print('skipped {} records in {}'.format(skipped, filename))
    return r
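# Illustrative sketch (assumption): the '<P>'-separated layout written by the
# prepare_dataset_* functions and read back by the load_qa variants above --
# one passage line followed by its question lines, then a '<P>' separator.
def _sketch_split_records(lines):
    records, buf = [], []
    for line in lines:
        if line == '<P>':
            if buf:
                records.append((buf[0], buf[1:]))   # (passage, questions)
            buf = []
        else:
            buf.append(line)
    return records

# _sketch_split_records(['passage text', 'question one', 'question two', '<P>'])
# -> [('passage text', ['question one', 'question two'])]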
def analyze(source_path, test_class):
    all_lines = read_all_lines(source_path)
    paths = prefix_string_each(constants.DB_PATH, all_lines)
    paths = paths[int(len(paths) * 0.7):]  # keep only the last 30% of the data for testing
    correct_count = 0
    for path in paths:
        files = read_all_files(path, '*.png')
        # The first frame is the neutral expression, the last frame the apex.
        neutral = files[0]
        apex = files[-1]
        classification = classify(apex, neutral)
        classification_class = classification["class"]
        print(neutral)
        print(apex)
        print(classification_class)
        print('-------------------')
        if classification_class == test_class:
            correct_count += 1
    print('Total count: ', len(paths))
    print('Correct count for ' + test_class + ' is: ' + str(correct_count))
    print('% correct: ' + str(correct_count / len(paths)))
def prepare_dataset_with_document(source, target):
    # Keep paragraphs that share enough non-stopword vocabulary with the
    # question and are long enough relative to it.
    lines = []
    for line in utils.read_all_lines(source):
        sample = json.loads(line)
        documents = sample['documents']
        questions = [sample['segmented_question']] + [doc['segmented_title'] for doc in documents]
        question_words = set(questions[0]) - stop_words
        questions = [' '.join(question) for question in questions]
        for doc in documents:
            for passage in doc['segmented_paragraphs']:
                passage_words = set(passage) - stop_words
                common = question_words & passage_words
                passage = rip_marks(' '.join(passage))
                if len(common) / len(question_words) > 0.3 and len(passage) > 2 * len(questions[0]):
                    lines.append(passage)
                    lines += list(set(questions))
                    lines.append('<P>')
    utils.write_all_lines(target, lines)
def prepare_dataset_with_document(source, target):
    # Collect (answer, question) pairs whose non-stopword overlap with the
    # question exceeds 30% and whose answer is long enough relative to it.
    aqs = []
    all = 0
    for line in utils.read_all_lines(source):
        sample = json.loads(line)
        question = sample['segmented_question']
        question_words = set(question) - stop_words
        for doc in sample['documents']:
            for answer in doc['segmented_paragraphs']:
                answer_words = set(answer) - stop_words
                common = question_words & answer_words
                if len(common) / len(question_words) > 0.3:
                    a = rip_marks(str.join(' ', answer))
                    q = rip_marks(str.join(' ', question))
                    if len(a) > 2 * len(q):
                        aqs.append((a, q))
                all += 1
    print('{}: {}/{} preprocessed'.format(source, len(aqs), all))
    #utils.save_json(target, [{'q': q, 'a': a} for a, q in aqs])
    utils.write_all_lines(target, ['{}\n{}\n'.format(q, a) for a, q in aqs])
    return aqs
def __init__(self, opt):
    self.output_file = opt.summary_file
    self.lines = list(utils.read_all_lines(self.output_file))
import utils

# Take a slice of the evaluation file; lines alternate between source tokens
# and target tags, both '$'-separated.
lines = list(utils.read_all_lines('./eval.csv'))[100:200]
source = []
target = []
for line in lines:
    line = line.split('$')
    if len(source) == len(target):
        source.append(line)
    else:
        target.append(line)
assert len(source) == len(target)

# Render each token sequence with inline <TAG>...</TAG> markup driven by the
# S-/B-/E- prefixes of the corresponding tags.
lines = []
for s, t in zip(source, target):
    assert len(s) == len(t)
    line = ''
    for x, y in zip(s, t):
        if y.startswith('S-'):
            line += f'<{y[2:]}>{x}</{y[2:]}>'
        elif y.startswith('B-'):
            line += f'<{y[2:]}>{x}'
        elif y.startswith('E-'):
            line += f'{x}</{y[2:]}>'
        else:
            line += x
    lines.append(line)
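# Illustrative sketch of the tag-to-markup step above on literal data
# (the tokens and tags are made up; the scheme is S- single, B- begin,
# M- middle, E- end, O outside, matching the script above).
def _sketch_render(tokens, tags):
    out = ''
    for x, y in zip(tokens, tags):
        if y.startswith('S-'):
            out += f'<{y[2:]}>{x}</{y[2:]}>'
        elif y.startswith('B-'):
            out += f'<{y[2:]}>{x}'
        elif y.startswith('E-'):
            out += f'{x}</{y[2:]}>'
        else:
            out += x
    return out

# _sketch_render(list('北京在中国'), ['B-LOC', 'E-LOC', 'O', 'B-LOC', 'E-LOC'])
# -> '<LOC>北京</LOC>在<LOC>中国</LOC>'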
import utils
import re

lines = []
for line in utils.read_all_lines('eval.processed.txt'):
    source = []
    target = []

    def process_others(start, end):
        # Characters outside any tagged span get the 'O' (outside) label.
        for c in line[start:end]:
            source.append(c)
            target.append('O')

    def append_source(span):
        part = line[span[0]:span[1]]
        for c in part:
            source.append(c)

    def append_target(span, source_span):
        # Emit S- for single-character spans, otherwise B-/M-/E- boundaries.
        slen = source_span[1] - source_span[0]
        tag = line[span[0]:span[1]].upper()
        global target
        if slen == 1:
            target.append('S-' + tag)
        else:
            target += ['B-' + tag] + ['M-' + tag] * (slen - 2) + ['E-' + tag]

    def join(tp):
        return '$'.join(tp)

    last_pos = 0
import json
import os
import utils
import config
import re
import data
from collections import defaultdict

stop_words = set(utils.read_all_lines(config.stopwords_file))


def create_question_vocab(filename):
    # Build a word-frequency vocabulary from the segmented questions.
    vocab = defaultdict(lambda: 0)
    for line in utils.read_all_lines(filename):
        sample = json.loads(line)
        question = sample['segmented_question']
        for word in question:
            vocab[word] += 1
    vocab = sorted(vocab.items(), key=lambda x: -x[1])
    utils.write_all_lines(config.question_vocab_file,
                          ['{}:{}'.format(w, c) for w, c in vocab])


def create_answer_vocab(filename):
    # Build a word-frequency vocabulary from all segmented document paragraphs.
    vocab = defaultdict(lambda: 0)
    for line in utils.read_all_lines(filename):
        sample = json.loads(line)
        for doc in sample['documents']:
            for answer in doc['segmented_paragraphs']:
                for word in answer:
                    vocab[word] += 1
    vocab = sorted(vocab.items(), key=lambda x: -x[1])
    utils.write_all_lines(config.answer_vocab_file,
                          ['{}:{}'.format(w, c) for w, c in vocab])