class SampledTrainer(Trainer):
    """Trainer that iteratively grows its training set with the hardest
    (misclassified) samples drawn from a larger dataset."""

    def __init__(self):
        arg_parser = create_parser(usage)
        super().__init__(arg_parser)
        if self.args.invert_samples:
            # Flag exists for CLI parity only; this trainer rejects it.
            arg_parser.error('--invert-samples should be left blank')
        samples_file = self.args.samples_file or '{model_base}.samples.json'
        self.args.samples_file = samples_file.format(model_base=self.model_base)
        self.samples, self.hash_to_ind = self.load_sample_data(
            self.args.samples_file, self.train)
        # Tab-separated per-cycle log of (sampled fraction, accuracy).
        self.metrics_fiti = Fitipy(self.model_base + '.logs',
                                   'sampling-metrics.txt')

    def write_sampling_metrics(self, predicted):
        """Append this cycle's whole-dataset accuracy to the metrics log."""
        num_targets = len(self.train[1])
        accuracy = float(
            sum((predicted > 0.5) == (self.train[1] > 0.5)) / num_targets)
        print('Successfully calculated: {0:.3%}'.format(accuracy))
        log_lines = self.metrics_fiti.read().lines()
        log_lines.append('{}\t{}'.format(
            len(self.samples) / num_targets, accuracy))
        self.metrics_fiti.write().lines(log_lines)

    def choose_new_samples(self, predicted):
        """Yield up to num_sample_chunk hashes of misclassified samples
        that are not already in the sample pool."""
        misclassified = set()
        for inp, pred, target in zip(self.train[0], predicted, self.train[1]):
            if (pred > 0.5) != (target > 0.5):
                misclassified.add(calc_sample_hash(inp, target))
        new_samples = misclassified - self.samples
        print('Remaining failed samples:', len(new_samples))
        return islice(new_samples, self.args.num_sample_chunk)

    def run(self):
        """Alternate whole-set evaluation, sample-pool growth, and fitting."""
        print('Writing to:', self.args.samples_file)
        print('Writing metrics to:', self.metrics_fiti.path)
        for _ in range(self.args.cycles):
            print('Calculating on whole dataset...')
            predictions = self.model.predict(self.train[0])
            self.samples.update(self.choose_new_samples(predictions))
            # Persist the pool every cycle so progress survives restarts.
            Fitipy(self.args.samples_file).write().set(self.samples)
            print('Added', self.args.num_sample_chunk, 'samples')
            self.write_sampling_metrics(predictions)
            self.model.fit(
                *self.sampled_data, batch_size=self.args.batch_size,
                epochs=self.epoch + self.args.epochs,
                callbacks=self.callbacks, initial_epoch=self.epoch,
                validation_data=self.test)
class FilesystemService(ServicePlugin):
    """Service plugin exposing simple filesystem access rooted at a
    configuration directory, with Fitipy-backed read/write helpers."""

    def __init__(self, rt, root=None):
        ServicePlugin.__init__(self, rt)
        # Fall back to the user config directory when no root is given
        # (or when an empty string is passed, matching `root or ...`).
        self.root = root if root else expanduser(rt.paths.user_config)
        self.fiti = Fitipy(self.root)
        if not self.isdir(''):
            self.mkdir('')

    def read(self, *path) -> FitiReader:
        """Open a Fitipy reader for a file under the service root."""
        return self.fiti.read(*path)

    def write(self, *path) -> FitiWriter:
        """Open a Fitipy writer for a file under the service root."""
        return self.fiti.write(*path)

    def subdir(self, *path):
        """Return a new FilesystemService rooted at a subdirectory."""
        return FilesystemService(self.rt, self.path(*path))

    def open(self, *path, mode='r'):
        """Open a file under the root with the built-in open()."""
        return open(self.path(*path), mode)

    def isfile(self, *path):
        return isfile(self.path(*path))

    def isdir(self, *path):
        return isdir(self.path(*path))

    def mkdir(self, *path):
        # exist_ok makes repeated creation a no-op.
        makedirs(self.path(*path), exist_ok=True)

    def path(self, *path):
        """Join the given components onto the service root."""
        return join(self.root, *path)
class SavedJson:
    """Dict-like mapping that persists itself to a JSON file on modification.

    Fix: the original ``update()`` passed its ``data`` argument straight to
    ``dict.update``, so calling ``update(key=value)`` with no positional
    argument raised ``TypeError`` (``dict.update(None)`` is invalid).
    ``get`` also gains a dict-compatible ``default`` parameter
    (backward-compatible: it defaults to ``None``, the old behavior).
    """

    def __init__(self, filename):
        super().__init__()
        self.filename = filename
        self.fiti = Fitipy(self.filename)
        # Load the current contents from disk (empty dict if absent).
        self.data = self.fiti.read().dict()

    def __getitem__(self, item):
        return self.data[item]

    def __setitem__(self, key, value):
        # Ellipsis sentinel so re-storing an existing value — including
        # None — skips the disk write.
        changed = self.data.get(key, ...) != value
        self.data[key] = value
        if changed:
            self._save()

    def __delitem__(self, key):
        del self.data[key]
        self._save()

    def __contains__(self, key):
        return key in self.data

    def __len__(self):
        return len(self.data)

    def __iter__(self):
        return iter(self.data)

    def get(self, k, default=None):
        """Return self[k] if present, else *default* (mirrors dict.get)."""
        return self.data.get(k, default)

    def update(self, data=None, **kwargs):
        """Merge *data* (mapping/iterable, optional) and keyword items,
        then persist the result to disk."""
        if data is not None:
            self.data.update(data)
        self.data.update(kwargs)
        self._save()

    def _save(self):
        # Write the full mapping back to disk via Fitipy.
        self.fiti.write().dict(self.data)
class TrainSampledScript(TrainScript):
    """Train a model by iteratively sampling the hardest data points.

    Each cycle runs prediction over the full training set, adds hashes of
    newly misclassified samples to a JSON-persisted pool, records accuracy
    metrics, then fits the model on the sampled subset.
    """
    usage = Usage(''' Train a model, sampling data points with the highest loss from a larger dataset :-c --cycles int 200 Number of sampling cycles of size {epoch} to run :-n --num-sample-chunk int 50 Number of new samples to introduce at a time between training cycles :-sf --samples-file str - Json file to write selected samples to. Default = {model_base}.samples.json :-is --invert-samples Unused parameter ... ''') | TrainScript.usage

    def __init__(self, args):
        super().__init__(args)
        if self.args.invert_samples:
            # Flag is listed in usage for interface parity only.
            raise ValueError('--invert-samples should be left blank')
        # Default the samples file to live next to the model base.
        self.args.samples_file = (self.args.samples_file or '{model_base}.samples.json').format(
            model_base=self.model_base)
        # samples: set of hashes already selected for training;
        # hash_to_ind: maps a sample hash back to its index in self.train.
        self.samples, self.hash_to_ind = self.load_sample_data(
            self.args.samples_file, self.train)
        # Tab-separated per-cycle log of (sampled fraction, accuracy).
        self.metrics_fiti = Fitipy(self.model_base + '.logs', 'sampling-metrics.txt')

    def write_sampling_metrics(self, predicted):
        """Append this cycle's whole-dataset accuracy to the metrics log.

        predicted: model outputs over self.train[0] — presumably a numpy
        array of probabilities (elementwise > comparisons are used);
        TODO confirm against the model's predict() return type.
        """
        correct = float(
            sum((predicted > 0.5) == (self.train[1] > 0.5)) / len(self.train[1]))
        print('Successfully calculated: {0:.3%}'.format(correct))
        lines = self.metrics_fiti.read().lines()
        lines.append('{}\t{}'.format(
            len(self.samples) / len(self.train[1]), correct))
        self.metrics_fiti.write().lines(lines)

    def choose_new_samples(self, predicted):
        """Return up to num_sample_chunk hashes of misclassified samples
        that are not yet in the sample pool."""
        failed_samples = {
            calc_sample_hash(inp, target)
            for i, (inp, pred, target) in enumerate(
                zip(self.train[0], predicted, self.train[1]))
            if (pred > 0.5) != (target > 0.5)
        }
        remaining_failed_samples = failed_samples - self.samples
        print('Remaining failed samples:', len(remaining_failed_samples))
        # islice caps how many new samples enter the pool per cycle.
        return islice(remaining_failed_samples, self.args.num_sample_chunk)

    def run(self):
        """Alternate sample selection and training for args.cycles rounds."""
        print('Writing to:', self.args.samples_file)
        print('Writing metrics to:', self.metrics_fiti.path)
        for _ in range(self.args.cycles):
            print('Calculating on whole dataset...')
            predicted = self.model.predict(self.train[0])
            self.samples.update(self.choose_new_samples(predicted))
            # Persist the pool each cycle so progress survives restarts.
            Fitipy(self.args.samples_file).write().set(self.samples)
            # NOTE(review): may overstate — fewer than num_sample_chunk
            # samples can actually remain to be added.
            print('Added', self.args.num_sample_chunk, 'samples')
            self.write_sampling_metrics(predicted)
            self.model.fit(*self.sampled_data, batch_size=self.args.batch_size,
                           epochs=self.epoch + self.args.epochs,
                           callbacks=self.callbacks,
                           initial_epoch=self.epoch,
                           validation_data=self.test)
def main():
    """Poll a POP3 inbox, tag new emails by topic, and record them as events.

    Fixes over the original:
    - ``yaml.load(f)`` → ``yaml.safe_load(f)`` (no arbitrary YAML tags;
      also required since PyYAML 6 dropped Loader-less ``load``).
    - URL-stripping regex was missing the ``+`` quantifier, so it removed
      only the scheme plus one character instead of the whole URL.
    - The ``email`` credential variable was shadowed by the per-message
      loop variable; renamed for clarity (behavior unchanged).
    """
    args = create_parser(usage).parse_args()
    num_seen_file = Fitipy(args.cache_file + '.num')

    # Cache the generated topic keywords on disk so they are computed once.
    topics_cache = args.cache_file + '.topics.json'
    if not isfile(topics_cache):
        print('Generating topics...')
        with open(topics_cache, 'w') as f:
            json.dump(get_keywords_uiuc(), f)
    with open(topics_cache) as f:
        topics = json.load(f)

    # Number of mailbox messages already processed (0 on first run).
    num_seen = num_seen_file.read().read(0, int)

    with open(args.auth_file) as f:
        auth = yaml.safe_load(f)
    email_addr = auth['username']
    password = auth['password']
    server = auth.get('pop3_host', 'pop3.' + email_addr.split('@')[-1])
    client = StatelessClass(
        EmailReceiver, email=email_addr, password=password, server=server
    )  # type: EmailReceiver

    # Compiled once outside the loop; '+' makes it consume the full URL.
    url_pattern = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')

    print('Waiting for emails...')
    while True:
        num_messages = len(client.get_list())
        if num_messages < num_seen:
            # Mailbox shrank (messages deleted); resync the counter.
            num_seen = num_messages
            num_seen_file.write().write(num_seen)
        if num_messages <= num_seen:
            time.sleep(1)
            continue
        for msg_id in range(num_seen + 1, num_messages + 1):
            msg = client.get_email(msg_id)
            print('Found new email from {} titled {}.'.format(
                msg['From'], msg['Subject']))
            email_txt = '\n'.join(msg['text'])
            # Explicit parser avoids bs4's GuessedAtParserWarning and keeps
            # extraction deterministic across installs.
            email_txt = BeautifulSoup(email_txt, 'html.parser').text
            # Strip URLs so links don't pollute the keyword frequencies.
            email_txt = url_pattern.sub('', email_txt)
            freq = calc_freq(email_txt, topics)
            tags = relevant_topics(freq)
            print('Found the following tags:', ', '.join(tags))
            events = Event.find()
            matched_events = [
                event for event in events
                if event.get('emailSrc') and SequenceMatcher(
                    a=event['emailSrc'], b=email_txt).ratio() > 0.9
            ]
            if matched_events:
                print('Ignoring, similar to {} other emails'.format(
                    len(matched_events)))
            else:
                Event.add({
                    'name': msg['Subject'],
                    'description': email_txt,
                    'location': '',
                    'time': int(time.time()),
                    'tags': tags,
                    'emailSrc': email_txt
                })
            num_seen += 1
            # NOTE(review): extra `str` argument differs from the earlier
            # write(num_seen) call — confirm Fitipy's writer signature
            # before unifying the two.
            num_seen_file.write().write(num_seen, str)