def train(args, model, corpus, device, criterion):
    # At any point you can hit Ctrl + C to break out of training early.
    lr = args.lr
    best_val_loss = None

    train_data = batchify(corpus.train, args.batch_size).to(device)
    valid_data = batchify(corpus.valid, args.batch_size).to(device)

    if not os.path.exists(LOG_FOLDER):
        print(f'Creating folder {LOG_FOLDER}')
        os.makedirs(LOG_FOLDER)

    try:
        for epoch in range(1, args.epochs + 1):
            epoch_start_time = time.time()
            _train_epoch(args, epoch, model, train_data, corpus, device, lr, criterion)
            val_loss = evaluate(args, valid_data, model, corpus, criterion)
            print('-' * 89)
            print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                  'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                             val_loss, math.exp(val_loss)))
            print('-' * 89)

            # Save the model if the validation loss is the best we've seen so far.
            if not best_val_loss or val_loss < best_val_loss:
                with open(args.save, 'wb') as f:
                    torch.save(model, f)
                best_val_loss = val_loss
            else:
                # Anneal the learning rate if no improvement was seen on the validation set.
                lr /= 4.0

            # Persist a checkpoint after each epoch
            with open(f"./models/m_{args.type}_{epoch}.pkl", 'wb') as f:
                torch.save(model, f)

            # Generate a sample text after each epoch
            inference_text = inference(args, model, corpus, device)
            print(f'Generated text: {inference_text}')
            inject_summary_text(summary_writer, f'Inference_{epoch}', inference_text, epoch)

            # Log to TensorBoard
            info = {
                'validation/loss/val_loss': val_loss,
                'validation/loss/val_loss_exp': math.exp(val_loss),
            }
            for tag, value in info.items():
                inject_summary(summary_writer, tag, value, epoch)
            summary_writer.flush()
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')
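# train() relies on a batchify helper that is not shown here. In the standard
# PyTorch word-language-model recipe, batchify trims the flat token tensor so it
# divides evenly into `bsz` columns and reshapes it so each column is a
# contiguous stream of text. The sketch below assumes that behavior and is
# illustrative, not this project's actual implementation.
import torch

def batchify(data, bsz):
    # Drop tokens that don't fit into an even number of columns, then arrange
    # the stream so each of the bsz columns is a contiguous slice of text.
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    return data.view(bsz, -1).t().contiguous()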
async def slowtype(client, message):
    args = message.command
    if "arg" not in args:
        return
    logger.info("Making text appear slowly")
    interval = 0.5
    batchsize = 1
    if "time" in args:
        interval = float(args["time"])
    if "batch" in args:
        batchsize = int(args["batch"])
    msg = ""
    try:
        for seg in batchify(args["arg"], batchsize):
            msg += seg
            if seg.isspace() or seg == "":
                continue  # editing a message with unchanged visible text raises an exception
            t = asyncio.sleep(interval)  # this only creates the coroutine, the delay does not start yet
            await message.edit(msg)
            await client.send_chat_action(message.chat.id, "typing")
            await t  # the delay only runs here, once it is awaited
    except Exception:
        traceback.print_exc()  # the message was probably deleted
    await client.send_chat_action(message.chat.id, "cancel")
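# As noted in the comments above, asyncio.sleep(interval) only creates a
# coroutine; nothing runs until it is awaited, so the edit and the delay do not
# overlap in the pattern above. To actually overlap them, the sleep has to be
# wrapped in a task first. A minimal standalone sketch of that pattern
# (fake_edit is a hypothetical stand-in for message.edit):
import asyncio

async def fake_edit():
    await asyncio.sleep(0.1)  # placeholder for the message-edit round trip

async def overlap_delay():
    delay = asyncio.create_task(asyncio.sleep(0.5))  # scheduled immediately
    await fake_edit()                                # runs while the delay ticks
    await delay                                      # waits only for the remainder of the 0.5 s

asyncio.run(overlap_delay())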
async def lookup_deleted_messages(client, message, target_group, limit,
                                  show_time=False, include_system=False, offset=0):
    response = await edit_or_reply(
        message,
        f"` → Peeking {limit} message{'s' if limit > 1 else ''} "
        + ('in ' + get_channel(target_group) if target_group is not None else '')
        + "`")
    chat_id = target_group.id if target_group is not None else None
    out = "\n\n"
    count = 0
    LINE = "{time}`[{m_id}]` **{user}** {where} → {system}{text} {media}\n"
    try:
        lgr.debug("Querying db for deletions")
        await client.send_chat_action(message.chat.id, "upload_document")
        cursor = EVENTS.find({"_": "Delete"}).sort("date", -1)
        for deletion in cursor:  # TODO: untangle this nested lookup
            if chat_id is not None and "chat" in deletion \
                    and deletion["chat"]["id"] != chat_id:
                continue  # skip without a 2nd query, which speeds this up a lot
            candidates = EVENTS.find({
                "_": "Message",
                "message_id": deletion["message_id"]
            }).sort("date", -1)
            lgr.debug("Querying db for possible deleted msg")
            for doc in candidates:  # only the first usable candidate is kept
                if chat_id is not None and doc["chat"]["id"] != chat_id:
                    continue
                if not include_system and "service" in doc and doc["service"]:
                    break  # ignore service messages
                if not include_system and "from_user" in doc and doc["from_user"]["is_bot"]:
                    break  # ignore bot messages
                if offset > 0:
                    # A message was found but an offset was requested: skip it
                    offset -= 1
                    break
                if limit == 1 and "attached_file" in doc:
                    # With a single requested message, send the attached media directly
                    await client.send_document(
                        message.chat.id,
                        "data/scraped_media/" + doc["attached_file"],
                        reply_to_message_id=message.message_id,
                        caption="**"
                        + (get_username_dict(doc['from_user']) if "from_user" in doc else "UNKNOWN")
                        + "** `→"
                        + (get_channel_dict(doc['chat']) + ' → ' if chat_id is None else '')
                        + f"` {get_text_dict(doc)['raw']}")
                else:
                    out += LINE.format(
                        time=(str(doc["date"]) + " ") if show_time else "",
                        m_id=doc["message_id"],
                        user=(get_username_dict(doc["from_user"])
                              if "from_user" in doc else "UNKNOWN"),
                        where='' if chat_id is not None
                        else ("| --" + get_channel_dict(doc["chat"]) + '-- '),
                        system=("--" + parse_sys_dict(doc) + "-- "
                                if "service" in doc and doc["service"] else ""),
                        text=get_text_dict(doc)['raw'],
                        media=('' if "attached_file" not in doc
                               else ('(`' + doc["attached_file"] + '`)')))
                count += 1
                break
            if count >= limit:
                break
        if count > 0:
            if len(out) > 4096:  # stay under the Telegram message length limit
                for m in batchify(out, 4090):
                    await response.reply(m)
            elif out.strip() != "":  # This is bad!
                await response.edit(response.text.markdown + out)
        else:
            await response.edit(response.text.markdown + "**N/A**")
    except Exception as e:
        traceback.print_exc()
        await response.edit(response.text.markdown + "\n`[!] → ` " + str(e))
    await client.send_chat_action(message.chat.id, "cancel")
    await client.set_offline()
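# The inner loop above only ever keeps the first candidate that passes the chat
# filter (every other branch breaks out), so the same lookup could be expressed
# as a single pymongo find_one with a sort, with the service/bot checks applied
# to the returned document or folded into the query filter. A sketch of that
# simplification, assuming EVENTS is a pymongo collection (helper name is
# hypothetical):
def newest_matching_event(events, message_id):
    # Return the most recent "Message" event recorded for this message id, or None.
    return events.find_one(
        {"_": "Message", "message_id": message_id},
        sort=[("date", -1)],
    )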
# torch.cuda.set_device(2)
torch.cuda.manual_seed(args.seed)

###############################################################################
# Load var data
###############################################################################
# TODO: fix this
# corpus = data.Corpus(args.data)
corpus = data.Corpus(args)
print('Train set size = ', len(corpus.train_data), len(corpus.train_label))
print('Test set size = ', len(corpus.test_data), len(corpus.test_label))
print('Vocabulary size = ', len(corpus.dictionary))

# e.g. [82915, 20] with batch size 20; the batch dimension effectively acts as
# the sequence length in a general sense.
train_var_data_trimed, train_var_label_trimed = util.batchify(
    corpus.train_data, corpus.train_label, args.batch_size, args.cuda)
valid_var_data_trimed, valid_var_label_trimed = util.batchify(
    corpus.valid_data, corpus.valid_label, args.batch_size, args.cuda)
test_var_data_trimed, test_var_label_trimed = util.batchify(
    corpus.test_data, corpus.test_label, args.batch_size, args.cuda)

###############################################################################
# Load type data
###############################################################################
args.train_data = args.train_data.rstrip('.data') + '_type.data'
args.valid_data = args.valid_data.rstrip('test.data') + 'test_type.data'
args.test_data = args.test_data.rstrip('test.data') + 'test_type.data'
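# A caveat on the rstrip calls above: str.rstrip('.data') strips any trailing
# run of the characters '.', 'd', 'a', 't', not the literal suffix, so the
# rewrite only behaves as intended while the remaining stem does not end in one
# of the stripped characters. A hypothetical helper (name is illustrative, not
# part of the original code) that swaps a literal suffix instead:
def replace_suffix(path, old_suffix, new_suffix):
    # Replace a literal trailing suffix rather than stripping a character set.
    if path.endswith(old_suffix):
        return path[:-len(old_suffix)] + new_suffix
    return path + new_suffix

# e.g. args.train_data = replace_suffix(args.train_data, '.data', '_type.data')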
def fit(
    model,
    W,
    y,
    lr,
    lambd,
    num_epochs,
    batch_size,
    check,
    version,
    W_val,
    y_val,
    device,
    y_thresh,
    c_thresh,
):
    """Fit model on W (count data) and y (targets) with:

    - lr: initial learning rate
    - lambd: supervised-task regularizer weight
    - num_epochs and batch_size
    - version: models targets as Normal (real-valued) or Bernoulli (binary)

    Every `check` epochs, compute and print topic coherence and the validation
    y-score (based on W_val and y_val). If y_thresh and c_thresh are specified,
    save the model whenever the validation y-score or coherence beats its
    threshold. Otherwise save the final model after num_epochs.
    """
    print(f"Training {model.name} on {device}.")
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    for i in range(num_epochs):
        # batch the parts of the data needed for the ELBO
        to_batch = [W, model.phi, model.gamma, y]
        batches = batchify(to_batch, batch_size)
        W_b, phi_b, gamma_b, y_b = batches[0], batches[1], batches[2], batches[3]
        tot = 0
        for j in range(len(W_b)):
            opt.zero_grad()
            elbo = model.ELBO(
                W_b[j].to(device),
                phi_b[j],
                gamma_b[j],
                y_b[j].to(device),
                version=version,
            )
            tot += elbo.item()
            loss = -1 * elbo + lambd * (model.eta ** 2).sum()
            loss.backward()
            opt.step()
        if i % check == 0:
            val_yscore, c = calc_stats_and_print(model, W, W_val.to(device),
                                                 y_val.to(device), tot / W.sum(),
                                                 i, version)
            save = False
            if (y_thresh and val_yscore < y_thresh) or (c_thresh and c > c_thresh):
                save = True
            if save:
                path = f"models/{model.name}_y{val_yscore:.2f}_c{c:.2f}.pt"
                torch.save(model.state_dict(), path)
    # save the last model if no thresholds were given
    if not y_thresh and not c_thresh:
        val_yscore, c = calc_stats_and_print(
            model,
            W,
            W_val.to(device),
            y_val.to(device),
            tot / W.sum(),
            num_epochs,
            version,
        )
        path = f"models/{model.name}_y{val_yscore:.2f}_c{c:.2f}.pt"
        torch.save(model.state_dict(), path)
    return
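# fit() unpacks batchify(to_batch, batch_size) as batches[0..3], one list of
# chunks per input. The helper itself is not shown; the sketch below assumes it
# slices each input into aligned row-chunks of batch_size (hypothetical name,
# not the project's actual implementation):
def batchify_aligned(arrays, batch_size):
    # batches[k][j] is the j-th chunk of the k-th input, all cut on the same rows
    n = len(arrays[0])
    return [[a[i:i + batch_size] for i in range(0, n, batch_size)] for a in arrays]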
async def pasta_cmd(client, message):
    """drop a copypasta

    Give a copypasta name or a path to any file containing long text and the
    bot will drop it in chat. List all saved copypastas with the `-list` flag.
    Use the `-stop` flag to stop an ongoing pasta. A separator can be specified
    with `-s` to split the copypasta (for example, at newlines `\\n`). Long
    messages will still be split into chunks of 4096 characters due to the
    Telegram limit. Messages are sent at an interval of 1 second by default; a
    different interval can be specified with `-i`. Add the `-mono` flag to print
    the pasta monospaced. Add the `-edit` flag to always edit the first message
    instead of sending new ones. Reply to a message while invoking this command
    to have all pasta chunks reply to that message.
    """
    if message.command["-stop"]:
        client.ctx.INTERRUPT_PASTA = True
        return
    if message.command["-list"]:
        return await edit_or_reply(message, "\n".join(
            f"` → ` {pasta}" for pasta in os.listdir("plugins/alemibot-tricks/data/pasta/")))
    if len(message.command) < 1:
        return await edit_or_reply(message, "`[!] → ` No input")
    repl_id = None
    if message.reply_to_message:
        repl_id = message.reply_to_message.message_id
    sep = message.command["separator"]
    intrv = float(message.command["interval"] or 1.0)
    monospace = bool(message.command["-mono"])
    edit_this = await client.send_message(message.chat.id, "` → ` Starting",
                                          reply_to_message_id=repl_id) \
        if bool(message.command["-edit"]) else None
    p_mode = 'html' if monospace else None
    # Find the correct path: try the argument as a regex against saved pastas,
    # otherwise fall back to using it as a literal path
    path = message.command[0]
    try:
        pattern = re.compile(message.command[0])
        for pasta in os.listdir("plugins/alemibot-tricks/data/pasta"):
            if pattern.match(pasta):
                path = f"plugins/alemibot-tricks/data/pasta/{pasta}"
                break
    except re.error:
        pass
    # load the text, wrapped in a list so it's iterable either way
    with open(path, "rb") as f:
        text = [f.read().decode('utf-8', 'ignore')]
    # apply the separator if requested
    if sep:
        text = re.split(sep, text[0])
    with ProgressChatAction(client, message.chat.id, action="typing") as prog:
        for section in text:
            for chunk in batchify(section, 4096):
                if len(chunk.strip()) < 1:
                    continue
                if monospace:
                    chunk = "<code>" + html.escape(chunk) + "</code>"
                if edit_this:
                    await edit_this.edit(chunk, parse_mode=p_mode)
                else:
                    await client.send_message(message.chat.id, chunk,
                                              parse_mode=p_mode,
                                              reply_to_message_id=repl_id)
                await asyncio.sleep(intrv)
                if client.ctx.INTERRUPT_PASTA:
                    client.ctx.INTERRUPT_PASTA = False
                    raise Exception("Interrupted by user")
    if edit_this:
        await edit_this.edit("` → ` Done")
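# pasta_cmd leans on batchify to respect Telegram's 4096-character message
# limit. Assuming it is a plain fixed-size chunker (an assumption, not the
# library's actual code), its behavior would look roughly like this:
def chunk(sequence, size):
    # Yield consecutive slices of at most `size` characters/elements.
    for start in range(0, len(sequence), size):
        yield sequence[start:start + size]

# e.g. for piece in chunk(section, 4096): send(piece)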