Example #1
import math
import os
import time

import torch

# batchify, evaluate, inference, inject_summary, inject_summary_text,
# _train_epoch, summary_writer and LOG_FOLDER are defined elsewhere in
# the surrounding module.
def train(args, model, corpus, device, criterion):
    # At any point you can hit Ctrl + C to break out of training early.
    lr = args.lr
    best_val_loss = None

    train_data = batchify(corpus.train, args.batch_size).to(device)
    valid_data = batchify(corpus.valid, args.batch_size).to(device)

    if not os.path.exists(LOG_FOLDER):
        print(f'Creating folder {LOG_FOLDER}')
        os.makedirs(LOG_FOLDER)

    try:
        for epoch in range(1, args.epochs + 1):
            epoch_start_time = time.time()
            _train_epoch(args, epoch, model, train_data, corpus, device, lr,
                         criterion)
            val_loss = evaluate(args, valid_data, model, corpus, criterion)
            print('-' * 89)
            print(
                '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch,
                                           (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
            print('-' * 89)
            # Save the model if the validation loss is the best we've seen so far.
            if best_val_loss is None or val_loss < best_val_loss:
                with open(args.save, 'wb') as f:
                    torch.save(model, f)
                best_val_loss = val_loss
            else:
                # Anneal the learning rate if no improvement has been seen in the validation dataset.
                lr /= 4.0

            # Persist a checkpoint after each epoch
            os.makedirs('./models', exist_ok=True)
            with open(f"./models/m_{args.type}_{epoch}.pkl", 'wb') as f:
                torch.save(model, f)

            # Generate a sample text after each epoch
            inference_text = inference(args, model, corpus, device)
            print(f'Generated text: {inference_text}')
            inject_summary_text(summary_writer, f'Inference_{epoch}',
                                inference_text, epoch)

            # Log to tensorboard
            info = {
                'validation/loss/val_loss': val_loss,
                'validation/loss/val_loss_exp': math.exp(val_loss),
            }

            for tag, value in info.items():
                inject_summary(summary_writer, tag, value, epoch)

            summary_writer.flush()

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')
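Example #1 relies on a batchify helper that is not shown here. For language-model
training the usual shape is the one from PyTorch's word_language_model example;
a minimal sketch, assuming the input is a 1-D LongTensor of token ids:

import torch

def batchify(data, bsz):
    # Trim off tokens that would not divide evenly into bsz columns, then
    # lay the sequence out as bsz parallel streams of shape [seq_len, bsz].
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    return data.view(bsz, -1).t().contiguous()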
Example #2
import asyncio
import logging
import traceback

logger = logging.getLogger(__name__)

# batchify (a string-chunking helper) is defined elsewhere in this module.
async def slowtype(client, message):
    args = message.command
    if "arg" not in args:
        return
    logger.info(f"Making text appear slowly")
    interval = 0.5
    batchsize = 1
    if "time" in args:
        interval = float(args["time"])
    if "batch" in args:
        batchsize = int(args["batch"])
    msg = ""
    try:
        for seg in batchify(args["arg"], batchsize):
            msg += seg
            if seg.isspace() or seg == "":
                continue  # important because sending same message twice causes an exception
            # A bare coroutine does not run until awaited; wrap it in a task
            # so the sleep timer starts now, concurrently with the edits.
            t = asyncio.create_task(asyncio.sleep(interval))
            await message.edit(msg)
            await client.send_chat_action(message.chat.id, "typing")
            await t  # wait out whatever remains of the interval
    except Exception:
        traceback.print_exc()  # the message was probably deleted mid-loop
    await client.send_chat_action(message.chat.id, "cancel")
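Examples #2, #3 and #6 use batchify on strings, where it has to mean "split
into fixed-size chunks". A minimal sketch of such a helper, assuming plain
slicing is all it needs:

def batchify(seq, size):
    # Yield consecutive slices of at most `size` items (here, characters).
    for i in range(0, len(seq), size):
        yield seq[i:i + size]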
Example #3
import traceback

# EVENTS (a MongoDB collection), lgr (a logger) and the get_*/parse_* helpers
# are defined elsewhere in the surrounding module.
async def lookup_deleted_messages(client,
                                  message,
                                  target_group,
                                  limit,
                                  show_time=False,
                                  include_system=False,
                                  offset=0):
    response = await edit_or_reply(
        message, f"` → Peeking {limit} message{'s' if limit > 1 else ''} " +
        ('in ' + get_channel(target_group) if target_group is not None else '')
        + "`")
    chat_id = target_group.id if target_group is not None else None
    out = "\n\n"
    count = 0
    LINE = "{time}`[{m_id}]` **{user}** {where} → {system}{text} {media}\n"
    try:
        lgr.debug("Querying db for deletions")
        await client.send_chat_action(message.chat.id, "upload_document")
        cursor = EVENTS.find({"_": "Delete"}).sort("date", -1)
        for deletion in cursor:  # TODO: clean this section up
            if chat_id is not None and "chat" in deletion \
            and deletion["chat"]["id"] != chat_id:
                continue  # don't make a 2nd query, should speed up a ton
            candidates = EVENTS.find({
                "_": "Message",
                "message_id": deletion["message_id"]
            }).sort("date", -1)
            lgr.debug("Querying db for possible deleted msg")
            for doc in candidates:  # we only need the first match (hence the break below)
                if chat_id is not None and doc["chat"]["id"] != chat_id:
                    continue
                if not include_system and "service" in doc and doc["service"]:
                    break  # we don't care about service messages!
                if not include_system and "from_user" in doc and doc[
                        "from_user"]["is_bot"]:
                    break  # we don't care about bot messages!
                if offset > 0:  # We found a message but we don't want it because an offset was set
                    offset -= 1  #   skip adding this to output
                    break
                if limit == 1 and "attached_file" in doc:  # send the file directly when only one message was requested
                    await client.send_document(
                        message.chat.id,
                        "data/scraped_media/" + doc["attached_file"],
                        reply_to_message_id=message.message_id,
                        caption="**" + (get_username_dict(doc['from_user'])
                                        if "from_user" in doc else "UNKNOWN") +
                        "** `→" + (get_channel_dict(doc['chat']) +
                                   ' → ' if chat_id is None else '') +
                        f"` {get_text_dict(doc)['raw']}")
                else:
                    out += LINE.format(
                        time=(str(doc["date"]) + " ") if show_time else "",
                        m_id=doc["message_id"],
                        user=(get_username_dict(doc["from_user"])
                              if "from_user" in doc else "UNKNOWN"),
                        where='' if chat_id is not None else
                        ("| --" + get_channel_dict(doc["chat"]) + '-- '),
                        system=("--" + parse_sys_dict(doc) + "-- " if
                                "service" in doc and doc["service"] else ""),
                        text=get_text_dict(doc)['raw'],
                        media=('' if "attached_file" not in doc else
                               ('(`' + doc["attached_file"] + '`)')))
                count += 1
                break
            if count >= limit:
                break
        if count > 0:
            if len(out) > 4096:
                for m in batchify(out, 4090):
                    await response.reply(m)
            elif out.strip() != "":  # fits in a single message
                await response.edit(response.text.markdown + out)
        else:
            await response.edit(response.text.markdown + "**N/A**")
    except Exception as e:
        traceback.print_exc()
        await response.edit(response.text.markdown + "\n`[!] → ` " + str(e))
    await client.send_chat_action(message.chat.id, "cancel")
    await client.set_offline()
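The lookup above runs one query per deletion event, always filtering on the
event type and sorting by date. Assuming EVENTS is a pymongo collection (the
field names below are taken from the queries above), compound indexes along
these lines would keep both scans fast:

EVENTS.create_index([("_", 1), ("date", -1)])
EVENTS.create_index([("_", 1), ("message_id", 1), ("date", -1)])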
Example #4
import torch

# args, data and util come from the surrounding script.
if args.cuda:
    # torch.cuda.set_device(2)
    torch.cuda.manual_seed(args.seed)

###############################################################################
# Load var data
###############################################################################

# FIXME: Corpus should arguably take just args.data rather than the whole namespace
# corpus = data.Corpus(args.data)
corpus = data.Corpus(args)
print('Train set size = ', len(corpus.train_data), len(corpus.train_label))
print('Test set size = ', len(corpus.test_data), len(corpus.test_label))
print('Vocabulary size = ', len(corpus.dictionary))

# Each split becomes a [seq_len, batch_size] tensor, e.g. [82915, 20] with
# batch_size = 20; the batch dimension acts somewhat like a sequence length
# in the general sense.
train_var_data_trimed, train_var_label_trimed = util.batchify(
    corpus.train_data, corpus.train_label, args.batch_size, args.cuda)
valid_var_data_trimed, valid_var_label_trimed = util.batchify(
    corpus.valid_data, corpus.valid_label, args.batch_size, args.cuda)
test_var_data_trimed, test_var_label_trimed = util.batchify(
    corpus.test_data, corpus.test_label, args.batch_size, args.cuda)

###############################################################################
# Load type data
###############################################################################

# str.rstrip strips a *set of characters*, not a suffix, so build the
# "_type" file names by slicing off the literal '.data' extension instead.
args.train_data = args.train_data[:-len('.data')] + '_type.data'
args.valid_data = args.valid_data[:-len('.data')] + '_type.data'
args.test_data = args.test_data[:-len('.data')] + '_type.data'
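The slicing above replaces the original rstrip calls, which are a common trap:
str.rstrip treats its argument as a character set, not a suffix, so it can eat
more than intended. For example:

>>> "dataset.data".rstrip(".data")  # strips trailing chars from {'.', 'd', 'a', 't'}
'datase'

On Python 3.9+, str.removesuffix('.data') is the cleanest alternative.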
Example #5
import torch

# batchify and calc_stats_and_print are defined elsewhere in this module.
def fit(
    model,
    W,
    y,
    lr,
    lambd,
    num_epochs,
    batch_size,
    check,
    version,
    W_val,
    y_val,
    device,
    y_thresh,
    c_thresh,
):
    """
    Fit model on W (count data), y (targets) w/:
        - lr: initial learning rate
        - lambd: supervised task regularizer weight
        - num_epochs and batch_size
        - version: specifies modeling targets as Normal (real)
                   or Benoulli (binary)
  
    Every check epochs, calc and print topic coherence, val yscore
    (based on W_val, and y_val). If y_thresh and c_thresh specified,
    save model when val yscore or coherence are better than their 
    thresholds. O/w save final model after num_epochs.
    """
    print(f"Training {model.name} on {device}.")

    opt = torch.optim.Adam(model.parameters(), lr=lr)

    for i in range(num_epochs):
        # batch the necessary parts of the data together so they stay aligned
        to_batch = [W, model.phi, model.gamma, y]
        W_b, phi_b, gamma_b, y_b = batchify(to_batch, batch_size)

        tot = 0
        for j in range(len(W_b)):
            opt.zero_grad()
            elbo = model.ELBO(
                W_b[j].to(device),
                phi_b[j],
                gamma_b[j],
                y_b[j].to(device),
                version=version,
            )
            tot += elbo.item()
            loss = -1 * elbo + lambd * (model.eta**2).sum()
            loss.backward()
            opt.step()

        if i % check == 0:
            val_yscore, c = calc_stats_and_print(model, W, W_val.to(device),
                                                 y_val.to(device),
                                                 tot / W.sum(), i, version)

            if (y_thresh and val_yscore < y_thresh) \
                    or (c_thresh and c > c_thresh):
                path = f"models/{model.name}_y{val_yscore:.2f}_c{c:.2f}.pt"
                torch.save(model.state_dict(), path)

    # save last model if no thresholds
    if not y_thresh and not c_thresh:
        val_yscore, c = calc_stats_and_print(
            model,
            W,
            W_val.to(device),
            y_val.to(device),
            tot / W.sum(),
            num_epochs,
            version,
        )
        path = f"models/{model.name}_ y{val_yscore:.2f}_c{c:.2f}.pt"
        torch.save(model.state_dict(), path)

    return
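A hypothetical invocation, assuming model exposes the phi, gamma and eta
attributes and the ELBO method used above; every value here is illustrative:

fit(model, W, y,
    lr=1e-3, lambd=0.1, num_epochs=100, batch_size=64,
    check=10, version="Normal",
    W_val=W_val, y_val=y_val, device="cuda",
    y_thresh=None, c_thresh=None)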
Example #6
import asyncio
import html
import os
import re

# edit_or_reply, batchify and ProgressChatAction come from the bot framework.
async def pasta_cmd(client, message):
	"""drop a copypasta

	Give copypasta name or path to any file containing long text and bot will drop it in chat.
	List all saved copypastas with `-list` flag.
	Use flag `-stop` to stop ongoing pasta.
	A separator can be specified with `-s` to split the copypasta (for example, at newlines `\\n`).
	Long messages will still be split into chunks of 4096 characters due to Telegram's limit.
	Messages will be sent at an interval of 1 second by default. A different interval can be specified with `-i`.
	Add flag `-mono` to print pasta monospaced.
	Add flag `-edit` to always edit the first message instead of sending new ones.
	Reply to a message while invoking this command to have all pasta chunks reply to that message.
	"""
	if message.command["-stop"]:
		client.ctx.INTERRUPT_PASTA = True
		return
	if message.command["-list"]:
		return await edit_or_reply(message,\
			"\n".join(f"` → ` {pasta}" for pasta in os.listdir("plugins/alemibot-tricks/data/pasta/"))
		)
	if len(message.command) < 1:
		return await edit_or_reply(message, "`[!] → ` No input")
	repl_id = None
	if message.reply_to_message:
		repl_id = message.reply_to_message.message_id
	sep = message.command["separator"]
	intrv = float(message.command["interval"] or 1.0)
	monospace = bool(message.command["-mono"])
	edit_this = await client.send_message(message.chat.id, "` → ` Starting", reply_to_message_id=repl_id) \
			if bool(message.command["-edit"]) else None
	p_mode = 'html' if monospace else None
	# Find correct path
	path = message.command[0]
	try:
		pattern = re.compile(message.command[0])
		for pasta in os.listdir("plugins/alemibot-tricks/data/pasta"):
			if pattern.match(pasta):
				path = f"plugins/alemibot-tricks/data/pasta/{pasta}"
				break
	except re.error:
		pass
	# load the text as a single-element list so the section loop below works unchanged
	with open(path, "rb") as f:
		text = [ f.read().decode('utf-8', 'ignore') ]
	# apply separator if requested
	if sep:
		text = re.split(sep, text[0])
	with ProgressChatAction(client, message.chat.id, action="typing") as prog:
		for section in text:
			for chunk in batchify(section, 4096):
				if len(chunk.strip()) < 1:
					continue
				if monospace:
					chunk = "<code>" + html.escape(chunk) + "</code>"
				if edit_this:
					await edit_this.edit(chunk, parse_mode=p_mode)
				else:
					await client.send_message(message.chat.id, chunk, parse_mode=p_mode, reply_to_message_id=repl_id)
				await asyncio.sleep(intrv)
				if client.ctx.INTERRUPT_PASTA:
					client.ctx.INTERRUPT_PASTA = False
					raise Exception("Interrupted by user")
		if edit_this:
			await edit_this.edit("` → ` Done")