async def on_message_delete(self, msg):
    """Log each deleted message.

    The log message provides information about the author, content
    and date of the message.
    """
    log_msg = (f"{msg.author} has deleted their message: "
               f"{msg.content!r} sent at {msg.created_at}")
    logger.info(log_msg)
def tag_mapping(sentences):
    """Build the tag <-> id dictionaries, sorted by descending frequency.

    Since a CRF is used, <start> and <end> would normally be added around
    the tags, but the torchcrf package handles this automatically, so those
    two markers do not need to be added to the dictionary here.
    """
    tags = [[x[-1] for x in s] for s in sentences]
    dico = create_dico(tags)
    dico["<pad>"] = 100000002
    # To keep the my_pytorch_crf model unchanged, add START and END.
    dico["<start>"] = 100000003
    dico["<stop>"] = 100000004
    tag_to_id, id_to_tag = create_mapping(dico)
    logger.info("Found %i unique named entity tags" % len(dico))
    with open('data/tag_to_id.txt', 'w', encoding='utf8') as f:
        for k, v in tag_to_id.items():
            f.write(k + ":" + str(v) + "\n")
    with open('data/id_to_tag.txt', 'w', encoding='utf8') as f1:
        for k, v in id_to_tag.items():
            f1.write(str(k) + ":" + str(v) + "\n")
    return dico, tag_to_id, id_to_tag
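tag_mapping above and the char_mapping functions below rely on two helpers, create_dico and create_mapping, that are not shown in this section. A minimal sketch of their assumed behavior (frequency counting, then ids assigned in descending-frequency order, which is why the special markers get the huge counts):

def create_dico(item_lists):
    """Count how often each item appears across a list of item lists."""
    dico = {}
    for items in item_lists:
        for item in items:
            dico[item] = dico.get(item, 0) + 1
    return dico

def create_mapping(dico):
    """Assign ids by descending frequency (ties broken alphabetically)."""
    sorted_items = sorted(dico.items(), key=lambda x: (-x[1], x[0]))
    id_to_item = {i: item for i, (item, _) in enumerate(sorted_items)}
    item_to_id = {item: i for i, item in id_to_item.items()}
    return item_to_id, id_to_item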
async def login(request: Request) -> json_response:
    try:
        user = await request.json()
        if not 8 <= len(user['email']) <= 20:
            return failure_response(400, 'Invalid email length')
        if not 8 <= len(user['password']) <= 20:
            return failure_response(400, 'Invalid password length')
        pool = request.app['pool']
        async with pool.acquire() as conn:
            # Parameterized query avoids SQL injection via the email field.
            users = await conn.fetch(
                'SELECT * FROM users WHERE email=$1', user['email'])
            if len(users) == 0:
                return failure_response(401, 'Invalid email or password')
            if users[0]['password'] != encode(
                    str(user['password']).encode('utf-8')):
                return failure_response(401, 'Invalid email or password')
            token = get_token({
                'user': user_record_to_json(users[0]),
                'key': users[0]['key']
            })
            logger.info('User {} login at site'.format(user['email']))
            return success_response(200, 'ok', token=token)
    except Exception as e:
        return server_error_response(e)
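The encode helper used for password and transfer hashing is not shown in this section. A minimal sketch of one plausible implementation, assuming a hex-encoded SHA-256 digest (the real helper may differ):

import hashlib

def encode(data: bytes) -> str:
    """Hypothetical stand-in: hex SHA-256 digest of the input bytes."""
    return hashlib.sha256(data).hexdigest()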
async def on_command(self, msg):
    """Log each command submitted.

    The log message provides information about the name, author and
    arguments of the command.
    """
    args = msg.args[2:]
    args_info = ', '.join(repr(arg) for arg in args) if args else ""
    log_msg = (f"{msg.command.name} called by {msg.author} with "
               f"args {args_info}.")
    logger.info(log_msg)
def char_mapping(sentences, lower):
    """Build the character <-> id dictionaries, sorted by descending frequency."""
    chars = [[x[0].lower() if lower else x[0] for x in s] for s in sentences]
    dico = create_dico(chars)
    dico["<pad>"] = 100000003
    dico['<unk>'] = 100000002
    char_to_id, id_to_char = create_mapping(dico)
    logger.info("Found %i unique words (%i in total)" % (
        len(dico), sum(len(x) for x in chars)))
    return dico, char_to_id, id_to_char
def evaluate(model, data, id_to_tag, test=False):
    """Return the predicted tags (not ids) and the loss."""
    ner_results, aver_loss = evaluate_helper(model, data, id_to_tag)
    # Compute F1 with the CoNLL-2000 entity recognition evaluation script.
    eval_lines = test_ner(ner_results, config.save_dir)
    if test:
        # On the test set, print the full evaluation report.
        for line in eval_lines:
            logger.info(line)
    f1 = float(eval_lines[1].strip().split()[-1]) / 100
    return f1, aver_loss
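The F1 extraction above leans on the layout of the conlleval report: its second line ends with the overall FB1 score, so splitting on whitespace and taking the last field recovers it. A short illustration (the numbers are made up):

# Typical second line of a conlleval report (values illustrative):
line = "accuracy:  98.40%; precision:  85.10%; recall:  82.00%; FB1:  83.52"
f1 = float(line.strip().split()[-1]) / 100  # -> 0.8352, the overall FB1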
async def check(request: Request) -> json_response:
    try:
        pool = request.app['pool']
        async with pool.acquire() as conn:
            tr = await conn.fetch('SELECT * FROM transfers')
            for i in range(1, len(tr)):
                # Each transfer stores the hash of the previous record,
                # so the chain can be verified pairwise.
                if tr[i]['transfer_hash'] == encode(
                        transfer_record_to_json_string(tr[i - 1]).encode('utf-8')):
                    logger.info(str(tr[i - 1]['id']) + ' : OK')
                else:
                    logger.info(str(tr[i - 1]['id']) + ' : WRONG')
            return success_response(200, 'ok')
    except Exception as e:
        return server_error_response(e)
def char_mapping(sentences, lower):
    """Build the character <-> id dictionaries, sorted by descending frequency.

    Since a CRF is used, <start> and <end> are added around each sentence,
    so those two markers are also added to the dictionary.
    """
    chars = [[x[0].lower() if lower else x[0] for x in s] for s in sentences]
    dico = create_dico(chars)
    dico["<pad>"] = 100000003
    dico['<unk>'] = 100000002
    dico["<start>"] = 100000001
    dico["<end>"] = 100000000
    char_to_id, id_to_char = create_mapping(dico)
    logger.info("Found %i unique words (%i in total)" % (
        len(dico), sum(len(x) for x in chars)))
    return dico, char_to_id, id_to_char
def augment_with_pretrained(dictionary, ext_emb_path):
    """Extend the dictionary with characters that appear in the pretrained
    embeddings but not in the training-set dictionary.
    """
    logger.info('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)
    # Load the pretrained character embeddings, skipping empty lines.
    pretrained = set([
        line.rstrip().split()[0].strip()
        for line in codecs.open(ext_emb_path, 'r', 'utf-8')
        if len(line.rstrip()) > 0
    ])
    for char in pretrained:
        if char not in dictionary:
            dictionary[char] = 0
    char_to_id, id_to_char = create_mapping(dictionary)
    return dictionary, char_to_id, id_to_char
async def register(request: Request) -> json_response:
    try:
        user = await request.json()
        if not 8 <= len(user['name']) <= 20:
            return failure_response(400, 'Invalid name length')
        if not 8 <= len(user['email']) <= 20:
            return failure_response(400, 'Invalid email length')
        if not 8 <= len(user['password']) <= 20:
            return failure_response(400, 'Invalid password length')
        if not 3 <= len(user['country']) <= 15:
            return failure_response(400, 'Invalid country')
        if user['age'] is None or not 6 <= int(user['age']) <= 65:
            return failure_response(400, 'Invalid age')
        pool = request.app['pool']
        async with pool.acquire() as conn:
            async with conn.transaction():
                usr = await conn.fetch(
                    'SELECT * FROM users WHERE email=$1', user['email'])
                if len(usr) != 0:
                    return failure_response(200, 'This login is already taken')
                password_hash = encode(str(user['password']).encode('utf-8'))
                u = User(name=user['name'],
                         email=user['email'],
                         password=password_hash,
                         age=user['age'],
                         country=user['country'])
                # Parameterized insert avoids SQL injection.
                await conn.fetch(
                    '''INSERT INTO users
                       (name, email, password, age, country, balance, key)
                       VALUES ($1, $2, $3, $4, $5, $6, $7)''',
                    u.name, u.email, u.password, u.age, u.country,
                    u.balance, u.key)
                token = get_token({'user': u.to_json(), 'key': u.key})
                logger.info(
                    'Registered new User(name={}, email={}, age={}, country={})'
                    .format(u.name, u.email, u.age, u.country))
                return success_response(200, 'ok', token=token)
    except Exception as e:
        return server_error_response(e)
async def delete_user(request: Request) -> json_response:
    try:
        data = await request.json()
        if data['key'] is None or len(data['key']) < 10:
            return failure_response(401, 'Authorize please')
        pool = request.app['pool']
        async with pool.acquire() as conn:
            async with conn.transaction():
                users = await conn.fetch(
                    'SELECT * FROM users WHERE key=$1', data['key'])
                if len(users) == 0:
                    return failure_response(401, 'Authorize please')
                # Parameterized statements, one statement per call.
                await conn.execute(
                    'DELETE FROM users WHERE email=$1', users[0]['email'])
                await conn.execute(
                    'DELETE FROM transfers WHERE master=$1', users[0]['email'])
                logger.info('User {} was deleted (key={})'.format(
                    users[0]['email'], data['key']))
                return success_response(200, 'ok')
    except Exception as e:
        return server_error_response(e)
async def check_csgo_rss(channel):
    global title_in_memory
    await asyncio.sleep(120)
    logger.info("Checking CSGO blog for update...")
    latest_title, update_text = get_latest_update()
    logger.info(latest_title)
    if title_in_memory != latest_title:
        logger.info(
            'New CSGO update found titled "%s". Sending message to %s...',
            latest_title, channel)
        await send_message(update_text, channel)
        write_latest_title(latest_title)
        title_in_memory = latest_title
    else:
        logger.info("No new update found.")
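get_latest_update and write_latest_title are defined elsewhere. A minimal sketch of what the fetch side might look like, assuming the blog exposes a standard RSS feed and using feedparser (the feed URL and field names are assumptions, not the project's actual code):

import feedparser

CSGO_FEED_URL = "https://blog.counter-strike.net/index.php/feed/"  # assumed URL

def get_latest_update():
    """Return (title, summary) of the newest entry in the assumed RSS feed."""
    feed = feedparser.parse(CSGO_FEED_URL)
    latest = feed.entries[0]
    return latest.title, latest.summary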
def train():
    # 1: Load the dataset; convert samples and labels to ids.
    if os.path.isfile(config.data_proc_file):
        with open(config.data_proc_file, "rb") as f:
            train_data, dev_data, test_data = pickle.load(f)
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
            emb_matrix = pickle.load(f)
        logger.info("%i / %i / %i sentences in train / dev / test." % (
            len(train_data), len(dev_data), len(test_data)))
    else:
        (train_data, dev_data, test_data, char_to_id, id_to_char,
         tag_to_id, id_to_tag, emb_matrix) = build_dataset()

    # 2: Produce batched training data.
    train_manager = BatchManager(train_data, config.batch_size)
    dev_manager = BatchManager(dev_data, config.batch_size)
    test_manager = BatchManager(test_data, config.batch_size)

    model = NERLSTM_CRF(config, char_to_id, tag_to_id, emb_matrix)
    model.train()
    model.to(device)
    optimizer = optim.Adam(model.parameters(),
                           lr=config.lr,
                           weight_decay=config.weight_decay)

    # 3: Use early stopping to prevent overfitting.
    total_batch = 0
    dev_best_f1 = float('-inf')
    last_improve = 0
    flag = False
    start_time = time.time()
    logger.info(" Start training the model ...... ")
    for epoch in range(config.max_epoch):
        logger.info('Epoch [{}/{}]'.format(epoch + 1, config.max_epoch))
        for index, batch in enumerate(train_manager.iter_batch(shuffle=True)):
            optimizer.zero_grad()
            # Compute the loss and backpropagate.
            _, char_ids, seg_ids, tag_ids, mask = batch
            loss = model.log_likelihood(char_ids, seg_ids, tag_ids, mask)
            loss.backward()
            # Gradient clipping, maximum norm 5.
            nn.utils.clip_grad_norm_(parameters=model.parameters(),
                                     max_norm=config.clip)
            optimizer.step()

            if total_batch % config.steps_check == 0:
                model.eval()
                dev_f1, dev_loss = evaluate(model, dev_manager, id_to_tag)
                # Use F1 on the dev set as the early-stopping metric.
                if dev_f1 > dev_best_f1:
                    evaluate(model, test_manager, id_to_tag, test=True)
                    dev_best_f1 = dev_f1
                    # Save the state dict so it can be restored with
                    # load_state_dict at prediction time.
                    torch.save(model.state_dict(),
                               os.path.join(config.save_dir, "medical_ner.ckpt"))
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {} | Dev Loss: {:.4f} | Dev F1-macro: {:.4f} | Time: {} | {}'
                logger.info(msg.format(total_batch, dev_loss, dev_f1,
                                       time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improve:
                # Stop if dev F1 has not improved for over 5000 batches.
                logger.info("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
def toAbspath(file):
    if os.path.isabs(file):
        return file
    else:
        return os.path.join(os.path.dirname(os.path.abspath(__file__)), file)

if __name__ == '__main__':
    if len(sys.argv) > 1:
        cfgFile = sys.argv[1]
    else:
        cfgFile = 'merge.json'
    path = toAbspath(cfgFile)
    if not os.path.isfile(path):
        logger.error('%s is not a file' % path)
        sys.exit(1)
    with open(path, 'r', encoding='utf-8') as f:
        cfg = json.load(f)
    logger.info('merge excel')
    xls = ExcelOps(toAbspath(cfg['dstFile']),
                   sheetNameOrIndex=cfg.get('dtsFileSheet', 0),
                   keyName=cfg['keyName'],
                   startCell=cfg.get('dtsFileStartCell', 'A1'))
    print(xls.dataDict)
    xls.merge(toAbspath(cfg['srcFile']),
              sheetNameOrIndex=cfg.get('srcFileSheet', 0),
              startCell=cfg.get('srcFileStartCell', 'A1'),
              forceOverWriteCols=cfg.get('forceOverWriteCols'))
    print(xls.dataDict)
    xls.flush()
async def new_transfer(request: Request) -> json_response:
    try:
        transfer = await request.json()
        if len(transfer['master']) == 0:
            return failure_response(403, 'Authorize please')
        if not 8 <= len(transfer['whom']) <= 20:
            return failure_response(400, 'Invalid length of destination email')
        if transfer['amount'] is None or int(transfer['amount']) < 0:
            return failure_response(400, 'Invalid amount')
        pool = request.app['pool']
        async with pool.acquire() as conn:
            async with conn.transaction():
                users = await conn.fetch(
                    'SELECT * FROM users WHERE key=$1', transfer['master'])
                if len(users) == 0:
                    return failure_response(403, 'Authorize please')
                if users[0]['email'] == transfer['whom']:
                    return failure_response(
                        400, 'Cannot transfer to your own account')
                if int(users[0]['balance']) <= 0:
                    return failure_response(
                        402, 'There is no money in your account')
                if users[0]['balance'] < int(transfer['amount']):
                    return failure_response(
                        400, 'There is not enough money in your account')
                u_dests = await conn.fetch(
                    'SELECT * FROM users WHERE email=$1', transfer['whom'])
                if len(u_dests) == 0:
                    return failure_response(400, 'No such destination email')
                dest = u_dests[0]
                # Fetch only the most recent transfer; its JSON is hashed
                # into the new record to extend the chain.
                last = await conn.fetchrow(
                    'SELECT * FROM transfers ORDER BY id DESC LIMIT 1')
                t = Transfer(master=transfer['master'],
                             amount=transfer['amount'],
                             whom=transfer['whom'],
                             prev=transfer_record_to_json_string(last))
                await conn.execute(
                    '''INSERT INTO transfers
                       (transfer_hash, master, whom, amount, time)
                       VALUES ($1, $2, $3, $4, $5)''',
                    t.transfer_hash, users[0]['email'], t.whom,
                    t.amount, t.time)
                await conn.execute(
                    'UPDATE users SET balance=$1 WHERE email=$2',
                    users[0]['balance'] - int(transfer['amount']),
                    users[0]['email'])
                await conn.execute(
                    'UPDATE users SET balance=$1 WHERE email=$2',
                    dest['balance'] + int(transfer['amount']),
                    dest['email'])
                logger.info('New Transfer: {} sent {} to {}'.format(
                    users[0]['email'], transfer['amount'], transfer['whom']))
                return success_response(200, 'ok')
    except Exception as e:
        return server_error_response(e)
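check() verifies that each row's transfer_hash equals the hash of the previous row's JSON. The Transfer class itself is defined elsewhere; a minimal sketch of the assumed chaining, reusing the hypothetical encode helper sketched earlier (constructor and time format are assumptions):

import time as time_module

class Transfer:
    """Hypothetical sketch: each transfer hashes the previous record's JSON."""
    def __init__(self, master, amount, whom, prev):
        self.master = master
        self.amount = amount
        self.whom = whom
        self.time = time_module.strftime('%Y-%m-%d %H:%M:%S')
        # The chain property that check() verifies: the hash of the
        # previous record's JSON is stored on the new record.
        self.transfer_hash = encode(prev.encode('utf-8'))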
def cpu_predict_file_content():
    def predict_text(model, input_str):
        if not input_str:
            input_str = input("Please enter text: ")
        _, char_ids, seg_ids, _ = prepare_dataset([input_str], char_to_id,
                                                  tag_to_id, test=True)[0]
        char_tensor = torch.LongTensor(char_ids).view(1, -1)
        seg_tensor = torch.LongTensor(seg_ids).view(1, -1)
        with torch.no_grad():
            # Viterbi-decode the best path and convert the ids to tags.
            paths = model(char_tensor, seg_tensor)
            tags = [id_to_tag[idx] for idx in paths[0]]
        return result_to_json(input_str, tags)

    with open(config.data_proc_file, "rb") as f:
        train_data, dev_data, test_data = pickle.load(f)
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        emb_matrix = pickle.load(f)
    with open(config.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    device = torch.device("cpu")
    model = NERLSTM_CRF(config, char_to_id, tag_to_id, emb_matrix, device)
    state_dict = torch.load(os.path.join(config.save_dir, "medical_ner.ckpt"),
                            map_location="cpu")
    model.load_state_dict(state_dict)
    # Predict on the CPU.
    model.eval()

    file_all = []
    c_root = "/home/demo1/guoxin/chinese/"
    for file in os.listdir(c_root):
        # if "txtoriginal.txt" in file:
        file_all.append(file)
    path = "/home/demo1/ner1.csv"
    i = 1
    word_set = set()
    for file in file_all:
        i += 1
        print(i)
        with open(c_root + file, 'r', encoding='utf8') as fp:
            for line in fp:
                for ele in line.split("。"):
                    if len(ele) < 3:
                        continue
                    results = predict_text(model, ele)
                    for result in results["entities"]:
                        word_set.add(result["word"])
        if i > 10000:
            break
        logger.info("i-{0}".format(i))

    import pandas as pd
    df = pd.DataFrame({"key": list(word_set)})
    df.to_csv(path, header=False, encoding='utf_8_sig', index=False)
def train():
    # Load the dataset.
    (train_data, dev_data, test_data, char_to_id, id_to_char,
     tag_to_id, id_to_tag, emb_matrix) = build_dataset()
    logger.info("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))

    # Batch the data.
    train_manager = BatchManager(train_data, config.batch_size)
    dev_manager = BatchManager(dev_data, config.batch_size)
    test_manager = BatchManager(test_data, config.batch_size)

    model = NERLSTM_CRF(config, char_to_id, tag_to_id, emb_matrix)
    model.train()
    model.to(device)
    optimizer = optim.Adam(model.parameters(),
                           lr=config.lr,  # lr: learning rate
                           weight_decay=config.weight_decay)  # L2 penalty

    # Early stopping.
    total_batch = 0
    dev_best_f1 = float('-inf')
    last_improve = 0
    flag = False
    start_time = time.time()
    logger.info(" Start Training ...... ")
    for epoch in range(config.max_epoch):
        logger.info('Epoch [{}/{}]'.format(epoch + 1, config.max_epoch))
        for index, batch in enumerate(train_manager.iter_batch(shuffle=True)):
            optimizer.zero_grad()  # reset the gradients
            # Loss and backward propagation.
            _, char_ids, len_ids, tag_ids, mask = batch
            loss = model.log_likelihood(char_ids, len_ids, tag_ids, mask)
            loss.backward()
            # Gradient clipping, maximum norm 5.
            nn.utils.clip_grad_norm_(parameters=model.parameters(),
                                     max_norm=config.clip)
            optimizer.step()

            # Check the model after a certain number of batches.
            if total_batch % config.steps_check == 0:
                model.eval()
                dev_f1, dev_loss = evaluate(model, dev_manager, id_to_tag)
                # Check the F1 value for early stopping.
                if dev_f1 > dev_best_f1:
                    evaluate(model, test_manager, id_to_tag, test=True)
                    dev_best_f1 = dev_f1
                    torch.save(model.state_dict(),
                               os.path.join(config.save_dir, "medical_ner.ckpt"))
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_used = training_timer(start_time)
                msg = 'Iter: {} | Dev Loss: {:.4f} | Dev F1-macro: {:.4f} | Time: {} | {}'
                logger.info(msg.format(total_batch, dev_loss, dev_f1,
                                       time_used, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improve:
                # If F1 on the dev dataset does not improve for more than
                # 5000 batches, stop the training.
                logger.info("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
def build_dataset():
    train_sentences = load_sentences(config.train_file, config.lower,
                                     config.zero)
    dev_sentences = load_sentences(config.dev_file, config.lower, config.zero)
    test_sentences = load_sentences(config.test_file, config.lower,
                                    config.zero)
    logger.info("Successfully read the annotated data")

    update_tag_scheme(train_sentences, config.tag_schema)
    update_tag_scheme(test_sentences, config.tag_schema)
    update_tag_scheme(dev_sentences, config.tag_schema)
    logger.info("Successfully converted IOB format to IOBES format")

    if not os.path.isfile(config.map_file):
        char_to_id, id_to_char, tag_to_id, id_to_tag = create_maps(
            train_sentences)
        logger.info("Built the dictionaries from the training set")
    else:
        with open(config.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        logger.info("Dictionary file already exists, loaded")

    emb_matrix = load_emb_matrix(char_to_id)
    logger.info("Loaded the pretrained character embeddings")

    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 config.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               config.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                config.lower)
    logger.info("Converted samples and labels to ids")
    logger.info("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))

    with open(config.data_proc_file, "wb") as f:
        pickle.dump([train_data, dev_data, test_data], f)
        pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
        pickle.dump(emb_matrix, f)

    return (train_data, dev_data, test_data, char_to_id, id_to_char,
            tag_to_id, id_to_tag, emb_matrix)
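update_tag_scheme converts IOB tags to IOBES: a single-token entity becomes S- and the last token of a multi-token entity becomes E-. A small illustration, assuming standard IOBES semantics (the entity types here are only examples):

# IOB  :  B-DIS  I-DIS  I-DIS  B-SYM  O
# IOBES:  B-DIS  I-DIS  E-DIS  S-SYM  O
iob =   ["B-DIS", "I-DIS", "I-DIS", "B-SYM", "O"]
iobes = ["B-DIS", "I-DIS", "E-DIS", "S-SYM", "O"]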
async def on_ready(self):
    """Log information about the bot launching."""
    logger.info("Bot %s connected on %s servers", self.user.name,
                len(self.guilds))
import asyncio

from aiohttp import web

from app import init_app
from logs.logger import logger, log_format

if __name__ == '__main__':
    try:
        loop = asyncio.get_event_loop()
        app = loop.run_until_complete(init_app())
        logger.info('Server started')
        web.run_app(app, access_log=logger, access_log_format=log_format)
        logger.info('Server was stopped')
    except Exception as e:
        logger.warning(e)
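init_app is imported from app and not shown in this section. A minimal sketch of what it might look like, assuming aiohttp routing and an asyncpg pool stored under the 'pool' key, as the handlers above expect (the DSN and route paths are assumptions):

import asyncpg
from aiohttp import web

async def init_app() -> web.Application:
    app = web.Application()
    # The handlers above read the connection pool from app['pool'].
    app['pool'] = await asyncpg.create_pool(dsn='postgresql://localhost/bank')
    app.add_routes([
        web.post('/login', login),
        web.post('/register', register),
        web.post('/transfer', new_transfer),
        web.get('/check', check),
        web.delete('/user', delete_user),
    ])
    return app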
async def on_ready():
    logger.info('Bot logged in as {0.user}'.format(client))
    channel = client.get_channel(700066521317113886)
    logger.info("Connected to %s", channel)
    while True:
        await check_csgo_rss(channel)