def preprocess_raw_data(raw_data, tokenized_data):
    if not os.path.exists(raw_data):
        print("数据集不存在,请添加数据集")
        exit(0)
    pairs = []
    count = 0
    config = get_config()
    # Read the raw corpus: blank lines separate dialogue pairs; each pair is
    # wrapped with the configured cls/sep tokens.
    with open(raw_data, encoding="utf-8") as file:
        pair = ""
        for line in file:
            line = line.strip("\n").replace('/', '')
            if line == "":
                pairs.append(pair)
                count += 1
                if count % 10000 == 0:
                    print('已读取:', count, '轮问答对')
                pair = ""
                continue
            elif len(pair) == 0:
                pair = config["cls_token"] + line + config["sep_token"]
            else:
                pair = pair + line + config["sep_token"]
    print("数据读取完毕,正在处理中...")
    # Segment each pair with jieba and write the space-separated tokens out.
    with open(tokenized_data, 'w', encoding="utf-8") as file:
        for i in range(len(pairs)):
            file.write(" ".join(jieba.cut(pairs[i])) + "\n")
            if i % 10000 == 0:
                print(len(pairs), '处理进度:', i)
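# Hypothetical usage sketch, not taken from the original project: the raw
# corpus path below is a placeholder, and "tokenized_corpus" is the config key
# that the other snippets read the tokenized output from.
# preprocess_raw_data("data/raw_corpus.txt", get_config()["tokenized_corpus"])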
def say(respuesta):
    print(respuesta)
    # Synthesize the reply with pico2wave (Spanish voice) and play it with aplay.
    command = 'sudo pico2wave -w temp/out.wav -l es-ES "' + respuesta + '"'
    os.system(command)
    os.system("aplay temp/out.wav")
    # In text mode, also send the reply through the Telegram bot.
    if get_config.get_config()["modo"] == "texto":
        bot.sendMessage("351857770", respuesta)
def main(_):
    this_dir = os.path.dirname(os.path.realpath(__file__))
    if FLAGS.seq_id == 'None':
        seq = [[FLAGS.config_id, 1]]
    else:
        seqs = json.load(open(os.path.join(this_dir, "seqs.json"), 'r'))
        seq = seqs[FLAGS.seq_id]
    print(seq)
    summaries = []
    for config_id, num_trials in seq:
        if config_id == "None":
            config = get_config(FLAGS.__flags, {})
        else:
            # TODO : create config file (.json)
            configs_path = os.path.join(this_dir, "configs%s" % FLAGS.config_ext)
            config = get_config_from_file(FLAGS.__flags, configs_path, config_id)
        print("=" * 80)
        print("Config ID {}, task {}, {} trials".format(
            config.config_id, config.task, num_trials))
        summary = _main(config, num_trials)
        summaries.append(summary)
    print("=" * 80)
    print("SUMMARY")
    for summary in summaries:
        print(summary)
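# A hypothetical seqs.json layout consistent with the loop above (an assumption,
# not taken from the original repository): each seq_id maps to a list of
# [config_id, num_trials] pairs.
#
# {
#     "seq0": [["01", 3], ["02", 5]],
#     "seq1": [["None", 1]]
# }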
def load_data(tokenized_data, dict_fn, num_sample=0):
    if not os.path.exists(tokenized_data):
        print("没有检测到分词数据集,请先执行pre_treat模式")
        exit(0)
    with open(tokenized_data, 'r', encoding="utf-8") as file:
        lines = file.read().strip().split("\n")
    # num_sample == 0 means "use the whole corpus"; otherwise take a prefix.
    if num_sample == 0:
        sentences = [line for line in lines]
    else:
        sentences = [line for line in lines[:num_sample]]
    config = get_config()
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        filters='', oov_token=config["unk_token"])
    tokenizer.fit_on_texts(sentences)
    input_tensor = tokenizer.texts_to_sequences(sentences)
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(
        input_tensor, maxlen=config["max_length"], padding="post")
    # Persist the word index so that inference code can rebuild the same mapping.
    with open(dict_fn, "w", encoding="utf-8") as file:
        file.write(
            json.dumps(tokenizer.word_index, indent=4, ensure_ascii=False))
    dataset = tf.data.Dataset.from_tensor_slices(input_tensor).cache().shuffle(
        config["buffer_size"]).prefetch(tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(config["batch_size"], drop_remainder=True)
    steps_per_epoch = len(input_tensor) // config["batch_size"]
    return dataset, tokenizer, steps_per_epoch
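# Usage sketch mirroring how train() below calls this helper; the config keys
# come from the other snippets, and num_sample=0 is assumed to mean "load the
# full tokenized corpus".
# config = get_config()
# dataset, tokenizer, steps_per_epoch = load_data(
#     tokenized_data=config["tokenized_corpus"],
#     dict_fn=config["gpt2_dict"],
#     num_sample=0)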
def main(_):
    this_dir = os.path.dirname(os.path.realpath(__file__))
    if FLAGS.seq_id == 'None':
        seq = [[FLAGS.config_id, FLAGS.num_trials]]
    else:
        seqs = json.load(open(os.path.join(this_dir, "seqs.json"), 'r'))
        seq = seqs[FLAGS.seq_id]
    summaries = []
    for config_id, num_trials in seq:
        if config_id == "None":
            config = get_config(FLAGS.__flags, {})
        else:
            # TODO : create config file (.json)
            configs_path = os.path.join(this_dir, "configs_new%s" % FLAGS.config_ext)
            config = get_config_from_file(FLAGS.__flags, configs_path, config_id)
        if config.task == "all":
            tasks = list(map(str, range(1, 6)))
        else:
            tasks = [config.task]
        for task in tasks:
            # FIXME : this is a bad way of setting the task each time
            config.task = task
            print("=" * 80)
            print("Config ID {}, task {}, {} trials".format(
                config.config_id, config.task, num_trials))
            summary = _main(config, num_trials)
            summaries.append(summary)
    print("=" * 80)
    print("SUMMARY")
    for summary in summaries:
        print(summary)
def get_noticias():
    periodico = get_config.get_config()['periodico']
    api_key = " ## KEY DE LA API ## "
    if periodico == "el pais":
        link = "https://api.rss2json.com/v1/api.json?rss_url=http%3A%2F%2Fep00.epimg.net%2Frss%2Felpais%2Fportada.xml&api_key=" + api_key
    elif periodico == "el mundo":
        link = "https://api.rss2json.com/v1/api.json?rss_url=http%3A%2F%2Festaticos.elmundo.es%2Felmundo%2Frss%2Fportada.xml&api_key=" + api_key
def ask():
    modo = get_config.get_config()["modo"]
    if modo == "texto":
        bot.sendMessage("## ID DE LA CONVERSACION ##", "Di algo!")
        response = bot.getUpdates()
        num = len(response)
        print(response)
        # Poll until a new Telegram update arrives.
        while len(bot.getUpdates()) == num:
            time.sleep(3)
        print("Nuevo mensaje")
        response = bot.getUpdates()
        return response[-1]["message"]["text"]
    elif modo == "audio":
        r = sr.Recognizer()
        with sr.Microphone() as source:
            print("Di algo!")
            audio = r.listen(source)
        try:
            rec = r.recognize_google(audio, language="es-ES")
            print("Has dicho " + rec)
            return rec.lower()
        except sr.UnknownValueError:
            print("No se ha entendido el audio")
        except sr.RequestError as e:
            print("Ha habido un error con el reconocimiento de voz {0}".format(e))
    else:
        print("Hay un error con la configuración")
def main(): config = get_config() args = train_args.setup_train_args() if args.seed: train_args.set_random_seed(args) # 初始化tokenizer tokenizer = BertTokenizer(vocab_file=args.vocab_path) # tokenizer的字典大小 global pad_id # pad_id = tokenizer.convert_tokens_to_ids(PAD) # 创建对话模型的输出目录 if not os.path.exists(args.dialogue_model_output_path): os.mkdir(args.dialogue_model_output_path) # 加载GPT2模型 model, n_ctx, optimizer = create_model(args, config) # 对原始数据进行预处理,将原始语料转换成对应的token_id # 如果当前是要训练对话生成模型 print('开始产生token') # 不修改数据集的情况下,没必要每次训练都运行preprocess_raw_data 因为 生成的data是一样的 if not os.path.exists(args.train_tokenized_path): file = open(args.train_tokenized_path, 'w') preprocess_data.preprocess_raw_data(args, tokenizer, n_ctx) train_loss = tf.keras.metrics.Mean(name='train_loss') train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy( name='train_accuracy') print('开始训练') train(model, args, tokenizer, optimizer, train_loss, train_accuracy) print('训练结束')
def loss_function(real, pred):
    config = get_config()
    real = tf.reshape(real, shape=(-1, config["max_length"] - 1))
    loss = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction="none")(real, pred)
    # Mask out positions whose target id is 0 (padding) so they add no loss.
    mask = tf.cast(tf.not_equal(real, 0), tf.float32)
    loss = tf.multiply(loss, mask)
    return tf.reduce_mean(loss)
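# Minimal sketch of the masking behaviour above, using made-up tensors (shapes
# and values are assumptions, not taken from the project). Positions where the
# target id is 0 contribute zero loss; note that reduce_mean in loss_function
# still averages over those masked positions.
import tensorflow as tf

real = tf.constant([[5, 7, 0]])                       # last position is padding
pred = tf.random.uniform((1, 3, 10))                  # (batch, time, vocab) logits
per_token = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction="none")(real, pred)   # shape (1, 3)
mask = tf.cast(tf.not_equal(real, 0), tf.float32)     # [[1., 1., 0.]]
print(tf.multiply(per_token, mask))                   # loss at the pad slot is 0.0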
def response(sentence):
    inputs = " ".join(jieba.cut("cls" + sentence + "sep"))
    config = get_config()
    tokenizer = data_utils.load_dict(config["gpt2_dict"])
    # Map each token to its id, falling back to 1 (the OOV id) for unknown words.
    inputs = [tokenizer.get(i, 1) for i in inputs.split(' ')]
    # inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=config["max_length"], padding="post")
    inputs = tf.convert_to_tensor(inputs)
    inputs = tf.cast(tf.expand_dims(inputs, axis=0), dtype=tf.int64)
    checkpoint_dir = config["checkpoint_dir"]
    model = gpt2.gpt2(vocab_size=config["vocab_size"],
                      num_layers=config["num_layers"],
                      units=config["units"],
                      deep=config["deep"],
                      num_heads=config["num_heads"],
                      dropout=config["dropout"])
    learning_rate = gpt2.CustomSchedule(config["deep"])
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                         beta_1=0.9, beta_2=0.98, epsilon=1e-9)
    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir, exist_ok=True)
    if os.listdir(checkpoint_dir):
        checkpoint.restore(
            tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()
    result = []
    # Greedy autoregressive decoding, one token at a time.
    for _ in range(config["max_length"]):
        predictions = model(inputs=inputs, training=False)
        predictions = tf.nn.softmax(predictions, axis=-1)
        predictions = predictions[:, -1:, :]
        predictions = tf.squeeze(predictions, axis=1)
        pred = tf.argmax(input=predictions, axis=-1)
        # Token id 2 is treated as the end-of-sequence marker.
        if pred.numpy()[0] == 2:
            break
        result.append(pred.numpy()[0])
        inputs = tf.concat([inputs, tf.expand_dims(pred, axis=0)], axis=-1)
    return data_utils.sequences_to_texts(result, tokenizer)
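# Hypothetical usage sketch (the input sentence is just a placeholder):
# print(response("你好"))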
def __init__(self):
    self._config = get_config()['urls']
    self.TREND = self._config['TREND']
    self.CUSTOM = self._config['CUSTOM']
def train():
    print('训练开始,正在准备数据中...')
    config = get_config()
    dataset, tokenizer, steps_per_epoch = data_utils.load_data(
        tokenized_data=config["tokenized_corpus"],
        dict_fn=config["gpt2_dict"],
        num_sample=35)
    train_loss = tf.keras.metrics.Mean(name="train_loss")
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name="train_accuracy")
    learning_rate = gpt2.CustomSchedule(config["deep"])
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                         beta_1=0.9, beta_2=0.98, epsilon=1e-9)
    model = gpt2.gpt2(vocab_size=config["vocab_size"],
                      num_layers=config["num_layers"],
                      units=config["units"],
                      deep=config["deep"],
                      num_heads=config["num_heads"],
                      dropout=config["dropout"])
    checkpoint_dir = config["checkpoint_dir"]
    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir, exist_ok=True)
    if os.listdir(checkpoint_dir):
        checkpoint.restore(
            tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()
    for epoch in range(config["epochs"]):
        print('Epoch {}/{}'.format(epoch + 1, config["epochs"]))
        start_time = time.time()
        train_loss.reset_states()
        train_accuracy.reset_states()
        step_loss = 0
        batch_sum = 0
        sample_sum = 0
        for batch, inputs in enumerate(dataset.take(steps_per_epoch)):
            # Shift the sequence by one token: the model predicts position t+1
            # from positions [0, t].
            inputs_real = inputs[:, 1:]
            inputs = inputs[:, :-1]
            with tf.GradientTape() as tape:
                predictions = model(inputs)
                loss = loss_function(inputs_real, predictions)
            gradient = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradient, model.trainable_variables))
            train_loss(loss)
            train_accuracy(inputs_real, predictions)
            step_loss = train_loss.result()
            batch_sum = batch_sum + len(inputs)
            sample_sum = steps_per_epoch * len(inputs)
            print('\r', '{}/{} [==================================]'.format(
                batch_sum, sample_sum), end='', flush=True)
        step_time = (time.time() - start_time)
        sys.stdout.write(' - {:.4f}s/step - loss: {:.4f}\n'.format(
            step_time, step_loss))
        sys.stdout.flush()
        checkpoint.save(file_prefix=os.path.join(checkpoint_dir, "ckpt"))
    print("训练结束")
# -*- coding: utf-8 -*-
# Module that uses the OpenWeatherMap API to return weather information by city

import json
import urllib

import mtime
from config import get_config

city = get_config.get_config()["ciudad"]


def get_ciudad_from_frase(frase):
    # Return the word that follows "en" or "de" in the sentence (the city name).
    i = 0
    frase = frase.split(" ")
    for palabra in frase:
        if palabra == "en" or palabra == "de":
            return frase[i + 1]
        i += 1


def traducir_weather(frase):
    # Translate OpenWeatherMap condition strings into Spanish.
    return {"mist": "nublado",
            "scattered clouds": "nubes dispersas",
            "broken clouds": "nubes rotas",
            "clear sky": "cielo claro",
            "few clouds": "pocas nubes",
            "shower rain": "aguaceros",
            "rain": "lluvia",
            "thunderstorm": "tormenta eléctrica",
            "drizzle": "llovizna",
            "sleet": "aguanieve",
            "clouds": "nublado"}[frase]


def get_tiempo_from_frase(frase):
    frase = frase.split(" ")
    dia_actual = mtime.current_day()
    hora_actual = mtime.current_hour()


def get_clima_actual(frase):
    ciudad = get_ciudad_from_frase(frase)
    if ciudad is None:
        ciudad = city
        return mcalculator.raiz(q)
    else:
        return "No he entendido la operación, ¿puedes repetirla, por favor?"
    if "que es" in q or "quien es" in q or "quien fue" in q:
        return mwiki.buscar(q)
    if "noticias" in q:
        return mnoticias.get_noticias()
    if "cambiar configuracion" in q:
        return change_config.change_config()
    if "cambiar" in q and "modo" in q:
        return change_config.change_mode()
    else:
        return "Perdona, no te he entendido"


while 1:
    if get_config.get_config()["modo"] == "audio":
        hey = input.ask()
        # Wait until the wake phrase "hola <asistente>" is heard.
        while "hola %s" % (get_config.get_config()["asistente"]) not in hey:
            hey = telegram_input.ask()
            # print("beeeep")
    question = input.ask().lower()
    output.say(search(question))
# -*- coding: utf-8 -*-
# Module that says goodbye, picking a random reply from a set of options that
# changes with the time of day

import datetime
from config import get_config
from random import randint

nombre = get_config.get_config()["nombre"]

farewells = [
    "Adios!",
    "Hasta otra!",
    "Que vaya bien, %s ." % (nombre),
    "Vale, %s , hablamos luego" % (nombre)
]

hora = datetime.datetime.now().hour
if hora >= 6 and hora < 12:
    farewells.extend(["Que tengas un buen dia!"])
elif hora >= 12 and hora < 20:
    farewells.extend(
        ["Adios, %s. que acabes de pasar bien la tarde!" % (nombre)])
else:
    farewells.extend(["Buenas noches, " + nombre])


def farewell():
    return farewells[randint(0, len(farewells) - 1)]
import pandas as pd

from cleaning_data.cleaning_data import CleaningData
from data_to_excel.data_to_excel import DataToExcel
from config.get_config import get_config
from parse_pdf.parse_pdf import ParsePdf

config = get_config()

contents = ParsePdf(
    config['to_delete']
).get_pdf_data(
    config['pdf_filename'],
    config['page']
)
new_content = CleaningData().clean(contents)

data_to_excel = DataToExcel(new_content)
data_to_excel.to_excel()
final_result = data_to_excel.final_data

df = pd.DataFrame(final_result)
df.to_csv(
    config['output_filename'],
    index=False,
    columns=[
        'Session Title',
        'Title',
        'Position',
        'First Name',
        'Middle Name',
        'Last Name',
        'Workplace'
from config.get_config import get_config
from logger.logging import Loader

config = get_config(Loader)
def __init__(self):
    self.driver = webdriver.Chrome()
    self.config = get_config()
    self.scrapy_config = self.config['scrapy']
import comet_ml, os, sys, torch

from data import load_data
from lib.model import Model
from util import Logger, train, validation, AdamOptimizer

# load the configuration files
config_name = None
from config.get_config import get_config
model_config, data_config, exp_config = get_config(config_name)

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = str(exp_config['device'])
torch.cuda.set_device(0)

# data
train_data, val_data = load_data(data_config, exp_config['batch_size'])
eval_length = data_config['eval_length']
train_epoch_size = data_config['train_epoch_size']
val_epoch_size = data_config['val_epoch_size']

# model
model = Model(**model_config).to(0)

# optimizer
optimizer = AdamOptimizer(params=model.parameters(),
                          lr=exp_config['lr'],
                          grad_clip_value=exp_config['grad_clip_value'],
                          grad_clip_norm=exp_config['grad_clip_norm'])

logger_on = True