Example 1
def preprocess_raw_data(raw_data, tokenized_data):
    if not os.path.exists(raw_data):
        print("数据集不存在,请添加数据集")
        exit(0)

    pairs = []
    count = 0
    config = get_config()

    with open(raw_data, encoding="utf-8") as file:
        pair = ""
        for line in file:
            line = line.strip("\n").replace('/', '')

            if line == "":
                pairs.append(pair)
                count += 1
                if count % 10000 == 0:
                    print('已读取:', count, '轮问答对')
                pair = ""
                continue
            elif len(pair) == 0:
                pair = config["cls_token"] + line + config["sep_token"]
            else:
                pair = pair + line + config["sep_token"]

    print("数据读取完毕,正在处理中...")

    with open(tokenized_data, 'w', encoding="utf-8") as file:
        for i in range(len(pairs)):
            file.write(" ".join(jieba.cut(pairs[i])) + "\n")
            if i % 10000 == 0:
                print(len(pairs), '处理进度:', i)
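Example 1 above assumes a raw corpus with one utterance per line and a blank line between conversations, each conversation being folded into a single "cls ... sep ... sep" string. A minimal sketch of that assumed layout (the file path and the sample utterances are hypothetical):

# Hypothetical sketch of the input layout preprocess_raw_data() appears to expect:
# one utterance per line, with a blank line closing each conversation.
sample_corpus = (
    "你好\n"
    "你好，有什么可以帮你？\n"
    "\n"
    "今天天气怎么样\n"
    "挺好的，是个晴天\n"
    "\n"
)
with open("data/raw_corpus.txt", "w", encoding="utf-8") as f:  # hypothetical path
    f.write(sample_corpus)
# preprocess_raw_data("data/raw_corpus.txt", "data/tokenized_corpus.txt")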
Example 2
def say(respuesta):
    print(respuesta)
    command = 'sudo  pico2wave -w temp/out.wav -l es-ES "' + respuesta + '"'
    os.system(command)
    os.system("aplay temp/out.wav")
    if get_config.get_config()["modo"] == "texto":
        bot.sendMessage("351857770", respuesta)
Example 3
def main(_):
    this_dir = os.path.dirname(os.path.realpath(__file__))
    if FLAGS.seq_id == 'None':
        seq = [[FLAGS.config_id, 1]]
    else:
        seqs = json.load(open(os.path.join(this_dir, "seqs.json"), 'r'))
        seq = seqs[FLAGS.seq_id]
    print(seq)
    summaries = []
    for config_id, num_trials in seq:
        if config_id == "None":
            config = get_config(FLAGS.__flags, {})
        else:
            # TODO : create config file (.json)
            configs_path = os.path.join(this_dir,
                                        "configs%s" % FLAGS.config_ext)
            config = get_config_from_file(FLAGS.__flags, configs_path,
                                          config_id)
        print("=" * 80)
        print("Config ID {}, task {}, {} trials".format(
            config.config_id, config.task, num_trials))
        summary = _main(config, num_trials)
        summaries.append(summary)

    print("=" * 80)
    print("SUMMARY")
    for summary in summaries:
        print(summary)
Example 4
def load_data(tokenized_data, dict_fn, num_sample=0):
    if not os.path.exists(tokenized_data):
        print("没有检测到分词数据集,请先执行pre_treat模式")
        exit(0)

    with open(tokenized_data, 'r', encoding="utf-8") as file:
        lines = file.read().strip().split("\n")

        if num_sample == 0:
            sentences = lines
        else:
            sentences = lines[:num_sample]

    config = get_config()
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        filters='', oov_token=config["unk_token"])
    tokenizer.fit_on_texts(sentences)
    input_tensor = tokenizer.texts_to_sequences(sentences)
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(
        input_tensor, maxlen=config["max_length"], padding="post")

    with open(dict_fn, "w", encoding="utf-8") as file:
        file.write(
            json.dumps(tokenizer.word_index, indent=4, ensure_ascii=False))

    dataset = tf.data.Dataset.from_tensor_slices(input_tensor).cache().shuffle(
        config["buffer_size"]).prefetch(tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(config["batch_size"], drop_remainder=True)

    steps_per_epoch = len(input_tensor) // config["batch_size"]

    return dataset, tokenizer, steps_per_epoch
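Several of the GPT-2 chatbot examples here (1, 4, 11, 12 and 14) rely on a project-level get_config() that returns a plain dict. Its real contents are not shown in these snippets; a minimal stand-in with the keys those examples read, and purely illustrative placeholder values, might look like this:

# Hypothetical stand-in for the project's get_config(); keys inferred from the
# surrounding examples, values are placeholder assumptions only.
def get_config():
    return {
        "cls_token": "cls", "sep_token": "sep", "unk_token": "unk",
        "max_length": 128, "buffer_size": 10000, "batch_size": 32,
        "vocab_size": 20000, "num_layers": 4, "units": 512, "deep": 256,
        "num_heads": 8, "dropout": 0.1, "epochs": 10,
        "tokenized_corpus": "data/tokenized_corpus.txt",
        "gpt2_dict": "data/gpt2_dict.json",
        "checkpoint_dir": "checkpoints/",
    }

# With such a config in place, Example 4 could be exercised as:
# dataset, tokenizer, steps_per_epoch = load_data(
#     tokenized_data=get_config()["tokenized_corpus"],
#     dict_fn=get_config()["gpt2_dict"], num_sample=100)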
Example 5
File: main.py Project: eunchung/qrn
def main(_):
    this_dir = os.path.dirname(os.path.realpath(__file__))
    if FLAGS.seq_id == 'None':
        seq = [[FLAGS.config_id, 1]]
    else:
        seqs = json.load(open(os.path.join(this_dir, "seqs.json"), 'r'))
        seq = seqs[FLAGS.seq_id]
    print(seq)
    summaries = []
    for config_id, num_trials in seq:
        if config_id == "None":
            config = get_config(FLAGS.__flags, {})
        else:
            # TODO : create config file (.json)
            configs_path = os.path.join(this_dir, "configs%s" % FLAGS.config_ext)
            config = get_config_from_file(FLAGS.__flags, configs_path, config_id)
        print("=" * 80)
        print("Config ID {}, task {}, {} trials".format(config.config_id, config.task, num_trials))
        summary = _main(config, num_trials)
        summaries.append(summary)

    print("=" * 80)
    print("SUMMARY")
    for summary in summaries:
        print(summary)
Example 6
File: main.py Project: yucoian/qrn
def main(_):
    this_dir = os.path.dirname(os.path.realpath(__file__))
    if FLAGS.seq_id == 'None':
        seq = [[FLAGS.config_id, FLAGS.num_trials]]
    else:
        seqs = json.load(open(os.path.join(this_dir, "seqs.json"), 'r'))
        seq = seqs[FLAGS.seq_id]
    summaries = []
    for config_id, num_trials in seq:
        if config_id == "None":
            config = get_config(FLAGS.__flags, {})
        else:
            # TODO : create config file (.json)
            configs_path = os.path.join(this_dir,
                                        "configs_new%s" % FLAGS.config_ext)
            config = get_config_from_file(FLAGS.__flags, configs_path,
                                          config_id)

        if config.task == "all":
            tasks = list(map(str, range(1, 6)))
        else:
            tasks = [config.task]
        for task in tasks:
            # FIXME : this is a bad way of setting the task each time
            config.task = task
            print("=" * 80)
            print("Config ID {}, task {}, {} trials".format(
                config.config_id, config.task, num_trials))
            summary = _main(config, num_trials)
            summaries.append(summary)

    print("=" * 80)
    print("SUMMARY")
    for summary in summaries:
        print(summary)
Example 7
def main(_):
    this_dir = os.path.dirname(os.path.realpath(__file__))
    if FLAGS.seq_id == 'None':
        seq = [[FLAGS.config_id, FLAGS.num_trials]]
    else:
        seqs = json.load(open(os.path.join(this_dir, "seqs.json"), 'r'))
        seq = seqs[FLAGS.seq_id]
    summaries = []
    for config_id, num_trials in seq:
        if config_id == "None":
            config = get_config(FLAGS.__flags, {})
        else:
            # TODO : create config file (.json)
            configs_path = os.path.join(this_dir, "configs_new%s" % FLAGS.config_ext)
            config = get_config_from_file(FLAGS.__flags, configs_path, config_id)
        
        if config.task == "all":
            tasks = list(map(str, range(1, 6)))
        else:
            tasks = [config.task]
        for task in tasks:
            # FIXME : this is a bad way of setting the task each time
            config.task = task
            print("=" * 80)
            print("Config ID {}, task {}, {} trials".format(config.config_id, config.task, num_trials))
            summary = _main(config, num_trials)
            summaries.append(summary)
	
    print("=" * 80)
    print("SUMMARY")
    for summary in summaries:
        print(summary)
Example 8
def get_noticias():
  periodico = get_config.get_config()['periodico']
  api_key = " ## KEY DE LA API ## "
  if periodico == "el pais":
    link = "https://api.rss2json.com/v1/api.json?rss_url=http%3A%2F%2Fep00.epimg.net%2Frss%2Felpais%2Fportada.xml&api_key=" + api_key
  elif periodico == "el mundo":
    link = "https://api.rss2json.com/v1/api.json?rss_url=http%3A%2F%2Festaticos.elmundo.es%2Felmundo%2Frss%2Fportada.xml&api_key=" + api_key
Example 9
def ask():
    modo = get_config.get_config()["modo"]
    if modo == "texto":
        bot.sendMessage("## ID DE LA CONVERSACION ##", "Di algo!")
        response = bot.getUpdates()
        num = len(response)
        print(response)
        while len(bot.getUpdates()) == num:
            time.sleep(3)
        print "Nuevo mensaje"
        response = bot.getUpdates()
        return response[-1]["message"]["text"]
    elif modo == "audio":
        r = sr.Recognizer()
        with sr.Microphone() as source:
            print("Di algo!")
            audio = r.listen(source)

        try:
            rec = r.recognize_google(audio, language="es-ES")
            print("Has dicho " + rec)
            return rec.lower()
        except sr.UnknownValueError:
            print("No se ha entendido el audio")
        except sr.RequestError as e:
            print("Ha habido un error con el reconocimiento de voz {0}".format(
                e))
    else:
        print "Hay un error con la configuración"
Example 10
def main():
    config = get_config()
    args = train_args.setup_train_args()
    if args.seed:
        train_args.set_random_seed(args)
    # Initialize the tokenizer
    tokenizer = BertTokenizer(vocab_file=args.vocab_path)
    # Size of the tokenizer's vocabulary
    global pad_id
    # pad_id = tokenizer.convert_tokens_to_ids(PAD)

    # Create the output directory for the dialogue model
    if not os.path.exists(args.dialogue_model_output_path):
        os.mkdir(args.dialogue_model_output_path)

    # Load the GPT-2 model
    model, n_ctx, optimizer = create_model(args, config)

    # Preprocess the raw data: convert the raw corpus into the corresponding token ids
    # (this step is needed when training the dialogue generation model)
    print('开始产生token')
    # If the dataset has not changed, there is no need to rerun preprocess_raw_data on every training run, since the generated data would be identical
    if not os.path.exists(args.train_tokenized_path):
        open(args.train_tokenized_path, 'w').close()

    preprocess_data.preprocess_raw_data(args, tokenizer, n_ctx)
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='train_accuracy')
    print('开始训练')
    train(model, args, tokenizer, optimizer, train_loss, train_accuracy)
    print('训练结束')
Example 11
def loss_function(real, pred):
    config = get_config()
    real = tf.reshape(real, shape=(-1, config["max_length"] - 1))
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                         reduction="none")(
                                                             real, pred)
    mask = tf.cast(tf.not_equal(real, 0), tf.float32)
    loss = tf.multiply(loss, mask)
    return tf.reduce_mean(loss)
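The mask in Example 11 zeroes the loss wherever the target id is 0 (the padding id). A small self-contained check, with an arbitrary sequence length of 4 and vocabulary size of 5, shows the effect; note that tf.reduce_mean still divides by all positions, padded ones included:

import tensorflow as tf

# Illustrative check of the masking used in loss_function (Example 11).
real = tf.constant([[3, 1, 2, 0]])                   # last position is padding (id 0)
pred = tf.random.uniform((1, 4, 5))                  # dummy logits
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction="none")(real, pred)  # per-position loss, shape (1, 4)
mask = tf.cast(tf.not_equal(real, 0), tf.float32)    # 1.0 for real tokens, 0.0 for padding
print((loss * mask).numpy())                         # the padded position contributes exactly 0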
Example 12
def response(sentence):
    inputs = " ".join(jieba.cut("cls" + sentence + "sep"))
    config = get_config()
    tokenizer = data_utils.load_dict(config["gpt2_dict"])
    inputs = [tokenizer.get(i, 1) for i in inputs.split(' ')]
    # inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=config["max_length"], padding="post")
    inputs = tf.convert_to_tensor(inputs)
    inputs = tf.cast(tf.expand_dims(inputs, axis=0), dtype=tf.int64)

    checkpoint_dir = config["checkpoint_dir"]
    model = gpt2.gpt2(vocab_size=config["vocab_size"],
                      num_layers=config["num_layers"],
                      units=config["units"],
                      deep=config["deep"],
                      num_heads=config["num_heads"],
                      dropout=config["dropout"])

    learning_rate = gpt2.CustomSchedule(config["deep"])
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                         beta_1=0.9,
                                         beta_2=0.98,
                                         epsilon=1e-9)
    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir, exist_ok=True)
    if os.listdir(checkpoint_dir):
        checkpoint.restore(
            tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()

    result = []
    for _ in range(config["max_length"]):
        # print(inputs)
        # exit(0)
        predictions = model(inputs=inputs, training=False)
        predictions = tf.nn.softmax(predictions, axis=-1)
        predictions = predictions[:, -1:, :]
        predictions = tf.squeeze(predictions, axis=1)
        # print(predictions)
        # exit(0)
        pred = tf.argmax(input=predictions, axis=-1)
        print(inputs)
        print(pred)
        # exit(0)
        if pred.numpy()[0] == 2:
            break
        result.append(pred.numpy()[0])

        inputs = tf.concat([inputs, tf.expand_dims(pred, axis=0)], axis=-1)
        print(inputs)
    print(result)
    return data_utils.sequences_to_texts(result, tokenizer)
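Assuming the dictionary file and checkpoints referenced by the config already exist, the response() function in Example 12 could be driven from a simple interactive loop like the sketch below (the prompt strings are arbitrary):

# Hypothetical interactive driver for response() from Example 12; exits on empty input.
if __name__ == "__main__":
    while True:
        sentence = input("user > ")
        if not sentence.strip():
            break
        print("bot  >", response(sentence))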
Example 13
def __init__(self):
    self._config = get_config()['urls']
    self.TREND = self._config['TREND']
    self.CUSTOM = self._config['CUSTOM']
Example 14
def train():
    print('训练开始,正在准备数据中...')

    config = get_config()
    dataset, tokenizer, steps_per_epoch = data_utils.load_data(
        tokenized_data=config["tokenized_corpus"],
        dict_fn=config["gpt2_dict"],
        num_sample=35)
    train_loss = tf.keras.metrics.Mean(name="train_loss")
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name="train_accuracy")
    learning_rate = gpt2.CustomSchedule(config["deep"])
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                         beta_1=0.9,
                                         beta_2=0.98,
                                         epsilon=1e-9)

    model = gpt2.gpt2(vocab_size=config["vocab_size"],
                      num_layers=config["num_layers"],
                      units=config["units"],
                      deep=config["deep"],
                      num_heads=config["num_heads"],
                      dropout=config["dropout"])

    checkpoint_dir = config["checkpoint_dir"]
    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir, exist_ok=True)
    if os.listdir(checkpoint_dir):
        checkpoint.restore(
            tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()

    for epoch in range(config["epochs"]):
        print('Epoch {}/{}'.format(epoch + 1, config["epochs"]))
        start_time = time.time()
        train_loss.reset_states()
        train_accuracy.reset_states()

        step_loss = 0
        batch_sum = 0
        sample_sum = 0

        for batch, inputs in enumerate(dataset.take(steps_per_epoch)):
            inputs_real = inputs[:, 1:]
            inputs = inputs[:, :-1]
            with tf.GradientTape() as tape:
                predictions = model(inputs)
                loss = loss_function(inputs_real, predictions)
            gradient = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradient, model.trainable_variables))

            train_loss(loss)
            train_accuracy(inputs_real, predictions)
            step_loss = train_loss.result()
            batch_sum = batch_sum + len(inputs)
            sample_sum = steps_per_epoch * len(inputs)
            print('\r',
                  '{}/{} [==================================]'.format(
                      batch_sum, sample_sum),
                  end='',
                  flush=True)
        step_time = (time.time() - start_time)
        sys.stdout.write(' - {:.4f}s/step - loss: {:.4f}\n'.format(
            step_time, step_loss))
        sys.stdout.flush()
        checkpoint.save(file_prefix=os.path.join(checkpoint_dir, "ckpt"))
    print("训练结束")
Example 15
# -*- coding: utf-8 -*-
# Module that uses the OpenWeatherMap API to return weather information by city

import json
import urllib
import mtime
from config import get_config

city = get_config.get_config()["ciudad"]

def get_ciudad_from_frase(frase):
  i = 0
  frase = frase.split(" ")
  for palabra in frase:
    if palabra == "en" or palabra == "de":
      return frase[i+1]
    i += 1
    
def traducir_weather(frase):
  return {"mist":"nublado" , "scattered clouds":"nubes dispersas", "broken clouds":"nubes rotas", "clear sky":"cielo claro", "few clouds":"pocas nubes", "shower rain":"aguaceros", "rain":"lluvia", "thunderstorm":"tormenta eléctrica", "drizzle":"llovizna", "sleet":"aguanieve", "clouds":"nublado" }[frase]
    
def get_tiempo_from_frase(frase):
  frase = frase.split(" ")
  dia_actual = mtime.current_day()
  hora_actual = mtime.current_hour()
  
  
def get_clima_actual(frase):
  ciudad = get_ciudad_from_frase(frase)
  if ciudad is None:
    ciudad = city
Example 16
File: main.py Project: aDummy/Lola
			return mcalculator.raiz(q)
		else:
			return "No he entendido la operación, ¿puedes repetirla, por favor?"
	
	if "que es" in q or "quien es" in q or "quien fue" in q:
		return mwiki.buscar(q)
	
	if "noticias" in q:
		return mnoticias.get_noticias()
	
	if "cambiar configuracion" in q:
		return change_config.change_config()

	if "cambiar" in q and "modo" in q:
		return change_config.change_mode()
	
	else:
		return "Perdona, no te he entendido"


while 1:

    if get_config.get_config()["modo"] == "audio":
        hey = input.ask()
        while "hola %s" % (get_config.get_config()["asistente"]) not in hey:
            hey = telegram_input.ask()

    #print "beeeep"
    question = input.ask().lower()
    output.say(search(question))
Example 17
# -*- coding: utf-8 -*-
# Module that handles farewells, choosing a random reply from options that change with the time of day

import datetime
from config import get_config
from random import randint

nombre = get_config.get_config()["nombre"]

farewells = [
    "Adios!", "Hasta otra!",
    "Que vaya bien, %s ." % (nombre),
    "Vale, %s , hablamos luego" % (nombre)
]

hora = datetime.datetime.now().hour

if hora >= 6 and hora < 12:
    farewells.extend(["Que tengas un buen dia!"])
elif hora >= 12 and hora < 20:
    farewells.extend(
        ["Adios, %s. que acabes de pasar bien la tarde!" % (nombre)])
else:
    farewells.extend(["Buenas noches, " + nombre])


def farewell():
    return farewells[randint(0, len(farewells) - 1)]
Example 18
import pandas as pd

from cleaning_data.cleaning_data import CleaningData
from data_to_excel.data_to_excel import DataToExcel
from config.get_config import get_config
from parse_pdf.parse_pdf import ParsePdf


config = get_config()

contents = ParsePdf(
    config['to_delete']
).get_pdf_data(
    config['pdf_filename'], config['page']
)

new_content = CleaningData().clean(contents)

data_to_excel = DataToExcel(new_content)
data_to_excel.to_excel()

final_result = data_to_excel.final_data

df = pd.DataFrame(final_result)
df.to_csv(
    config['output_filename'],
    index=False,
    columns=
    [
        'Session Title', 'Title', 'Position', 'First Name',
        'Middle Name', 'Last Name', 'Workplace'
Example 19
from config.get_config import get_config
from logger.logging import Loader

config = get_config(Loader)
Example 20
def __init__(self):
    self.driver = webdriver.Chrome()
    self.config = get_config()
    self.scrapy_config = self.config['scrapy']
Example 21
import comet_ml, os, sys, torch
from data import load_data
from lib.model import Model
from util import Logger, train, validation, AdamOptimizer

# load the configuration files
config_name = None

from config.get_config import get_config
model_config, data_config, exp_config = get_config(config_name)

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]=str(exp_config['device'])
torch.cuda.set_device(0)

# data
train_data, val_data = load_data(data_config, exp_config['batch_size'])
eval_length = data_config['eval_length']
train_epoch_size = data_config['train_epoch_size']
val_epoch_size = data_config['val_epoch_size']

# model
model = Model(**model_config).to(0)

# optimizer
optimizer = AdamOptimizer(params=model.parameters(), lr=exp_config['lr'],
                          grad_clip_value=exp_config['grad_clip_value'],
                          grad_clip_norm=exp_config['grad_clip_norm'])

logger_on = True