def get_leaner(use_torch_model=None):
    if use_torch_model == 'inceptionresnetv2':
        sz = 256  # image size
        bs = 16   # batch size
        arch = inceptionresnet_2  # specify target architecture
        md = data_util.get_data(sz, bs, *dataset_stat(), nw=4)
        learner = torch_model.ConvLearnerV3.pretrained(arch, md, ps=0.5)  # dropout 50%
        learner.models_path = 'inceptionresnetv2'
    elif use_torch_model:
        sz = 512  # image size
        bs = 8    # batch size
        arch = get_torch_model(use_torch_model)
        md = data_util.get_data(sz, bs, *dataset_stat(), nw=4)
        learner = torch_model.ConvLearnerV2.pretrained(arch, md, ps=0.5, clip=0.5)  # dropout 50%
        learner.models_path = 'models_v2'
    else:
        sz = 512  # image size
        bs = 8    # batch size
        arch = resnet50  # specify target architecture
        md = data_util.get_data(sz, bs, *dataset_stat(), nw=8)
        learner = torch_model.ConvLearner.pretrained(arch, md, ps=0.5)  # dropout 50%
    return learner
def dataset_stat(skip_analsic=True):
    # train_names = list({f[:36] for f in os.listdir(data_util.TRAIN)})
    label_csv = pd.read_csv(data_util.LABELS)
    train_names = [id for id in label_csv['Id']]
    test_names = list({f[:36] for f in os.listdir(data_util.TEST)})
    tr_n, val_n = train_test_split(train_names, test_size=0.05, random_state=42)
    if not skip_analsic:
        # Estimate per-channel mean and std of the training images.
        bs = 16
        sz = 256
        md = data_util.get_data(sz, bs, tr_n, val_n, test_names)
        x, y = next(iter(md.trn_dl))
        print(x.shape, y.shape)
        # display_imgs(np.asarray(md.trn_ds.denorm(x)))
        x_tot = np.zeros(4)
        x2_tot = np.zeros(4)
        for x, y in iter(md.trn_dl):
            # tmp = md.trn_ds.denorm(x).reshape(16, -1)  # unused (hard-coded batch size)
            x = md.trn_ds.denorm(x).reshape(-1, 4)  # flatten to (pixels, channels)
            x_tot += x.mean(axis=0)
            x2_tot += (x ** 2).mean(axis=0)
        channel_avr = x_tot / len(md.trn_dl)
        channel_std = np.sqrt(x2_tot / len(md.trn_dl) - channel_avr ** 2)
        print(channel_avr, channel_std)
    return tr_n, val_n, test_names
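# Hedged aside (not from the original file): the analysis branch above relies on the
# identity std = sqrt(E[x^2] - E[x]^2), accumulated batch by batch. A minimal
# self-check of that identity on fake data, assuming NumPy is imported as np:
def _check_mean_std_identity():
    data = np.random.rand(1000, 4)  # fake pixel values, 4 channels
    mean = data.mean(axis=0)
    std_direct = data.std(axis=0)
    std_from_identity = np.sqrt((data ** 2).mean(axis=0) - mean ** 2)
    assert np.allclose(std_direct, std_from_identity)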
def create_word_lib_df(train_file_route):
    '''
    Build the word library (vocabulary with IDF values) from the training set.
    :param train_file_route: path to the training file
    :return: None (the word library is written to ./data/word_lib.csv)
    '''
    train_data = du.get_data(train_file_route)
    doc_list = train_data.split('\n')
    doc_num = len(doc_list)  # number of documents
    word_list = []
    for doc in tqdm(doc_list):
        doc_word_list = doc.split('\t')[1].strip().split(' ')
        doc_word_only_list = list(set(doc_word_list))
        word_list.extend(doc_word_only_list)
    word_counter = Counter(word_list)
    word_list = list(word_counter.keys())
    word_count_list = list(word_counter.values())
    idf_list = [math.log(doc_num / x, 2) for x in word_count_list]  # idf(t) = log2(N / df(t))
    word_lib_df = pd.DataFrame(data={'word': word_list, 'idf': idf_list})
    word_lib_df.set_index('word', inplace=True)
    word_lib_df['index'] = list(range(word_lib_df.shape[0]))
    word_lib_route = './data/word_lib.csv'
    word_lib_df.to_csv(word_lib_route)
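# Hedged usage sketch (not from the original file): assuming create_word_lib_df has
# already written ./data/word_lib.csv and pandas is imported as pd, the library can be
# reloaded and queried like this; the word 'trade' is only an illustrative guess.
def _demo_word_lib_lookup(word='trade'):
    word_lib = pd.read_csv('./data/word_lib.csv', index_col='word')
    # 'idf' holds log2(doc_num / df(t)); 'index' is the column position later used
    # when building the sparse feature matrices.
    return word_lib.loc[word, ['idf', 'index']]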
def freeze_data(data_size=500, data_filename="phone_data"):
    input_name = "x"
    label_name = "y"
    np.random.seed(0)
    x, y = data_util.get_data("test")
    samples = np.random.randint(0, len(y), data_size)
    print("use {} data samples".format(data_size))
    if not __debug__:
        print("use samples: {}".format(samples))
    x = x[samples]
    y = np.argmax(y[samples], axis=1) + 1
    print("{} shape: {}".format(input_name, x.shape))
    print("{} shape: {}".format(label_name, y.shape))
    print("save {} to text file at: {}".format(input_name, "data/data.{}.txt".format(input_name)))
    print("save {} to text file at: {}".format(label_name, "data/data.{}.txt".format(label_name)))
    np.savetxt("data/data.{}.txt".format(input_name),
               np.reshape(x, [data_size, np.prod(x.shape) // data_size]), '%.7e')
    np.savetxt("data/data.{}.txt".format(label_name), y, '%d')
    frozen_data_path = "data/data.pb"
    frozen_data_text_path = "data/data.pb.txt"
    # Freeze the sampled data as constants in a graph.
    input_const = tf.constant(x, dtype=tf.float32, shape=x.shape, name=input_name)
    label_const = tf.constant(y, dtype=tf.int32, shape=y.shape, name=label_name)
    graph = tf.get_default_graph()
    with tf.Session() as sess:
        sess.run(input_const)
        sess.run(label_const)
        with tf.gfile.GFile(frozen_data_path, "wb") as f:
            f.write(graph.as_graph_def().SerializeToString())
        print("frozen {} and {} to binary file at: {}".format(input_name, label_name, frozen_data_path))
        with tf.gfile.FastGFile(frozen_data_text_path, "w") as f:
            f.write(str(graph.as_graph_def()))
        print("frozen {} and {} to text file at: {}".format(input_name, label_name, frozen_data_text_path))
    # Reload the frozen graph and verify that the constants round-trip.
    with tf.gfile.GFile(frozen_data_path, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def, input_map=None, return_elements=None,
                            name="", op_dict=None, producer_op_list=None)
    session = tf.Session(graph=graph)
    input_op = graph.get_operation_by_name(input_name).outputs[0]
    label_op = graph.get_operation_by_name(label_name).outputs[0]
    input_op_result = session.run(input_op)
    label_op_result = session.run(label_op)
    assert input_op_result.shape == x.shape
    assert label_op_result.shape == y.shape
    assert np.allclose(x, input_op_result)
    assert np.allclose(y, label_op_result)
    data_util.zip_files("model/{}.zip".format(data_filename), "data/data.*")
def tf_idf(file_route):
    '''
    KNN:
        Accuracy: 0.70
        Macro_F1: 0.6809592503036148
        Micro_F1: 0.7834627683873915
    SVM:
        Macro_F1: 0.3034147735580599
        Micro_F1: 0.7880310644129741
    :param file_route:
    :return:
    '''
    data = du.get_data(file_route)
    doc_list = data.split('\n')
    doc_num = len(doc_list)
    word_lib_df = du.get_word_lib_df()
    word_lib_num = word_lib_df.shape[0]
    label_list = []
    tf_idf = []
    col = []
    row = []
    doc_no = 0
    for doc in tqdm(doc_list):
        label_list.append(doc.split('\t')[0])
        doc_word_list = doc.split('\t')[1].strip().split(' ')
        doc_word_counter = Counter(doc_word_list)
        doc_length = len(doc_word_list)
        for word, count in doc_word_counter.items():
            try:
                one_word_tf = count / doc_length
                # A lookup can fail for the test set: a test-set word may not be in
                # the word library, because the library is built from the training set.
                one_word_idf = word_lib_df.loc[word, 'idf']
                # FIXME: the library built from the training set reports that the word
                # 'nan' from training doc 4543 is missing, even though the library does
                # contain it: KeyError: 'the label [nan] is not in the [index]'
                one_word_tf_idf = one_word_tf * one_word_idf
                row.append(doc_no)
                col.append(word_lib_df.loc[word, 'index'])
                tf_idf.append(one_word_tf_idf)
            except KeyError:
                continue
        doc_no += 1
    tf_idf_train_feature_sparse_matrix = csr_matrix(
        (tf_idf, (row, col)), shape=(doc_num, word_lib_num))
    # print(len(train_label_list), train_label_list[0])
    le = LabelEncoder()
    le.fit(label_list)
    # print(le.classes_)  # shows the eight labels:
    # ['acq' 'crude' 'earn' 'grain' 'interest' 'money-fx' 'ship' 'trade']
    train_label_list = list(le.transform(label_list))
    # print(train_label_list[0: 5])  # encoded labels of the first five training docs, e.g. [2, 0, 2, 2, 2]
    return tf_idf_train_feature_sparse_matrix, train_label_list
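# Hedged usage sketch (not from the original file): the docstring above reports KNN and
# SVM scores, so presumably the sparse features and encoded labels feed a scikit-learn
# classifier. The file paths and classifier settings below are illustrative assumptions.
def _demo_tf_idf_knn(train_route='./data/train.txt', test_route='./data/test.txt'):
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import f1_score
    X_train, y_train = tf_idf(train_route)
    X_test, y_test = tf_idf(test_route)
    clf = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
    pred = clf.predict(X_test)
    return f1_score(y_test, pred, average='macro'), f1_score(y_test, pred, average='micro')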
def test_two():
    # Look at the over-fitting problem
    train_data, test_data, vali_data = data_util.get_data(0.9)
    net = Network([784, 30, 10], cost=CrossEntropyCost)
    net.large_weight_initializer()
    net.SGD(train_data, 400, 10, 0.5,
            evaluation_data=test_data,
            monitor_training_accuracy=True,
            monitor_training_cost=True,
            monitor_evaluation_accuracy=True,
            monitor_evaluation_cost=True)
def test_one():
    train_data, test_data, vali_data = data_util.get_data()
    net = Network([784, 100, 10], cost=CrossEntropyCost)
    net.large_weight_initializer()
    train_cost, train_accuracy, evaluation_cost, evaluation_accuracy = net.SGD(
        train_data, 30, 10, 1,
        evaluation_data=test_data,
        monitor_training_cost=True,
        monitor_training_accuracy=True,
        monitor_evaluation_cost=True,
        monitor_evaluation_accuracy=True)
    print('=========')
    for cost, accuracy in zip(train_cost, train_accuracy):
        print('Training set === cost:', cost, ' accuracy:', accuracy)
    print('=========')
    for cost, accuracy in zip(evaluation_cost, evaluation_accuracy):
        print('Test set === cost:', cost, ' accuracy:', accuracy)
def test_three():
    train_data, test_data, vali_data = data_util.get_data()
    net = Network([784, 30, 10], cost=CrossEntropyCost)
    net.large_weight_initializer()
    train_cost, train_accuracy, evaluation_cost, evaluation_accuracy = net.SGD(
        train_data, 30, 10, 0.5, lmbda=5,
        evaluation_data=test_data,
        monitor_training_accuracy=True,
        monitor_training_cost=True,
        monitor_evaluation_accuracy=True,
        monitor_evaluation_cost=True)
    # Plot cost and accuracy curves for the training and test sets
    train_round = [x for x in range(len(train_cost))]
    evaluation_round = [x for x in range(len(evaluation_cost))]
    draw_accuracy(train_round, train_accuracy, evaluation_round, evaluation_accuracy)
    draw_cost(train_round, train_cost, evaluation_round, evaluation_cost)
Sentiment analysis example using an LM with P-tuning.
"""
from data_util import get_data
from gpt_model import get_model
from loss_evaluate import get_evaluator
from config import *

# Pick the task to run; the last uncommented assignment wins.
task_name = TaskName.CECMMNT
task_name = TaskName.BUSTM
task_name = TaskName.OCNLI
# task_name = TaskName.OCEMOTION
task_name = TaskName.CSLDCP
current_task_name = task_name

# Load the task dataset
train_generator, valid_generator, test_generator, labels = get_data(task_name)
# Build the task model
model = get_model()
# Build the task evaluator
evaluator = get_evaluator(task_name, valid_generator, test_generator, labels)
# Train the model
model.fit_generator(train_generator.forfit(),
                    steps_per_epoch=len(train_generator) * 50,
                    epochs=100,
                    callbacks=[evaluator])
parser.add_argument("--unit", type=int, default=32, help="hidden unit of the LSTM model") parser.add_argument("--epochs", type=int, default=3000, help="training epochs of the LSTM model") args = parser.parse_args() data_util.maybe_prepare_data() print("Begin training model({} layer, {} hidden unit, {} training epochs)..." .format(args.layer, args.unit, args.epochs)) init_time = time.time() # ----------------------------- # step1: Prepare data # ----------------------------- x_train, y_train = data_util.get_data("train") x_test, y_test = data_util.get_data("test") # ----------------------------------- # step2: Define parameters for model # ----------------------------------- config = Config(x_train, args.layer, args.unit, args.epochs) # ------------------------------------------------------ # step3: Build the neural network # ------------------------------------------------------ X = tf.placeholder(tf.float32, [None, config.time_steps, config.input_dim], name="input") Y = tf.placeholder(tf.float32, [None, config.num_classes], name="label") label_prob = lstm_net(X, config)
                    default=32, help="hidden unit of the LSTM model")
args = parser.parse_args()

frozen_model = "data/{}layer{}unit.pb".format(args.layer, args.unit)
if os.path.isfile(frozen_model):
    print("using frozen model: {}".format(frozen_model))
else:
    print("model: {} does not exist!".format(frozen_model))
    exit(1)
init_time = time.time()

# -----------------------------
# step1: load and prepare data
# -----------------------------
x_test, y_test = data_util.get_data("test")
# np.random.seed(0)
# sample_index = np.random.randint(len(y_test), size=sample_size)
sample_index = np.arange(len(y_test))
x_test_sample = x_test[sample_index]
y_test_sample = y_test[sample_index]
# print("x_sample:{}, y_sample:{}".format(x_test_sample.shape, y_test_sample.shape))
init_end_time = time.time()
print("loading data takes {:6.4f} ms".format(
    (init_end_time - init_time) * 1000))
print("predicting cases:")
with tf.gfile.GFile(frozen_model, "rb") as f:
    graph_def = tf.GraphDef()
    avg_p = float(sum(p)) / len(p)
    avg_r = float(sum(r)) / len(r)
    avg_f = float(sum(f)) / len(f)
    avg_a = float(sum(a)) / len(a)
    return avg_p, avg_r, avg_f, avg_a


MAX_SEQUENCE_LENGTH = 50
HIDDEN_SIZE = 200
MAX_SENTENCE = 30
EMBEDDING_DIM = 100
TIMESTAMP_1 = MAX_SEQUENCE_LENGTH
TIMESTAMP_2 = MAX_SENTENCE

X_test, y = get_data('test')
X = deepcopy(X_test)
embedding_matrix, len_word_index = build_embeddingmatrix(X_test)
embedding_layer = Embedding(len_word_index + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
main_input = Input(shape=(MAX_SENTENCE, MAX_SEQUENCE_LENGTH), dtype='float32', name="main_input")
sequence_input = TimeDistributed(embedding_layer, name="sequence_input")(main_input)
gru = GRU(HIDDEN_SIZE,
                             padding='post', truncating='post', value=0)
        X[index] = data
    return numpy.array(X), numpy.array(yi), numpy.array(y)


MAX_SEQUENCE_LENGTH = 50
HIDDEN_SIZE = 200
MAX_SENTENCE = 30
EMBEDDING_DIM = 100
TIMESTAMP_1 = MAX_SEQUENCE_LENGTH
TIMESTAMP_2 = MAX_SENTENCE

X_train, y = get_data('train')
yi = deepcopy(y)
y_train = []
for i in range(len(y)):
    y_train.append(to_categorical(y[i], 2))
word_index = get_word2id(X_train, 'train')
embedding_matrix = build_embeddingmatrix(word_index)
X_train, yi, y_train = vectorize(X_train, yi, y_train, word_index)
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
def tf_dc(file_route):
    '''
    KNN:
        Accuracy: 0.787
        Macro_F1: 0.6607617686270945
        Micro_F1: 0.7866605756052993
    :param file_route:
    :return:
    '''
    data = du.get_data(file_route)
    doc_list = data.split('\n')
    word_lib_df = du.get_word_lib_df()
    label_list = [
        'acq', 'crude', 'earn', 'grain', 'interest', 'money-fx', 'ship', 'trade'
    ]
    word_lib_df['doc_num'] = [0] * word_lib_df.shape[0]  # initialise f(t)
    for label in label_list:
        word_lib_df[label] = [0] * word_lib_df.shape[0]  # initialise f(t, c)
    for doc in tqdm(doc_list):
        doc_label = doc.split('\t')[0]
        doc_word_list = doc.split('\t')[1].strip().split(' ')
        doc_word_counter = Counter(doc_word_list)
        for word in doc_word_counter.keys():
            try:
                word_lib_df.loc[word, 'doc_num'] += 1  # accumulate f(t)
                word_lib_df.loc[word, doc_label] += 1  # accumulate f(t, c)
            except KeyError:
                continue
    # print(word_lib_df.head())
    doc_label_list = []
    row = []
    col = []
    value = []
    doc_no = 0
    for doc in tqdm(doc_list):
        doc_label = doc.split('\t')[0]
        doc_label_list.append(doc_label)
        doc_word_list = doc.split('\t')[1].strip().split(' ')
        doc_length = len(doc_word_list)
        doc_word_counter = Counter(doc_word_list)
        for word, count in doc_word_counter.items():
            try:
                one_word_tf = count / doc_length
                fen_zi = 0  # entropy term of the dc weight
                for label in label_list:
                    if word_lib_df.loc[word, label] != 0:
                        temp = word_lib_df.loc[word, label] / word_lib_df.loc[word, 'doc_num']
                        fen_zi += temp * math.log(temp, 2)
                    else:
                        continue
                one_word_dc = 1 + fen_zi / math.log(len(label_list), 2)
                one_word_tf_dc = one_word_tf * one_word_dc
                # print(word + ':', one_word_tf_dc)
                row.append(doc_no)
                col.append(word_lib_df.loc[word, 'index'])
                value.append(one_word_tf_dc)
            except KeyError:
                continue
        doc_no += 1
    tf_dc_feature_sparse_matrix = csr_matrix(
        (value, (row, col)), shape=(len(doc_label_list), word_lib_df.shape[0]))
    le = LabelEncoder()
    le.fit(doc_label_list)
    return tf_dc_feature_sparse_matrix, le.transform(doc_label_list)
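# Hedged aside (not from the original file): the weight computed above is
# dc(t) = 1 + (1 / log2(|C|)) * sum_c p(c|t) * log2(p(c|t)), with p(c|t) = f(t, c) / f(t).
# A standalone version of that formula, for clarity:
def _dc_weight(f_t_c, f_t):
    """f_t_c: dict mapping class label -> f(t, c), one entry per class (zeros allowed);
    f_t: document frequency f(t)."""
    entropy = sum((c / f_t) * math.log(c / f_t, 2) for c in f_t_c.values() if c > 0)
    return 1 + entropy / math.log(len(f_t_c), 2)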
def tf_bdc(file_route):
    '''
    KNN:
        Accuracy: 0.81
        Macro_F1: 0.6809592503036148
        Micro_F1: 0.7834627683873915
    :param file_route:
    :return:
    '''
    data = du.get_data(file_route)
    doc_list = data.split('\n')
    label_list = []
    for doc in tqdm(doc_list):
        doc_label = doc.split('\t')[0]
        label_list.append(doc_label)
    label_list_counter = Counter(label_list)  # gives f(c)
    one_label_list = label_list_counter.keys()
    # ['earn', 'acq', 'trade', 'ship', 'grain', 'crude', 'interest', 'money-fx']
    word_lib_df = du.get_word_lib_df()
    for label in one_label_list:
        word_lib_df[label] = [0] * word_lib_df.shape[0]  # initialise f(t, c)
    for doc in tqdm(doc_list):
        doc_label = doc.split('\t')[0]
        doc_word_list = doc.split('\t')[1].strip().split(' ')
        doc_word_counter = Counter(doc_word_list)
        for word in doc_word_counter.keys():
            try:
                word_lib_df.loc[word, doc_label] += 1  # accumulate f(t, c)
            except KeyError:
                continue
    row = []
    col = []
    value = []
    doc_no = 0
    for doc in tqdm(doc_list):
        doc_word_list = doc.split('\t')[1].split(' ')
        doc_word_counter = Counter(doc_word_list)
        for word, word_count in doc_word_counter.items():
            try:
                fenzi = 0  # entropy term of the bdc weight
                for label1, label_count1 in label_list_counter.items():
                    if word_lib_df.loc[word, label1] != 0:
                        fen_zi = word_lib_df.loc[word, label1] / label_count1
                        fen_mu = 0
                        for label2, label_count2 in label_list_counter.items():
                            if word_lib_df.loc[word, label2] != 0:
                                fen_mu += word_lib_df.loc[word, label2] / label_count2
                            else:
                                continue
                        temp = fen_zi / fen_mu
                        fenzi += temp * math.log(temp, 2)
                    else:
                        continue
                bdc = 1 + fenzi / math.log(len(label_list_counter), 2)
                tf = word_count / len(doc_word_list)
                tf_bdc = tf * bdc
                # print(word + ':', tf_bdc)
                row.append(doc_no)
                col.append(word_lib_df.loc[word, 'index'])
                value.append(tf_bdc)
            except KeyError:
                continue
        doc_no += 1
    tf_bd_feature_sparse_matrix = csr_matrix(
        (value, (row, col)), shape=(len(doc_list), word_lib_df.shape[0]))
    le = LabelEncoder()
    le.fit(label_list)
    return tf_bd_feature_sparse_matrix, le.transform(label_list)
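# Hedged aside (not from the original file): tf_bdc additionally normalises f(t, c) by
# the class frequency f(c):
# bdc(t) = 1 + (1 / log2(|C|)) * sum_c G(t, c) * log2(G(t, c)),
# where G(t, c) = (f(t, c) / f(c)) / sum_c' (f(t, c') / f(c')). A standalone version:
def _bdc_weight(f_t_c, f_c):
    """f_t_c: class -> f(t, c) (zeros allowed); f_c: class -> f(c), one entry per class."""
    ratios = {c: f_t_c[c] / f_c[c] for c in f_c if f_t_c.get(c, 0) > 0}
    total = sum(ratios.values())
    entropy = sum((r / total) * math.log(r / total, 2) for r in ratios.values())
    return 1 + entropy / math.log(len(f_c), 2)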
        return (output_activations - y)


'''
sigmoid(x) = 1 / (1 + e^-x)
Let g(x) = e^-x, so g'(x) = -e^-x. Then the derivative of sigmoid(x) is
    d/dx sigmoid(x) = [0 * (1 + e^-x) - 1 * (-e^-x)] / (1 + e^-x)^2
                    = e^-x / (1 + e^-x)^2
                    = 1/(1 + e^-x) * e^-x/(1 + e^-x)
                    = sigmoid(x) * e^-x / (1 + e^-x)
                    = sigmoid(x) * (1 - 1/(1 + e^-x))
                    = sigmoid(x) * (1 - sigmoid(x))
The sigmoid_prime function implements this derivative of sigmoid.
'''

# Use a three-layer neural network to recognise single digits
if __name__ == '__main__':
    train_data, test_data, vali_data = data_util.get_data()
    net = Network([784, 30, 10])
    # Train for 40 epochs with mini-batch size 10 and learning rate 1
    net.SGD(train_data, 40, 10, 1, test_data=test_data)
    print('Checking the validation set')
    temp = net.evaluate(vali_data)
    print('Average accuracy on the validation set:', temp / len(vali_data))
    # 30 hidden units, 40 epochs, learning rate 2: average accuracy 0.834
    # 20 hidden units: average accuracy 0.81
    # 20 hidden units, learning rate 4: accuracy 0.80
    # 30 hidden units, 40 epochs, learning rate 1: average accuracy 0.89
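# Hedged numeric check (not from the original file) of the derivation above:
# sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)), compared against a central finite
# difference. Uses only the standard library.
def _check_sigmoid_prime(x=0.7, eps=1e-6):
    from math import exp
    sigmoid = lambda z: 1.0 / (1.0 + exp(-z))
    analytic = sigmoid(x) * (1 - sigmoid(x))
    numeric = (sigmoid(x + eps) - sigmoid(x - eps)) / (2 * eps)
    assert abs(analytic - numeric) < 1e-8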
    rmsp = optimizers.RMSprop(lr=0.01)
    model.compile(optimizer=rmsp,
                  loss='categorical_crossentropy',
                  metrics=['accuracy', Precision, Recall])
    model.summary()
    return model


def train(X_train, X_test, y_train, y_test, epoch, model_exist):
    model = lstm_model()
    if model_exist:
        model.load_weights(model_path)
    history = model.fit(X_train, y_train,
                        epochs=epoch,
                        batch_size=256,
                        validation_data=(X_test, y_test))
    print('best val_acc', max(history.history['val_acc']))
    # Save the model
    model.save(model_path)


if __name__ == '__main__':
    X_train, X_test, y_train, y_test, word_index, index_word = du.get_data()
    train(X_train, X_test, y_train, y_test, epoch=10, model_exist=False)
import fire
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import scipy.optimize as opt

import ml_util
import torch_model
import data_util
import kernel

args = kernel.dataset_stat()
md = data_util.get_data(512, 32, *args, nw=4)
trn_iter = iter(md.trn_dl)
input('Press enter to start')

class_counter = [0] * 28


def count(i):
    class_counter[i] += 1
    return i


for i, bd in zip(range(100), trn_iter):
    # print(bd[0].shape, bd[1].shape)
    labels = bd[1].cpu().data.numpy()
    label_ids = [[count(i) for i, l in enumerate(row) if l > 0.5] for row in labels]
print(class_counter)
files = [["Language, Source, Period, Parametros, RMSE"], ["Language, Source, Period, Parametros, RMSE"], ["Language, Source, Period, Parametros, RMSE"]] languages = ["Assembly", "C", "CPP", "CSharp", "Dart", "Go", "Java", "JavaScript", "Julia", "ObjectiveC", "PHP", "Python", "R", "Ruby", "Rust", "Shell", "Swift"] periods = ["week", "month"] sources = [["projects", "programmers"], ["individuals", "posts"], ["analysis"]] databases = {0: "github", 1: "stackoverflow"} for language in languages: for period in periods: for i in range(len(databases)): for source in sources[i]: time, infected = get_data(databases[i] + "/" + source + "/", period, language) time = np.array(time) infected = np.array(infected) logging.basicConfig(filename='richards.log', filemode='w', level=logging.INFO) pars, pcov = curve_fit(richards, time, infected) error = rmse(infected, richards(time, *pars)) files[i].append(str([language, source, period, pars, error])) github = np.array(files[GITHUB]) stackoverflow = np.array(files[STACKOVERFLOW]) sourceforge = np.array(files[SOURCEFORGE]) np.savetxt("github.txt", github, fmt="%s") np.savetxt("stackoverflow.txt", stackoverflow, fmt="%s") np.savetxt("sourceforge.txt", sourceforge, fmt="%s")
tf.app.flags.DEFINE_integer("batch_size",30,"Batch Size") tf.app.flags.DEFINE_integer("embedding_size",100,"Embedding dimensions of encoder and decoder inputs") tf.app.flags.DEFINE_float("learning_rate",0.01,"Learning rate") tf.app.flags.DEFINE_float("keep_prob",0.5,"keep_prob") tf.app.flags.DEFINE_integer("numEpochs",30,"Maximum # of training epochs") tf.app.flags.DEFINE_string("model_dir","saves/","Path to save model checkpoints") tf.app.flags.DEFINE_string("model_name","ner.ckpt","File name used for model checkpoints") FLAGS = tf.app.flags.FLAGS train_datapath = "dataset/train.txt" test_datapath = "dataset/test.txt" sentence_int_to_vocab, sentence_vocab_to_int, tags_vocab_to_int, tags_int_to_vocab = get_sentence_int_to_vocab(train_datapath,test_datapath) pos_vocab_to_int,pos_int_to_vocab = get_posint() data = get_data(train_datapath) train_data = data vali_data = get_data(test_datapath) with tf.Session() as sess: model = LSTM_CRFModel(FLAGS.num_heads,FLAGS.num_blocks,FLAGS.rnn_size,FLAGS.embedding_size,FLAGS.learning_rate, sentence_vocab_to_int,tags_vocab_to_int,tags_int_to_vocab,pos_vocab_to_int,FLAGS.keep_prob) ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir) if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): print("reloading model parameters....") model.saver.restore(sess, ckpt.model_checkpoint_path) else: print("create new model parameters...") sess.run(tf.global_variables_initializer())