Code Example #1
File: train.py  Project: Batman001/lstm-cnn
def train():

    tensorboard_dir = './tensorboard/Lstm_CNN'
    save_dir = './checkpoints/Lstm_CNN'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    save_path = os.path.join(save_dir, 'best_validation')

    tf.summary.scalar('loss', model.loss)
    tf.summary.scalar('accuracy', model.accuracy)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)
    saver = tf.train.Saver()
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print("Preparing the training data....")
    x_train, y_train = process(pm.train_filename,
                               word_ids,
                               cat_to_id,
                               max_length=300)
    print("Preparing the testing data....")
    x_test, y_test = process(pm.test_filename,
                             word_ids,
                             cat_to_id,
                             max_length=300)
    for epoch in range(pm.num_epochs):
        print('Epoch:', epoch + 1)
        num_batchs = int((len(x_train) - 1) / pm.batch_size) + 1
        batch_train = batch_iter(x_train, y_train, batch_size=pm.batch_size)
        for x_batch, y_batch in batch_train:
            real_seq_len = seq_length(x_batch)
            feed_dict = model.feed_data(x_batch, y_batch, real_seq_len,
                                        pm.keep_prob)
            _, global_step, _summary, train_loss, train_accuracy = session.run(
                [
                    model.optimizer, model.global_step, merged_summary,
                    model.loss, model.accuracy
                ],
                feed_dict=feed_dict)
            if global_step % 100 == 0:
                test_loss, test_accuracy = model.test(session, x_test, y_test)
                print('global_step:', global_step, 'train_loss:', train_loss,
                      'train_accuracy:', train_accuracy, 'test_loss:',
                      test_loss, 'test_accuracy:', test_accuracy)

            if global_step % num_batchs == 0:
                print('Saving Model...')
                saver.save(session, save_path, global_step=global_step)
Code Example #2
    def _load(self, use_user_info):
        '''
        Load up to cache_size lines, continuing from where the previous call stopped.
        :return: True if any lines were read, False once the file is exhausted.
        '''
        if self.max_lines:
            lines = list(
                islice(
                    self.file,
                    min([self.cache_size,
                         self.max_lines - self.current_line])))
        else:
            lines = list(islice(self.file, self.cache_size))
            if self.drop_val and len(lines) < self.cache_size:
                lines = lines[:-val_size]
        if not lines:
            return False
        self.current_line += len(lines)
        if self.shuffle:
            permutation = np.random.permutation(len(lines))
            lines = [lines[i].strip().split('\x01') for i in permutation]
        else:
            lines = [line.strip().split('\x01') for line in lines]
        self.data = [(process(lines[i:i + batch_size],
                              self.token_embedding_level, use_user_info))
                     for i in range(0, len(lines), batch_size)]
        return True
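
A hypothetical sketch (not from the original project) of the attributes _load expects on its owning loader object; the class name and constructor defaults are assumptions, the field names simply mirror what the method uses, and batch_size / val_size remain module-level globals in the original.

class CachedReader:  # name is an assumption
    def __init__(self, path, cache_size=1024, max_lines=None,
                 shuffle=True, drop_val=False, token_embedding_level=None):
        self.file = open(path, encoding='utf-8')   # the text file read in chunks
        self.cache_size = cache_size               # lines pulled per _load call
        self.max_lines = max_lines                 # optional hard cap on lines read
        self.shuffle = shuffle                     # shuffle each cached chunk
        self.drop_val = drop_val                   # drop the validation tail of the last chunk
        self.token_embedding_level = token_embedding_level  # forwarded to process(...)
        self.current_line = 0                      # lines consumed so far
        self.data = []                             # processed batches for the current chunk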
Code Example #3
def load_data(file_path,
              queue,
              offset,
              stride,
              cache_size,
              use_user_info,
              shuffle=True,
              drop_val=True):
    f = open(file_path, encoding="utf-8")
    list(islice(f, offset * cache_size))
    while True:
        lines = list(islice(f, cache_size))
        if drop_val and len(lines) < cache_size:
            lines = lines[:-val_size]
        if not lines:
            f.seek(0)
            list(islice(f, offset * cache_size))
            lines = list(islice(f, cache_size))
        if shuffle:
            permutation = np.random.permutation(len(lines))
            lines = [lines[i].strip().split('\x01') for i in permutation]
        else:
            lines = [line.strip().split('\x01') for line in lines]
        for i in range(0, len(lines), batch_size):
            queue.put(process(lines[i:i + batch_size], None, use_user_info))
        list(islice(f, (stride - 1) * cache_size))
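
This worker reads one shard of the file (selected by offset/stride) and pushes processed batches onto a queue forever. A minimal driving sketch, not taken from the original project: the file name, worker count, and cache size are placeholder assumptions, and batch_size / val_size are still expected as module-level globals.

from multiprocessing import Process, Queue

if __name__ == '__main__':
    q = Queue(maxsize=8)          # bounded so workers cannot run far ahead of the consumer
    n_workers = 4                 # assumption: one process per shard
    workers = [
        Process(target=load_data,
                args=('corpus.txt', q, offset, n_workers, 1024, True),
                daemon=True)      # workers loop forever, so let them die with the parent
        for offset in range(n_workers)
    ]
    for w in workers:
        w.start()
    first_batch = q.get()         # blocks until some worker has produced a batch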
Code Example #4
def get_positions():
    position_dic, position_counter, top_person, positions_set, frame = process()
    model = gensim.models.Word2Vec.load('output_word.model')
    new_positions_dict = {}
    category_list = ['software','product', 'consultant','analyst', 'accounting','manufacturing','sports','banking','fundraiser','fashion','information',
                     'operation', 'market', 'reporter', 'sales', 'finance', 'hr', 'public','technology','business']
    count = 0
    for word in positions_set:
        old_word = word
        new_position = ''
        word = re.sub('[^0-9a-zA-Z]+', ' ', word)
        word_list = word.split()
        max_similarity = 0
        for i in word_list:
            for j in category_list:
                simi = 0
                try:
                    simi = model.similarity(i.lower(), j)
                except KeyError:
                    # skip words that are not in the Word2Vec vocabulary
                    continue
                if simi >= max_similarity:
                    max_similarity = simi
                    new_position = j
        if new_position not in category_list:
            new_position = 'other'
            count += 1
        new_positions_dict[old_word] = new_position
    # print(count)
    print(new_positions_dict)

    return new_positions_dict
Code Example #5
def post_classification_result():
    app.logger.debug('/hitec/classify/concepts/seanmf/run called')

    timestamp = '{:%Y-%m-%d_%H%M%S-%f}'.format(datetime.datetime.now())

    # app.logger.debug(request.data.decode('utf-8'))
    content = json.loads(request.data.decode('utf-8'))

    app.logger.info(content)

    # save content
    dataset = content["dataset"]["documents"]

    with open('data/data_' + timestamp + ".txt", 'w') as out_file:
        for doc in dataset:
            out_file.write(doc["text"] + '\n')

    # get parameter
    params = content["params"]

    # start pre-processing
    data_process.process(timestamp,
                         vocab_min_count=int(params["vocab_min_count"]))

    # start concept detection
    train.train(timestamp, content["method"], float(params["alpha"]),
                float(params["beta"]), int(params["n_topics"]),
                int(params["max_iter"]), float(params["max_err"]),
                params["fix_random"] == "true")

    # prepare results
    topics, doc_topic = results.prepare_results(timestamp)

    res = dict()

    res.update({"topics": topics})
    res.update({"doc_topic": doc_topic})

    # calculate metrics
    metrics = vis_topic.get_metrics(timestamp)
    res.update({"metrics": metrics})

    # cleanup
    utils.cleanup(timestamp)

    # send results back
    return jsonify(res)
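
The view above assumes a Flask app defined elsewhere in the module. A minimal wiring sketch, not part of the original file: the route string is taken from the log message inside the handler, while the port is an arbitrary placeholder.

from flask import Flask, request, jsonify

app = Flask(__name__)  # in the original module this would precede the handler above

# register the handler under the path it logs for itself
app.add_url_rule('/hitec/classify/concepts/seanmf/run',
                 view_func=post_classification_result,
                 methods=['POST'])

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)  # port chosen arbitrarily for this sketch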
Code Example #6
File: gui_app.py  Project: YuNotGary/ICT
    def run(self) -> None:  # actually run to process the tasks
        num = len(self.tasks)  # get the number of tasks
        for n, (fp, sh) in enumerate(self.tasks):  # enumerate each task
            try:  # try/except keeps one bad file from crashing the app
                self.sig_msg.emit(
                    f'<p><span style="color: blue;">[{n + 1}/{num}]</span>, Normalising <b>{fp}</b></p>'
                )  # emit a message telling what's going on
                r = process(fp=fp, shiftInput=sh)  # call the process function to normalise the raw file
                self.sig_result.emit(fp, r)  # emit the results
                self.sig_msg.emit(
                    f'<p><span style="color: blue;">[{n + 1}/{num}]</span>,'
                    f' <span style="color: green;">[SUCCEEDED]</span> to normalise {fp}</p>')  # emit a success message
            except Exception:
                self.sig_msg.emit(
                    f'<p><span style="color: blue;">[{n + 1}/{num}]</span>,'
                    f' <span style="color: red;">[FAILED]</span> to normalise {fp}</p>')  # emit a failure message
        self.sig_finished.emit()  # signal that all tasks have been processed
Code Example #7
    def do_POST(self):
        content_length = int(
            self.headers['Content-Length'])  # <--- Gets the size of data

        self.data_string = self.rfile.read(content_length)

        data = simplejson.loads(self.data_string)

        logging.info("POST request,\nPath: %s\nHeaders:\n%s\n", str(self.path),
                     str(self.headers))

        bpm = data["BPM"]
        audio = data["AudioData"]

        # pass the data to the processing script
        data_to_return = data_process.process(audio, bpm)

        self._set_response()
        json_string = json.dumps(data_to_return)
        self.wfile.write(json_string.encode(encoding='utf_8'))
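
This do_POST belongs to a BaseHTTPRequestHandler subclass whose name and _set_response helper are not shown in the excerpt. A minimal sketch of the surrounding server setup, with the class name and port as placeholder assumptions.

from http.server import BaseHTTPRequestHandler, HTTPServer

class AudioRequestHandler(BaseHTTPRequestHandler):  # hypothetical class name
    def _set_response(self):
        self.send_response(200)
        self.send_header('Content-Type', 'application/json')
        self.end_headers()

    # the do_POST method shown above goes here

if __name__ == '__main__':
    HTTPServer(('0.0.0.0', 8080), AudioRequestHandler).serve_forever()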
Code Example #8
def predict():

    pre_labels = []
    labels = []
    session = tf.Session()
    save_path = tf.train.latest_checkpoint('./checkpoints/Lstm_CNN')
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)

    val_x, val_y = process(pm.val_filename,
                           word_to_ids,
                           cat_to_id,
                           max_length=pm.seq_length)
    batch_val = batch_iter(val_x, val_y, batch_size=64)

    for x_batch, y_batch in batch_val:
        real_seq_len = seq_length(x_batch)
        feed_dict = model.feed_data(x_batch, y_batch, real_seq_len, 1.0)
        predict_label = session.run(model.predict, feed_dict=feed_dict)
        pre_labels.extend(predict_label)
        labels.extend(y_batch)
    return pre_labels, labels
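
A short follow-up sketch (not part of the original file) for scoring the predictions; it assumes pre_labels holds predicted class indices and labels holds the one-hot rows collected from y_batch.

import numpy as np

pre_labels, labels = predict()
true_idx = np.argmax(np.asarray(labels), axis=1)          # one-hot rows -> class indices
accuracy = float(np.mean(np.asarray(pre_labels) == true_idx))
print('validation accuracy:', accuracy)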
Code Example #9
import os
import pandas as pd
import numpy as np
from data_process import process
INPUT_FOLDER = '/labs/colab/3DDD/kaggle_data/sample_images/'
patients = os.listdir(INPUT_FOLDER)
patients.sort()
labels = pd.read_csv('/labs/colab/3DDD/kaggle_data/stage1_labels.csv', index_col=0)
sample_data = []

for num, patientID in enumerate(patients):
    try:
        label = labels.at[patientID, 'cancer']
        rnnInput, label = process(INPUT_FOLDER + patientID, label, rsp=False)
        sample_data.append((rnnInput, label))
        print('Patient', num, 'processed.')
    except KeyError:
        print('Unlabeled patient:', num, 'passed.')


np.save('sample_data.npy', sample_data)
print('pre-processed data saved.')


Code Example #10
import numpy as np
from data_process import process


# generate some data for training
process_data = process()
X = []
y = []
for i in range(60):
    dim = i + 1
    print('curr iter: %d' % i)
    #for N in range(10000, 110000, 10000):
    for N in range(50000, 150000, 10000):
        layer_dims = [np.random.randint(60, 80), 100]
        data = process_data.gen_data(dim, layer_dims, N).astype('float32')
        eigenvalues = process_data.get_eigenvalues(data)
        X.append(eigenvalues)
        y.append(dim)

X = np.array(X)
y = np.array(y)

np.savez('train_data.npz', X=X, y=y)
print('completed data generation')
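
The arrays saved above can be read back with numpy's archive loader; a minimal sketch of the assumed consumer side.

import numpy as np

with np.load('train_data.npz') as archive:
    X, y = archive['X'], archive['y']
print('X:', X.shape, 'y:', y.shape)  # one eigenvalue vector and target dimension per sample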
Code Example #11
File: train_data.py  Project: Kn15263ss/ML2017
def main():

    nltk.download("stopwords")
    data_process = process()

    base_dir, result_dir = data_process.get_path()
    store_path, history_data = data_process.history_data(result_dir, nb_epoch)

    ### read training and testing data
    (Y_data, X_data, tag_list) = data_process.read_data(train_path, True)
    (_, X_test, _) = data_process.read_data(test_path, False)
    all_corpus_temp = X_data + X_test

    train_tag = data_process.to_multi_categorical(Y_data, tag_list)

    # #part3
    # train_tag_all = np.sum(train_tag, axis=0)
    # train_tag_all = np.array(train_tag_all, dtype = "int")
    # for i in range(len(tag_list)):
    #     print (str(tag_list[i])+":"+" "+str(train_tag_all[i]))

    ### RNN
    #-----------------------------------------------------------------------------------------------------------------------------------------
    ### tokenizer for all data
    tokenizer_temp = Tokenizer()
    tokenizer_temp.fit_on_texts(all_corpus_temp)
    word_index_temp = tokenizer_temp.word_index

    stop_words = set(stopwords.words("english"))
    all_corpus = [w for w in word_index_temp if w not in stop_words]

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_corpus)
    word_index = tokenizer.word_index

    tokenizer_file = os.path.join(store_path, "tokenizer.pickle")
    data_process.store_pickle(tokenizer, tokenizer_file)

    label_file = os.path.join(store_path, "label.pickle")
    data_process.store_pickle(tag_list, label_file)

    ### convert word sequences to index sequence
    train_sequences = tokenizer.texts_to_sequences(X_data)
    test_sequences = tokenizer.texts_to_sequences(X_test)

    ### padding to equal length
    train_sequences = pad_sequences(train_sequences, maxlen=max_article_length)
    test_sequences = pad_sequences(test_sequences, maxlen=max_article_length)

    ### split data into training set and validation set
    (X_train, Y_train), (X_val, Y_val) = data_process.split_data(
        train_sequences, train_tag, split_ratio)

    ### get embedding matrix from GloVe
    embedding_dict = data_process.get_embedding_dict('glove.6B.%dd.txt' %
                                                     embedding_dim)
    num_words = len(word_index) + 1
    embedding_matrix = data_process.get_embedding_matrix(
        word_index, embedding_dict, num_words, embedding_dim)

    ### build model
    model = Sequential()
    model.add(
        Embedding(num_words,
                  embedding_dim,
                  weights=[embedding_matrix],
                  input_length=max_article_length,
                  trainable=False))

    model.add(GRU(128, activation='tanh', dropout=0.5))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))

    model.add(Dense(len(tag_list), activation='sigmoid'))
    model.summary()

    #rms = RMSprop(lr = 0.0005)
    model.compile(loss='categorical_crossentropy',
                  optimizer="adam",
                  metrics=[data_process.f1_score])

    earlystopping = EarlyStopping(monitor='val_f1_score',
                                  patience=10,
                                  verbose=1,
                                  mode='max')
    checkpoint = ModelCheckpoint(filepath='best.hdf5',
                                 verbose=1,
                                 save_best_only=True,
                                 save_weights_only=False,
                                 monitor='val_f1_score',
                                 mode='max')
    #csv_logger = CSVLogger('RNN.log')

    hist = model.fit(X_train,
                     Y_train,
                     validation_data=(X_val, Y_val),
                     epochs=nb_epoch,
                     batch_size=batch_size,
                     callbacks=[history_data, earlystopping, checkpoint])
    #callbacks=[history_data,earlystopping,checkpoint,csv_logger])

    data_process.dump_history(store_path, history_data)
    plot_model(model, to_file=os.path.join(store_path, 'model.png'))
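
The excerpt above relies on several module-level settings (paths and hyperparameters) that the listing does not show. The values below are placeholders, listed only so the snippet's free names are accounted for; only max_article_length can be cross-checked against the companion test_data.py excerpt (Code Example #15).

train_path = 'train_data.csv'    # assumed path
test_path = 'test_data.csv'      # assumed path
split_ratio = 0.1                # assumed validation split
embedding_dim = 100              # assumed; must match the glove.6B.%dd.txt file used
max_article_length = 190         # matches the value used in Code Example #15
nb_epoch = 100                   # assumed
batch_size = 128                 # assumed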
Code Example #12
File: run.py  Project: bangrui/CS-6780-Project
    mean_reward = total_reward * 1.0 / n_replications
    var = var * 1.0 / n_replications - mean_reward * mean_reward
    CI = [
        mean_reward - 1.96 * np.sqrt(var / n_replications),
        mean_reward + 1.96 * np.sqrt(var / n_replications)
    ]
    mean_regret = total_regret * 1.0 / n_replications
    return mean_regret, mean_reward, CI


if __name__ == '__main__':

    option = 2  # 1: generate new; 2: read from files
    n_categories = 20
    n_reviews = 20
    featurelist, mean, cov = data.process(option, n_categories, n_reviews)
    print('data processed')
    sigma = 0.8
    n_steps = 2275

    np.random.seed(1)

    #U = user.user(mean, cov, sigma)
    #S = server.server(mean, cov, sigma, featurelist, n_steps)

    #restaurant = S.recommand()
    #print restaurant
    print(server.ALG())

    n_replications = 100
    mean_regret, mean_reward, CI = run(n_replications, n_steps, mean, cov,
Code Example #13
File: run.py  Project: bangrui/CS-6780-Project
    mean_reward = total_reward * 1.0 / n_replications
    var = var * 1.0 / n_replications - mean_reward * mean_reward
    CI = [mean_reward - 1.96 * np.sqrt(var/n_replications), mean_reward + 1.96 * np.sqrt(var/n_replications)]
    mean_regret = total_regret * 1.0 / n_replications
    return mean_regret, mean_reward, CI




if __name__ == '__main__':
    
    option = 2 # 1: generate new; 2: read from files
    n_categories = 20
    n_reviews = 20
    featurelist, mean, cov = data.process(option, n_categories, n_reviews)
    print('data processed')
    sigma = 0.8
    n_steps = 2275
    
    np.random.seed(1)
    
    #U = user.user(mean, cov, sigma)
    #S = server.server(mean, cov, sigma, featurelist, n_steps)
    
    #restaurant = S.recommand()
    #print restaurant
    print(server.ALG())
    
    
    n_replications = 100
Code Example #14
from data_process import Token, process, batch_iter
from bert_classify import *
import os

#os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

best = 0.0
n = 0

print("Loading Training data...")
input_id, input_segment_, mask_, label = Token(pm.train_filename)

test_id, test_segment, test_mask, test_label = Token(pm.test_filename)

label = process(label)


def evaluate(sess, test_id, test_segment, test_mask, test_label):
    A = 1e-10
    for i in range(len(test_label)):
        pre_lab = sess.run(predict,
                           feed_dict={
                               input_x: [test_id[i]],
                               input_segment: [test_segment[i]],
                               mask: [test_mask[i]],
                               keep_pro: 1.0
                           })
        result = test_label[i]
        if int(pre_lab) == int(result):
            A += 1
Code Example #15
File: test_data.py  Project: Kn15263ss/ML2017
import os
import sys
import numpy as np
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from data_process import process

test_path = sys.argv[1]
result_path = sys.argv[2]

max_article_length = 190
range_value = 0.4

data_process = process()

(_, X_test, _) = data_process.read_data(test_path, False)

tag_list = data_process.load_pickle("./model/label.pickle")
tokenizer = data_process.load_pickle("./model/tokenizer.pickle")

test_sequences = tokenizer.texts_to_sequences(X_test)
test_sequences = pad_sequences(test_sequences, maxlen=max_article_length)

model = load_model("./model/best.hdf5", custom_objects={"f1_score": data_process.f1_score})

Y_pred = model.predict(test_sequences)

linfnorm = np.linalg.norm(Y_pred, axis=1, ord=np.inf)
preds = Y_pred.astype(float) / linfnorm[:, None]
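
The listing stops before range_value and result_path are used. A possible continuation, purely an assumption about the intent: keep every tag whose normalised score exceeds range_value and write one line per document to result_path (the CSV layout below is also assumed).

with open(result_path, 'w') as out:
    out.write('"id","tags"\n')
    for idx, row in enumerate(preds):
        tags = [tag_list[i] for i, score in enumerate(row) if score > range_value]
        out.write('"%d","%s"\n' % (idx, ' '.join(tags)))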
Code Example #16
File: main.py  Project: yechunli/traffice_classfy
num_file = len(csvfile_list)
# list of raw dataset files
#filename_list = ['F:\业务分类\sourcecode\datasets\pcap\Game.pcap']
# files that store the processed datasets
#csvfile_list = ['game.csv']
# length of each record
DATA_LENGTH = 1500
# number of records in each dataset file
num_list = []

for index in range(num_file):
    # 'w' opens the file for writing; newline="" avoids a blank line between written rows
    with open(csvfile_list[index], 'w', newline="") as file:
        # truncate each record to 1500 bytes (the UDP header is padded with 12 zeros to
        # align with TCP), map every byte to a value in 0-255, normalise to 0-1,
        # and write the result to the corresponding file in csvfile_list
        num = process(file, filename_list[index], DATA_LENGTH)
        num_list.append(num)

# find the dataset with the fewest records
min_num = min(num_list)
# number of surplus records to drop from each dataset
drop_num = [x - min_num for x in num_list]
# randomly drop the surplus so every dataset matches the smallest one
random_drop(csvfile_list, num_list, drop_num)
# concatenate the files, randomly split into training and test sets, and add labels
data_concat(csvfile_list, train_list, test_list)

train_model(DATA_LENGTH, num_file)
# import numpy as np
# print(x_train)
# print(np.shape(x_train))