import sys
sys.path.append("../util")
from util import logTool
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
import random

INPUTPATH_TRAIN = "../../data/sample_10"
INPUTPATH_TEST = "../../data/sample_10_test"
OUTPUTPATH_TRAIN = "../../data/sample_10_tfidf"
OUTPUTPATH_TEST = "../../data/sample_10_tfidf_test"

LOG = logTool("../../data/log/tfidf_16more")
LOG.info("Starting")

sentenses_train = []
labels_train = []
LOG.info("Starting loading train data")
cal = 0  # counts how many non-"16" samples were kept as negatives
with open(INPUTPATH_TRAIN, "r") as f:
    for line in f:
        label, sentense = line.split(",", 1)  # split only on the first comma
        if label == "16":
            label = "1"
        else:
            # keep roughly 3% of the non-"16" samples as negatives
            if random.random() <= 0.03:
                label = "0"
                cal = cal + 1
            else:
                continue  # the source file is truncated here; skipping unsampled lines is the presumed intent
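        # --- presumed continuation (the original file is truncated above) ---
        # The appends and training steps below are a sketch based on the
        # near-identical LR/MLP scripts later in this section, not recovered code.
        labels_train.append(label)
        sentenses_train.append(sentense)

print("finish loading train data, kept %s negative samples" % cal)
LOG.info("finish loading train data")

# Vectorize and fit; the vectorizer settings and the choice of classifier are assumptions.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(sentenses_train)
clf = LogisticRegression()
clf.fit(X_train, labels_train)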
# Preprocess the raw data for the computations that follow
import sys
sys.path.append("../util")
from util import csvHelper, logTool

INPUTPATH = "../../data/train_set.csv"
OUTPUTPATH = "../../data/baseProcess"

LOG = logTool("../../data/log/baseProcess")
CSVHELPER_INPUT = csvHelper(INPUTPATH)

f = open(OUTPUTPATH, mode="w")
dataCount = 0
for dataItem in CSVHELPER_INPUT.csvRead():
    if dataCount == 0:  # skip the header row
        dataCount = dataCount + 1
        LOG.info("start calculating")
        continue
    try:
        sentence = dataItem[2]  # sentence
        label = dataItem[3]  # label
        f.write("%s,%s\n" % (label, sentence))
    except Exception as e:
        print("Error: %s,\t,%s" % (type(e), e))
        LOG.error("Error: %s,\t,%s" % (type(e), e))
    finally:
        if dataCount % 5000 == 0:
            print("now we have counted %s dataItems" % dataCount)
            LOG.info("now we have counted %s dataItems" % dataCount)
        dataCount = dataCount + 1
f.close()
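# csvHelper is a project-local utility that is not shown in this section. A
# minimal sketch of the interface the script above relies on: the class name
# and the csvRead generator come from the call sites, the body is an assumption.
import csv

class csvHelper:
    def __init__(self, path):
        self.path = path

    def csvRead(self):
        # Yield each row of the CSV as a list of column values.
        with open(self.path, newline="") as f:
            for row in csv.reader(f):
                yield row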
import datetime
import numpy as np
# util, logTool, and loadGlove are project-local helpers; VOCAB_SIZE is
# defined elsewhere in the original file.

EMBED_SIZE = 100
BATCH_SIZE = 256
TEST_SIZE = 100
NUM_EPOCHS = 30
TEST_RATIO = 0.2
maxlen = 23  # [(L - kmer) / step] + 1
MODEL_select = 'Class'

# ---------------------------------------------------------------------------
# Where the model and its parameters are saved
model_name = "%s_%s" % (MODEL_select, datetime.datetime.now().strftime("%Y-%m-%d"))
# Create the output directory if it does not exist yet
util.mkdir(".../keras_model/leave_one_out_Class/%s/" % model_name)
log = logTool(".../keras_model/leave_one_out_Class/%s/log.txt" % model_name)
log.info('log initiated')
np.random.seed(1671)

# ---------------------------------------------------------------------------
# Data loading and splitting
glove_inputpath = r"...\Data\Class_leave_one_out\keras_GloVeVec_5_100_10000.csv"
inputpath = r"...\Data\Class_leave_one_out\off_Glove.txt"

# Load the pretrained GloVe model
model_glove = loadGlove(glove_inputpath)
embedding_weights = np.zeros((VOCAB_SIZE, EMBED_SIZE))  # number of words x embedding dimensions
for i in range(VOCAB_SIZE):
    embedding_weights[i, :] = model_glove[str(i)]
print('GloVe model loaded')
log.info("GloVe model loaded")
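# loadGlove is not shown in this section. A minimal sketch of what the call
# sites assume: a CSV where each row is a token id followed by its
# EMBED_SIZE-dimensional vector, returned as a dict keyed by the id string.
# The exact file layout is an assumption.
import numpy as np

def loadGlove(path):
    vectors = {}
    with open(path) as f:
        for line in f:
            parts = line.strip().split(",")
            # first column: token id; remaining columns: vector components
            vectors[parts[0]] = np.array(parts[1:], dtype=float)
    return vectors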
EMBED_SIZE = 100
BATCH_SIZE = 256
TEST_SIZE = 100
NUM_EPOCHS = 20
TEST_RATIO = 0.2
maxlen = 23  # [(L - kmer) / step] + 1
MODEL_select = 'Class_Conv_LSTM'
class_weight = dict({1: 1, 0: 250})  # weight class 0 much more heavily (class imbalance)

# ---------------------------------------------------------------------------
model_name = "%s_%s" % (MODEL_select, datetime.datetime.now().strftime("%Y-%m-%d"))
util.mkdir(".../keras_model/Class/%s/" % model_name)
log = logTool(".../keras_model/Class/%s/log.txt" % model_name)
log.info('log initiated')
np.random.seed(1671)

# ---------------------------------------------------------------------------
glove_inputpath = r"...\Data\Class\keras_GloVeVec_5_100_10000.csv"
hek_inputpath = r"...\Data\Class\hek293_off_Glove.txt"
K562_inputpath = r"...\Data\Class\K562_off_Glove.txt"

# Load the pretrained GloVe model
model_glove = loadGlove(glove_inputpath)
embedding_weights = np.zeros((VOCAB_SIZE, EMBED_SIZE))
for i in range(VOCAB_SIZE):
    embedding_weights[i, :] = model_glove[str(i)]
print('GloVe model loaded')
log.info("GloVe model loaded")
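# A minimal sketch of how embedding_weights typically seeds a Keras model of
# the kind MODEL_select names ('Class_Conv_LSTM'): an Embedding layer
# initialized from the GloVe matrix, a 1D convolution, an LSTM, and a sigmoid
# output. Layer sizes are illustrative assumptions, not the original architecture.
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense

model = Sequential()
model.add(Embedding(VOCAB_SIZE, EMBED_SIZE,
                    weights=[embedding_weights], input_length=maxlen))
model.add(Conv1D(64, 3, activation='relu'))   # filter count and width assumed
model.add(MaxPooling1D(2))
model.add(LSTM(32))                           # unit count assumed
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.fit(..., batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, class_weight=class_weight)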
# Word2Vec test
import sys
sys.path.append("../util")
from util import logTool
from gensim.models import Word2Vec

INPUTPATH = "../../data/sample_10"
MODELSAVEDPATH = "../../data/model/Word2Vec2018722_1.model"

LOG = logTool("../../data/log/word2vector")

model = Word2Vec.load(MODELSAVEDPATH)
# print(model.vocabulary.__dict__)
print(len(model.wv.vocab))
print(model.wv['1138901'])  # look up the vector via .wv; direct indexing on the model is deprecated
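# The saved model is loaded above, but its training script is not in this
# section. A minimal sketch of how such a model is typically produced with the
# gensim 3.x API used here; corpus handling and hyperparameters are assumptions.
sentences = []
with open(INPUTPATH) as f:
    for line in f:
        _, sentence = line.split(",", 1)
        sentences.append(sentence.split())  # whitespace-separated token ids

w2v = Word2Vec(sentences, size=100, window=5, min_count=1, workers=4)
w2v.save(MODELSAVEDPATH)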
import numpy as np
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from util import logTool
from model_ml import model_ml
from model_evaluation import model_report

# Configuration
POS_SW_PATH = "../../../data/shopping_review/pos_pp.txt"
NEG_SW_PATH = "../../../data/shopping_review/neg_pp.txt"
LOG_PATH = "../../../data/shopping_review/log.txt"
MODEL_NUM = "1"

# Initialize logging
log = logTool(LOG_PATH)
log.info('log initiated')

# Read the data
pos_sw = []
neg_sw = []
with open(POS_SW_PATH) as f:
    for line in f:
        pos_sw.append(line)
with open(NEG_SW_PATH) as f:
    for line in f:
        neg_sw.append(line)
log.info("data imported")

# Build the label vector: 1 for positive reviews, 0 for negative
y = np.concatenate((np.ones(len(pos_sw)), np.zeros(len(neg_sw))))
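# The script stops after labeling. Judging by the imports above, it continues
# by vectorizing with TF-IDF and splitting before handing off to the
# project-local model_ml / model_report helpers, whose APIs are not shown.
# The steps below sketch that presumed flow; they are not the original code.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(pos_sw + neg_sw)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1671)
# model = model_ml(...)                 # project-local; signature unknown
# model_report(model, X_test, y_test)   # project-local; signature unknown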
EMBED_SIZE = 100
BATCH_SIZE = 256
TEST_SIZE = 100
NUM_EPOCHS = 20
TEST_RATIO = 0.2
maxlen = 23  # [(L - kmer) / step] + 1
MODEL_select = 'Reg'
class_weight = dict({1: 1, 0: 250})  # weight class 0 much more heavily (class imbalance)

# ---------------------------------------------------------------------------
# Where the model and its parameters are saved
model_name = "%s_%s" % (MODEL_select, datetime.datetime.now().strftime("%Y-%m-%d"))
util.mkdir(".../keras_model/Reg/%s/" % model_name)
log = logTool(".../keras_model/Reg/%s/log.txt" % model_name)
log.info('log initiated')
np.random.seed(1671)

# ---------------------------------------------------------------------------
glove_inputpath = r"...\Data\Reg\keras_GloVeVec_5_100_10000.csv"
hek_inputpath = r"...\Data\Reg\hek293_off_Glove.txt"
K562_inputpath = r"...\Data\Reg\K562_off_Glove.txt"

# Load the pretrained GloVe model
model_glove = loadGlove(glove_inputpath)
embedding_weights = np.zeros((VOCAB_SIZE, EMBED_SIZE))
for i in range(VOCAB_SIZE):
    embedding_weights[i, :] = model_glove[str(i)]
print('GloVe model loaded')
log.info("GloVe model loaded")
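# For this 'Reg' variant the classifier head presumably gives way to a linear
# output trained with MSE; a minimal sketch under the same assumptions as the
# Conv_LSTM sketch above (layer sizes assumed).
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

reg_model = Sequential()
reg_model.add(Embedding(VOCAB_SIZE, EMBED_SIZE,
                        weights=[embedding_weights], input_length=maxlen))
reg_model.add(LSTM(32))                       # unit count assumed
reg_model.add(Dense(1, activation='linear'))  # regression output
reg_model.compile(loss='mse', optimizer='adam')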
import sys
sys.path.append("../util")
from util import logTool
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.externals import joblib

INPUTPATH_TRAIN = "../../data/sample_30"
INPUTPATH_TEST = "../../data/sample_10_test"
OUTPUTPATH_TRAIN = "../../data/sample_30_tfidf"
OUTPUTPATH_TEST = "../../data/sample_30_tfidf_test"
MODELPATH = "../../data/model/LR_30_tfidf"

LOG = logTool("../../data/log/tfidf_LR")
LOG.info("Starting")

sentenses_train = []
labels_train = []
LOG.info("Starting loading train data")
with open(INPUTPATH_TRAIN, "r") as f:
    for line in f:
        label, sentense = line.split(",", 1)  # split only on the first comma
        labels_train.append(label)
        sentenses_train.append(sentense)
print("finish loading train data")
LOG.info("finish loading train data")

sentenses_test = []
labels_test = []
LOG.info("Starting loading test data")
with open(INPUTPATH_TEST, "r") as f:
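    # --- presumed continuation (the original file is truncated above) ---
    # The test-loading loop mirrors the train loop; the steps after it follow
    # directly from the imports and MODELPATH, but are a sketch, not recovered code.
    for line in f:
        label, sentense = line.split(",", 1)
        labels_test.append(label)
        sentenses_test.append(sentense)

# Vectorize: fit on train, reuse the fitted vocabulary for test
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(sentenses_train)
X_test = vectorizer.transform(sentenses_test)

# Train, evaluate, and persist the model
clf = LogisticRegression()
clf.fit(X_train, labels_train)
pred = clf.predict(X_test)
print(accuracy_score(labels_test, pred))
print(confusion_matrix(labels_test, pred))
joblib.dump(clf, MODELPATH)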
import sys
sys.path.append("../util")
from util import logTool
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.externals import joblib

INPUTPATH_TRAIN = "../../data/sample_30"
INPUTPATH_TEST = "../../data/sample_10_test"
OUTPUTPATH_TRAIN = "../../data/sample_30_tfidf"
OUTPUTPATH_TEST = "../../data/sample_30_tfidf_test"
MODELPATH = "../../data/model/MLP_30_tfidf"

LOG = logTool("../../data/log/tfidf_MLP")
LOG.info("Starting")

sentenses_train = []
labels_train = []
LOG.info("Starting loading train data")
with open(INPUTPATH_TRAIN, "r") as f:
    for line in f:
        label, sentense = line.split(",", 1)  # split only on the first comma
        labels_train.append(label)
        sentenses_train.append(sentense)
print("finish loading train data")
LOG.info("finish loading train data")

sentenses_test = []
labels_test = []
LOG.info("Starting loading test data")
with open(INPUTPATH_TEST, "r") as f:
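    # This MLP variant is truncated at the same point as the LR script above
    # and presumably continues the same way, swapping in the neural classifier:
    #     clf = MLPClassifier()
    #     clf.fit(X_train, labels_train)
    #     joblib.dump(clf, MODELPATH)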
# Sampling: draw a random subset of the preprocessed data
import sys
sys.path.append("../util")
from util import logTool
import random

INPUTPATH = "../../data/baseProcess"
OUTPUTPATH = "../../data/sample_30"

LOG = logTool("../../data/log/sampling")
sample_rate = 0.3

fout = open(OUTPUTPATH, "w")
with open(INPUTPATH, "r") as f:
    index = 0
    numbers = 0
    for line in f:
        if random.random() <= sample_rate:
            numbers = numbers + 1
            fout.write(line)
        index = index + 1
        if index % 1000 == 0:
            print("we have scanned %s lines and %s of them were selected" % (index, numbers))
            LOG.info("we have scanned %s lines and %s of them were selected" % (index, numbers))
fout.close()