Example #1
import sys
sys.path.append("../util")
from util import logTool
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
import random

INPUTPATH_TRAIN = "../../data/sample_10"
INPUTPATH_TEST = "../../data/sample_10_test"
OUTPUTPATH_TRAIN = "../../data/sample_10_tfidf"
OUTPUTPATH_TEST = "../../data/sample_10_tfidf_test"
LOG = logTool("../../data/log/tfidf_16more")

LOG.info("Starting")
sentenses_train = []
labels_train = []
LOG.info("Starting loading train data")
cal = 0
with open(INPUTPATH_TRAIN, "r") as f:
    for line in f:
        label, sentense = line.split(",", 1)  # split on the first comma only
        if label == "16":
            label = "1"
        else:
            if random.random() <= 0.03:
                label = "0"
                cal = cal + 1
            else:
                continue  # plausible completion: skip unsampled negatives (the original snippet is truncated here)
        labels_train.append(label)
        sentenses_train.append(sentense)
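Example #1 breaks off before the modelling step. Given its imports, a hedged sketch of the presumable continuation; everything past the loaded lists (vectorizer, clf) is an assumption, not the author's confirmed code:

# Hedged sketch of the likely continuation (names assumed).
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(sentenses_train)
clf = MLPClassifier()
clf.fit(X_train, labels_train)
LOG.info("trained on %s sentences (%s sampled negatives)" % (len(labels_train), cal))
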
Example #2
# Preprocess the raw data for the downstream computations

import sys
sys.path.append("../util")
from util import csvHelper, logTool

INPUTPATH = "../../data/train_set.csv"
OUTPUTPATH = "../../data/baseProcess"
LOG = logTool("../../data/log/baseProcess")

CSVHELPER_INPUT = csvHelper(INPUTPATH)
f = open(OUTPUTPATH, mode="w")

dataCount = 0
for dataItem in CSVHELPER_INPUT.csvRead():
    if dataCount == 0:
        dataCount = dataCount + 1
        LOG.info("start calculating")
        continue
    try:
        sentence = dataItem[2]  # the sentence text
        label = dataItem[3]  # the class label
        f.write("%s,%s\n" % (label, sentence))  # write() suffices for a single string
    except Exception as e:
        print("Error: %s\t%s" % (type(e), e))
        LOG.error("Error: %s\t%s" % (type(e), e))
    finally:
        if dataCount % 5000 == 0:
            print("now we have counted %s dataItems" % dataCount)
            LOG.info("now we have counted %s dataItems" % dataCount)
    dataCount = dataCount + 1
f.close()
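csvHelper is a project-local wrapper that is not part of this listing; a minimal sketch of what csvRead plausibly does, assuming it streams rows from the CSV file:

import csv

class csvHelper:
    # Hypothetical reconstruction of the project-local helper.
    def __init__(self, path):
        self.path = path

    def csvRead(self):
        # Yield rows one at a time so large files never load fully into memory.
        with open(self.path, newline="") as fin:
            for row in csv.reader(fin):
                yield row
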
Example #3

import datetime
import numpy as np
# util.mkdir, logTool, and loadGlove are project-local helpers whose imports were
# lost from this snippet; VOCAB_SIZE = 10000 is an assumption that matches the
# "10000" in the GloVe filename below.
VOCAB_SIZE = 10000

EMBED_SIZE = 100
BATCH_SIZE = 256
TEST_SIZE = 100
NUM_EPOCHS = 30
TEST_RATIO = 0.2
maxlen = 23  # [(L-kmer)/step] +1
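# Worked example of the comment above (values assumed, not from the source):
# a sequence of length L = 30 cut into 8-mers with step 1 yields
# (30 - 8) / 1 + 1 = 23 windows, i.e. maxlen = 23.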
MODEL_select = 'Class'

# ----------------------------------------------------------------------------
# Where the model and its parameters are saved
model_name = "%s_%s" % (MODEL_select,
                        datetime.datetime.now().strftime("%Y-%m-%d"))
# Create the target directory if it does not already exist
util.mkdir(".../keras_model/leave_one_out_Class/%s/" % model_name)

log = logTool(".../keras_model/leave_one_out_Class/%s/log.txt" % model_name)
log.info('log initiated')
np.random.seed(1671)
# ----------------------------------------------------------------------------
# Data loading and splitting
glove_inputpath = r"...\Data\Class_leave_one_out\keras_GloVeVec_5_100_10000.csv"  # raw strings keep the Windows backslashes literal
inputpath = r"...\Data\Class_leave_one_out\off_Glove.txt"
# load GloVe model
model_glove = loadGlove(glove_inputpath)
embedding_weights = np.zeros(
    (VOCAB_SIZE, EMBED_SIZE))  # vocabulary size x embedding dimension
for i in range(VOCAB_SIZE):
    embedding_weights[i, :] = model_glove[str(i)]
print('GloVe model loaded')
log.info("GloVe model loaded")
Example #4

# Same assumed imports and VOCAB_SIZE as in Example #3.
EMBED_SIZE = 100
BATCH_SIZE = 256
TEST_SIZE = 100
NUM_EPOCHS = 20
TEST_RATIO = 0.2
maxlen = 23  # [(L-kmer)/step] +1
MODEL_select = 'Class_Conv_LSTM'
class_weight = dict({1: 1, 0: 250})
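# Assumed rationale: the 250x weight on class 0 counters a severe class
# imbalance so the rarer class still contributes meaningfully to the loss.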

# ----------------------------------------------------------------------------
model_name = "%s_%s" % (
    MODEL_select, datetime.datetime.now().strftime("%Y-%m-%d"))

util.mkdir(".../keras_model/Class/%s/" % model_name)

log = logTool(".../keras_model/Class/%s/log.txt" % model_name)
log.info('log initiated')
np.random.seed(1671)

# ----------------------------------------------------------------------------
glove_inputpath = r"...\Data\Class\keras_GloVeVec_5_100_10000.csv"
hek_inputpath = r"...\Data\Class\hek293_off_Glove.txt"
K562_inputpath = r"...\Data\Class\K562_off_Glove.txt"
# load GloVe model
model_glove = loadGlove(glove_inputpath)
embedding_weights = np.zeros((VOCAB_SIZE, EMBED_SIZE))
for i in range(VOCAB_SIZE):
    embedding_weights[i, :] = model_glove[str(i)]
print('GloVe model loaded')
log.info("GloVe model loaded")
Example #5

# word2vec test: load a trained model and inspect it
import sys
sys.path.append("../util")
from util import logTool
from gensim.models import Word2Vec

INPUTPATH = "../../data/sample_10"
MODELSAVEDPATH = "../../data/model/Word2Vec2018722_1.model"
LOG = logTool("../../data/log/word2vector")

model = Word2Vec.load(MODELSAVEDPATH)
# print(model.vocabulary.__dict__)
print(len(model.wv.vocab))  # vocabulary size (gensim 3.x API)
print(model.wv['1138901'])  # look up one token's vector via the wv accessor
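Word2Vec.load implies a training run that is not part of this snippet. A hedged sketch of how such a model could have been produced from INPUTPATH (parameters assumed; gensim 3.x API, matching model.wv.vocab above):

from gensim.models import Word2Vec

sentences = []
with open(INPUTPATH) as fin:
    for line in fin:
        label, sentense = line.split(",", 1)
        sentences.append(sentense.split())  # assumption: space-separated token ids
model = Word2Vec(sentences, size=100, window=5, min_count=1)
model.save(MODELSAVEDPATH)
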
Example #6
import numpy as np
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from util import logTool
from model_ml import model_ml
from model_evaluation import model_report

# Configuration values
POS_SW_PATH = "../../../data/shopping_review/pos_pp.txt"
NEG_SW_PATH = "../../../data/shopping_review/neg_pp.txt"
LOG_PATH = "../../../data/shopping_review/log.txt"
MODEL_NUM = "1"

# Initialize the logger
log = logTool(LOG_PATH)
log.info('log initiated')

# Read the data
pos_sw = []
neg_sw = []
with open(POS_SW_PATH) as f:
    for line in f:
        pos_sw.append(line.strip())  # drop the trailing newline
with open(NEG_SW_PATH) as f:
    for line in f:
        neg_sw.append(line.strip())
log.info("data imported")

# Build the labels: 1 for positive reviews, 0 for negative
y = np.concatenate((np.ones(len(pos_sw)), np.zeros(len(neg_sw))))
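Example #6 is cut off after the labels are built. Given its imports (jieba, TfidfVectorizer, train_test_split), a hedged sketch of the likely next step; every name beyond pos_sw, neg_sw, and y is an assumption:

# Hedged sketch, not the author's confirmed code.
texts = [" ".join(jieba.cut(s)) for s in pos_sw + neg_sw]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
log.info("data vectorized and split")
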
Example #7

# Same assumed imports and VOCAB_SIZE as in Example #3.
EMBED_SIZE = 100
BATCH_SIZE = 256
TEST_SIZE = 100
NUM_EPOCHS = 20
TEST_RATIO = 0.2
maxlen = 23  # [(L-kmer)/step] +1
MODEL_select = 'Reg'
class_weight = dict({1: 1, 0: 250})
# ----------------------------------------------------------------------------
# Where the model and its parameters are saved
model_name = "%s_%s" % (
    MODEL_select, datetime.datetime.now().strftime("%Y-%m-%d"))

util.mkdir(".../keras_model/Reg/%s/" % model_name)

log = logTool(".../keras_model/Reg/%s/log.txt" % model_name)
log.info('log initiated')
np.random.seed(1671)

# ----------------------------------------------------------------------------

glove_inputpath = r"...\Data\Reg\keras_GloVeVec_5_100_10000.csv"
hek_inputpath = r"...\Data\Reg\hek293_off_Glove.txt"
K562_inputpath = r"...\Data\Reg\K562_off_Glove.txt"
# load GloVe model
model_glove = loadGlove(glove_inputpath)
embedding_weights = np.zeros((VOCAB_SIZE, EMBED_SIZE))
for i in range(VOCAB_SIZE):
    embedding_weights[i, :] = model_glove[str(i)]
print('GloVe model loaded')
log.info("GloVe model loaded")
Example #8
import sys
sys.path.append("../util")
from util import logTool
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib  # sklearn.externals.joblib was removed in modern scikit-learn

INPUTPATH_TRAIN = "../../data/sample_30"
INPUTPATH_TEST = "../../data/sample_10_test"
OUTPUTPATH_TRAIN = "../../data/sample_30_tfidf"
OUTPUTPATH_TEST = "../../data/sample_30_tfidf_test"
MODELPATH = "../../data/model/LR_30_tfidf"
LOG = logTool("../../data/log/tfidf_LR")

LOG.info("Starting")
sentenses_train = []
labels_train = []
LOG.info("Starting loading train data")
with open(INPUTPATH_TRAIN, "r") as f:
    for line in f:
        label, sentense = line.split(",", 1)  # split on the first comma only
        labels_train.append(label)
        sentenses_train.append(sentense)
    print("finish loading train data")
    LOG.info("finish loading train data")

sentenses_test = []
labels_test = []
LOG.info("Starting loading test data")
with open(INPUTPATH_TEST, "r") as f:
    # Plausible completion mirroring the train loop; the original snippet is
    # truncated here.
    for line in f:
        label, sentense = line.split(",", 1)
        labels_test.append(label)
        sentenses_test.append(sentense)
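The remainder of Example #8 is also cut off. Given its imports and MODELPATH, a hedged sketch of the presumable rest (vectorize, fit, evaluate, persist); none of it is the author's confirmed code:

# Hedged sketch of the likely remainder (names assumed).
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(sentenses_train)
X_test = vectorizer.transform(sentenses_test)
clf = LogisticRegression()
clf.fit(X_train, labels_train)
pred = clf.predict(X_test)
print(accuracy_score(labels_test, pred))
print(confusion_matrix(labels_test, pred))
joblib.dump(clf, MODELPATH)
LOG.info("model trained, evaluated, and saved")
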
Example #9
import sys
sys.path.append("../util")
from util import logTool
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib  # sklearn.externals.joblib was removed in modern scikit-learn

INPUTPATH_TRAIN = "../../data/sample_30"
INPUTPATH_TEST = "../../data/sample_10_test"
OUTPUTPATH_TRAIN = "../../data/sample_30_tfidf"
OUTPUTPATH_TEST = "../../data/sample_30_tfidf_test"
MODELPATH = "../../data/model/MLP_30_tfidf"
LOG = logTool("../../data/log/tfidf_MLP")

LOG.info("Starting")
sentenses_train = []
labels_train = []
LOG.info("Starting loading train data")
with open(INPUTPATH_TRAIN, "r") as f:
    for line in f:
        label, sentense = line.split(",", 1)  # split on the first comma only
        labels_train.append(label)
        sentenses_train.append(sentense)
    print("finish loading train data")
    LOG.info("finish loading train data")

sentenses_test = []
labels_test = []
LOG.info("Starting loading test data")
with open(INPUTPATH_TEST, "r") as f:
    # Plausible completion mirroring the train loop; the original snippet is
    # truncated here.
    for line in f:
        label, sentense = line.split(",", 1)
        labels_test.append(label)
        sentenses_test.append(sentense)
Example #10
# Sampling: draw a random subset of the preprocessed data
import sys
sys.path.append("../util")
from util import logTool
import random

INPUTPATH = "../../data/baseProcess"
OUTPUTPATH = "../../data/sample_30"
LOG = logTool("../../data/log/sampling")

sample_rate = 0.3
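# With sample_rate = 0.3 each line is kept independently with probability 0.3,
# so the output holds roughly 30% of the input (a binomial draw, not an exact
# count); call random.seed(...) beforehand if reproducible samples are needed.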

with open(OUTPUTPATH, "w") as fout, open(INPUTPATH, "r") as f:
    index = 0
    numbers = 0
    for line in f:
        if random.random() <= sample_rate:
            numbers = numbers + 1
            fout.write(line)
        index = index + 1
        if index % 1000 == 0:
            print("we have scanned %s lines and selected %s of them" %
                  (index, numbers))
            LOG.info("we have scanned %s lines and selected %s of them" %
                     (index, numbers))