def load_mul_features(feature_pt, feature_names, rawset_name, will_save):
    """
    Load and column-merge multiple feature matrices, reusing the longest cached
    (md5-named) prefix found on disk and optionally caching the merged result.
    """
    index_begin = 0
    features = None
    for index in reversed(range(1, len(feature_names))):
        f_names_s = '|'.join(feature_names[0:index + 1]) + '|' + rawset_name
        f_names_md5 = hashlib.md5(f_names_s.encode('utf-8')).hexdigest()
        if isfile('%s/md5_%s.smat.npz' % (feature_pt, f_names_md5)):
            index_begin = index
            features = Feature.load('%s/md5_%s.smat' % (feature_pt, f_names_md5))
            break
    LogUtil.log('INFO', 'load %s features from index(%d)' % (rawset_name, index_begin))
    if 1 > index_begin:
        features = Feature.load('%s/%s.%s.smat' % (feature_pt, feature_names[0], rawset_name))
    for index in range(index_begin + 1, len(feature_names)):
        features = Feature.merge_col(
            features,
            Feature.load('%s/%s.%s.smat' % (feature_pt, feature_names[index], rawset_name)))
    features = features.tocsr()
    if will_save and (index_begin < len(feature_names) - 1):
        f_names_s = '|'.join(feature_names) + '|' + rawset_name
        f_names_md5 = hashlib.md5(f_names_s.encode('utf-8')).hexdigest()
        Feature.save(features, '%s/md5_%s.smat' % (feature_pt, f_names_md5))
    return features
def load_smat(ft_fp):
    """
    Load a feature file in SMAT format:
        row_num col_num
        f1_index:f1_value f2_index:f2_value ...
    """
    data = []
    indice = []
    indptr = [0]
    f = open(ft_fp)
    [row_num, col_num] = [int(num) for num in f.readline().strip().split()]
    for line in f:
        line = line.strip()
        subs = line.split()
        for sub in subs:
            [f_index, f_value] = sub.split(":")
            f_index = int(f_index)
            f_value = float(f_value)
            data.append(f_value)
            indice.append(f_index)
        indptr.append(len(data))
    f.close()
    features = csr_matrix((data, indice, indptr), shape=(row_num, col_num), dtype=float)
    LogUtil.log("INFO", "load smat feature file done (%s)" % ft_fp)
    return features
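# A minimal sketch (hypothetical path and values) of the SMAT text format that
# load_smat expects: the first line is "row_num col_num", each following line is one
# row of space-separated "index:value" pairs, and a blank line stands for an all-zero
# row. Assumes load_smat above is importable together with its csr_matrix/LogUtil
# dependencies.
def _demo_load_smat():
    with open('/tmp/example.smat', 'w') as example_f:
        example_f.write('3 4\n')
        example_f.write('0:1.0 2:0.5\n')
        example_f.write('\n')
        example_f.write('1:2.0 3:-1.5\n')
    features = load_smat('/tmp/example.smat')
    assert features.shape == (3, 4)  # csr_matrix with 4 stored non-zeros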
def save_smat(features, ft_pt):
    """
    save features to disk in SMAT format
    :param features: the matrix of features
    :param ft_pt: features file path
    :return: none
    """
    (row_num, col_num) = features.shape
    data = features.data
    indice = features.indices
    indptr = features.indptr
    f = open(ft_pt, 'w')
    f.write("%d %d\n" % (row_num, col_num))
    ind_indptr = 1
    begin_line = True
    for ind_data in range(len(data)):
        while ind_data == indptr[ind_indptr]:
            f.write('\n')
            begin_line = True
            ind_indptr += 1
        if (data[ind_data] < 1e-12) and (data[ind_data] > -1e-12):
            continue
        if (not begin_line) and (ind_data != indptr[ind_indptr - 1]):
            f.write(' ')
        f.write("%d:%s" % (indice[ind_data], data[ind_data]))
        begin_line = False
    while ind_indptr < len(indptr):
        f.write("\n")
        ind_indptr += 1
    LogUtil.log("INFO", "save smat feature file done (%s)" % ft_pt)
    f.close()
def run_gen_feature_extra(cf):
    """
    Generate extra training data.
    :param cf: configuration parser
    :return: none
    """
    # load configuration file
    # cf = ConfigParser.ConfigParser()
    # cf.read(conf_fp)
    feature_pt = cf.get('DEFAULT', 'feature_question_pair_pt')
    feature_qp_names = Feature.get_feature_names_question_pair(cf)
    mc_indexs = FeatureProcessor.get_index_with_max_clique_size(cf, 'test', 4.)
    for f_name in feature_qp_names:
        feature_fp = '%s/%s.test.smat' % (feature_pt, f_name)
        feature_extra_fp = '%s/%s.train_extra.smat' % (feature_pt, f_name)
        has_extra = isfile(feature_extra_fp + ".npz")
        if not has_extra:
            features = Feature.load(feature_fp)
            features_extra = Feature.sample_row(features, mc_indexs)
            Feature.save_smat(features_extra, feature_extra_fp)
            LogUtil.log('INFO', '%s generate extra feature done' % f_name)
        else:
            LogUtil.log('INFO', '%s already has extra feature' % f_name)
def run_gen_feature_with_swap(cf, argv):
    """
    Generate offline feature files that include the swapped question pairs.
    :return: none
    """
    # load configuration file
    # cf = ConfigParser.ConfigParser()
    # cf.read(conf_fp)
    feature_pt = cf.get('DEFAULT', 'feature_question_pair_pt')
    feature_qp_names = Feature.get_feature_names_question_pair(cf)
    rawset_name = argv[0]
    for f_name in feature_qp_names:
        feature_fp = '%s/%s.%s.smat' % (feature_pt, f_name, rawset_name)
        feature_swap_fp = '%s/%s.%s_swap.smat' % (feature_pt, f_name, rawset_name)
        feature_with_swap_fp = '%s/%s.%s_with_swap.smat' % (feature_pt, f_name, rawset_name)
        has_with_swap = isfile(feature_with_swap_fp + '.npz')
        if not has_with_swap:
            features = Feature.load(feature_fp)
            features_swap = Feature.load(feature_swap_fp)
            features_with_swap = Feature.merge_row(features, features_swap)
            Feature.save(features_with_swap, feature_with_swap_fp)
            LogUtil.log('INFO', '%s generate with_swap feature done' % f_name)
        else:
            LogUtil.log('INFO', '%s already has with_swap feature' % f_name)
def generate(config, argv):
    data_name = argv[0]
    LogUtil.log('INFO', 'data_name=%s' % data_name)

    # load data set
    if 'offline' == data_name:
        # load offline valid dataset index
        valid_index_off_fp = '%s/%s.offline.index' % (config.get('DIRECTORY', 'index_pt'),
                                                      config.get('TITLE_CONTENT_CNN', 'valid_index_offline_fn'))
        valid_index_off = DataUtil.load_vector(valid_index_off_fp, 'int')
        valid_index_off = [num - 1 for num in valid_index_off]
        source_file_path = config.get('DIRECTORY', 'source_pt') + '/question_train_set.txt'
        source_data = load_raw_line_from_file(config, source_file_path, valid_index_off)
    elif 'online' == data_name:
        source_file_path = config.get('DIRECTORY', 'source_pt') + '/question_eval_set.txt'
        source_data = open(source_file_path, 'r').readlines()
    else:
        source_data = None

    feature_file_path = '%s/instance_fs_length.%s.smat' % (config.get('DIRECTORY', 'dataset_pt'), data_name)
    feature_file = open(feature_file_path, 'w')
    feature_file.write('%d %d\n' % (len(source_data), 4))
    for line in source_data:
        qid, tc, tw, dc, dw = parse_question_set(line)
        feature = list()
        feature.append(len(tc))
        feature.append(len(tw))
        feature.append(len(dc))
        feature.append(len(dw))
        Feature.save_feature(feature, feature_file)
    feature_file.close()
def load_all(feature_pt, feature_names, rawset_name, will_save=False):
    index_begin = 0
    features = None
    for index in reversed(range(1, len(feature_names))):
        f_names_s = '|'.join(feature_names[0:index + 1]) + '|' + str(rawset_name)
        f_names_md5 = hashlib.md5(f_names_s.encode("utf8")).hexdigest()
        if isfile('%s/md5_%s.smat.npz' % (feature_pt, f_names_md5)):
            index_begin = index
            features = Feature.load('%s/md5_%s.smat' % (feature_pt, f_names_md5))
            break
    LogUtil.log('INFO',
                'load %s features [%s, %s)' % (rawset_name, feature_names[0], feature_names[index_begin]))
    if 1 > index_begin:
        features = Feature.load('%s/%s.%s.smat' % (feature_pt, feature_names[0], rawset_name))
    for index in range(index_begin + 1, len(feature_names)):
        features = Feature.merge_col(
            features,
            Feature.load('%s/%s.%s.smat' % (feature_pt, feature_names[index], rawset_name)))
    features = features.tocsr()
    return features
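# A minimal usage sketch for load_all; the directory, feature names and set name
# below are hypothetical. The function walks the feature-name list from the back,
# reuses the longest cached md5-named matrix it can find on disk, and column-merges
# only the features that are still missing.
def _demo_load_all():
    feature_pt = 'features/question_pair'          # directory holding <name>.<set>.smat.npz files
    feature_names = ['fs_btm_tw_cw', 'fs_length']  # order matters: it is part of the md5 cache key
    features = load_all(feature_pt, feature_names, 'train')
    print(features.shape)                          # (n_instances, total_feature_columns)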
def save_all_question2wids():
    """
    Convert the questions in train.csv / test.csv into lists of word ids.
    :return: none
    """
    LogUtil.log('INFO', 'BEGIN: save all question2wids')
    # load configuration file
    cf = ConfigParser.ConfigParser()
    cf.read("../conf/python.conf")
    # build file paths
    qid2question_question_fp = '%s/qid2question.all.question' % cf.get('DEFAULT', 'devel_pt')
    w2id_fp = '/home/houjianpeng/BTM/output/train_100_50/voca.txt'
    all_question_wids_fp = '/home/houjianpeng/BTM/output/train_100_50/all_doc_wids.txt'
    # load the vocabulary
    w2id = BTM.load_w2id(w2id_fp)
    all_question_f = open(qid2question_question_fp, 'r')
    all_question_wids_f = open(all_question_wids_fp, 'w')
    for line in all_question_f:
        ws = line.strip().split()
        wids = [w2id[w] for w in ws if w in w2id]
        all_question_wids_f.write(' '.join(map(str, wids)) + '\n')
    all_question_f.close()
    all_question_wids_f.close()
    LogUtil.log('INFO', 'END: save all question2wids')
def load_npz(ft_fp):
    loader = np.load('%s.npz' % ft_fp)
    features = csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                          shape=loader['shape'])
    LogUtil.log("INFO", "load npz feature file done (%s)" % ft_fp)
    return features
def generate_batch(self, batch_size):
    n_batch = int(self.length / batch_size)
    if self.length % batch_size != 0:
        n_batch += 1
    LogUtil.log('INFO', '{} {}'.format(n_batch, batch_size))
    slices = np.split(np.arange(n_batch * batch_size), n_batch)
    slices[-1] = slices[-1][:(self.length - batch_size * (n_batch - 1))]
    return slices
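# Worked example (made-up sizes) of the slicing done by generate_batch: with
# self.length = 10 and batch_size = 4, n_batch becomes 3, np.split(np.arange(12), 3)
# yields [0..3], [4..7], [8..11], and the last slice is then truncated to the
# 10 - 4 * 2 = 2 leftover indices, i.e. [8, 9].
def _demo_generate_batch():
    import numpy as np
    length, batch_size = 10, 4
    n_batch = length // batch_size + (1 if length % batch_size != 0 else 0)
    slices = np.split(np.arange(n_batch * batch_size), n_batch)
    slices[-1] = slices[-1][:length - batch_size * (n_batch - 1)]
    assert [s.tolist() for s in slices] == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]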
def generate(config, argv):
    # load valid dataset index
    valid_index_fp = '%s/%s.offline.index' % (config.get('DIRECTORY', 'index_pt'),
                                              config.get('TITLE_CONTENT_CNN', 'valid_index_offline_fn'))
    valid_index = DataUtil.load_vector(valid_index_fp, 'int')
    valid_index = [num - 1 for num in valid_index]

    # load topic btm vec
    topic_btm_vec = load_topic_btm_vec(config)

    # offline / online
    data_name = argv[0]

    dis_func_names = ["cosine", "cityblock", "jaccard", "canberra", "euclidean", "minkowski", "braycurtis"]
    btm_dis_feature_fn = ['vote_fs_btm_dis_%s' % dis_func_name for dis_func_name in dis_func_names]
    btm_dis_feature_f = [open('%s/%s.%s.csv' % (config.get('DIRECTORY', 'dataset_pt'), fn, data_name), 'w')
                         for fn in btm_dis_feature_fn]

    if 'offline' == data_name:
        btm_tw_cw_features = load_features_from_file(config, 'fs_btm_tw_cw', data_name, valid_index)
        LogUtil.log('INFO', 'load_features_from_file, len=%d' % len(btm_tw_cw_features))
        for line_id in range(len(btm_tw_cw_features)):
            doc_vec = btm_tw_cw_features[line_id]
            for dis_id in range(len(dis_func_names)):
                vec = [0.] * 1999
                for topic_id in range(1999):
                    topic_vec = topic_btm_vec[topic_id]
                    if 'minkowski' == dis_func_names[dis_id]:
                        vec[topic_id] = eval(dis_func_names[dis_id])(doc_vec, topic_vec, 3)
                    else:
                        vec[topic_id] = eval(dis_func_names[dis_id])(doc_vec, topic_vec)
                btm_dis_feature_f[dis_id].write('%s\n' % ','.join([str(num) for num in vec]))
    else:
        btm_vec_fp = '%s/fs_btm_tw_cw.%s.csv' % (config.get('DIRECTORY', 'dataset_pt'), data_name)
        btm_vec_f = open(btm_vec_fp, 'r')
        for line in btm_vec_f:
            doc_vec = np.nan_to_num(parse_feature_vec(line))
            for dis_id in range(len(dis_func_names)):
                vec = [0.] * 1999
                for topic_id in range(1999):
                    topic_vec = topic_btm_vec[topic_id]
                    if 'minkowski' == dis_func_names[dis_id]:
                        vec[topic_id] = eval(dis_func_names[dis_id])(doc_vec, topic_vec, 3)
                    else:
                        vec[topic_id] = eval(dis_func_names[dis_id])(doc_vec, topic_vec)
                btm_dis_feature_f[dis_id].write('%s\n' % ','.join([str(num) for num in vec]))

    for f in btm_dis_feature_f:
        f.close()
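# The eval(dis_func_names[dis_id]) calls above assume the distance functions are
# imported into the module namespace by name, presumably from scipy.spatial.distance.
# A small sketch (made-up vectors) of the lookup they rely on:
def _demo_distance_lookup():
    from scipy.spatial.distance import cosine, minkowski
    doc_vec, topic_vec = [1.0, 0.0, 1.0], [0.5, 0.5, 0.0]
    print(cosine(doc_vec, topic_vec))        # same as eval('cosine')(doc_vec, topic_vec)
    print(minkowski(doc_vec, topic_vec, 3))  # minkowski takes the extra order argument, here p=3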
def load_index(fp):
    """
    Load a feature index file.
    """
    f = open(fp)
    indexs = [int(line) for line in f.readlines()]
    LogUtil.log("INFO", "load index done, len(index)=%d" % (len(indexs)))
    f.close()
    return indexs
def get_labels(df):
    """
    Get labels of data set
    :param df: original data set
    :return: label list of data set
    """
    labels = df['is_duplicate'].tolist()
    LogUtil.log("INFO", "num(1)=%d, num(0)=%d" % (sum(labels), len(labels) - sum(labels)))
    return labels
def sample_row(features, indexs):
    """
    Sample rows of the feature matrix by index.
    """
    features_sampled = features[indexs, :]
    (row_num, col_num) = features_sampled.shape
    LogUtil.log("INFO", "row sample done, shape=(%d,%d)" % (row_num, col_num))
    return features_sampled
def stat_dul_question(df):
    """
    Report statistics on duplicated questions
    :param df: original data set
    :return: none
    """
    questions = df['question1'].tolist() + df['question2'].tolist()
    len_questions = len(questions)
    len_uniq_questions = len(set(questions))
    LogUtil.log("INFO", "len(questions)=%d, len(unique_questions)=%d, rate=%f" % (
        len_questions, len_uniq_questions, 1.0 * len_uniq_questions / len_questions))
def test_mytaobao(self):
    driver = self.driver
    self.test_login(driver)
    try:
        main_page = page.MainPage(driver)
        main_page.goto_profile_page()
    except Exception as e:
        self.testCaseInfo.errorinfo = repr(e)
        LogUtil.log('Got error: ' + repr(e))
    else:
        self.testCaseInfo.result = 'Pass'
def init_powerful_word_oside(pword, thresh_num, thresh_rate):
    pword_oside = []
    pword = filter(lambda x: x[1][0] * x[1][3] >= thresh_num, pword)
    pword_oside.extend(
        map(lambda x: x[0], filter(lambda x: x[1][4] >= thresh_rate, pword)))
    LogUtil.log('INFO',
                'One side power words(%d): %s' % (len(pword_oside), str(pword_oside)))
    return pword_oside
def setUp(self):
    self.driver = webdriver.Chrome()
    self.base_url = 'http://www.taobao.com'
    self.testCaseInfo = TestCaseInfo(id='1', name=self.__str__(), owner='Oliver')
    self.testReport = TestReport()
    LogUtil.create_logger_file(__name__)
    self.testCaseInfo.starttime = common.get_current_time()
    LogUtil.log('Open base url: %s' % self.base_url)
def run(cf, argv):
    cmd = argv[0]
    if 'run_gen_feature_with_swap' == cmd:
        FeatureProcessor.run_gen_feature_swap(cf, argv[1:])
        FeatureProcessor.run_gen_feature_with_swap(cf, argv[1:])
    elif 'run_gen_feature_extra' == cmd:
        FeatureProcessor.run_gen_feature_extra(cf)
    elif 'run_gen_feature_with_extra' == cmd:
        FeatureProcessor.run_gen_feature_with_extra(cf, argv[1:])
    else:
        LogUtil.log('WARNING', 'NO CMD (%s)' % cmd)
def generateBOW(df_features, vocab_size):
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    LogUtil.log("INFO", "Start to generate attribute BOW!")
    BagOfWordsExtractor = CountVectorizer(max_features=vocab_size,
                                          analyzer='word',
                                          lowercase=True)
    bow_features = BagOfWordsExtractor.fit_transform(df_features)
    now = datetime.datetime.now()  # refresh so the end time is printed, not the start time again
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    LogUtil.log("INFO", "End to generate attribute BOW!")
    return bow_features.toarray()
def entropy_loss(labels, preds):
    epsilon = 1e-15
    s = 0.
    for idx, l in enumerate(labels):
        assert l == 1 or l == 0
        score = preds[idx]
        score = max(epsilon, score)
        score = min(1 - epsilon, score)
        s += - l * math.log(score) - (1. - l) * math.log(1 - score)
    s /= len(labels)
    LogUtil.log('INFO', 'Entropy loss : %f' % s)
    return s
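# Hand-checked example for entropy_loss (made-up labels and predictions), assuming the
# function above is importable together with its math/LogUtil dependencies:
# loss = -(ln(0.9) + ln(1 - 0.2)) / 2 = (0.10536 + 0.22314) / 2 ≈ 0.16425
def _demo_entropy_loss():
    loss = entropy_loss([1, 0], [0.9, 0.2])
    assert abs(loss - 0.16425) < 1e-4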
def merge_col(features_1, features_2): """ merge features made split by column :param features_1: the first part of features :param features_2: the second part of features :return: feature matrix """ features = hstack([features_1, features_2]) (row_num, col_num) = features.shape LogUtil.log("INFO", "merge col done, shape=(%d,%d)" % (row_num, col_num)) return features
def test_repeat_next(self):
    try:
        driver = self.driver
        main_page = page.MainPage(driver)
        main_page.open(self.base_url)
        assert 'Selenium with Python' in main_page.page_source()
        main_page.repeat_next()
    except Exception as e:
        self.testCaseInfo.errorinfo = repr(e)
        LogUtil.log('Got error: ' + repr(e))
    else:
        self.testCaseInfo.result = 'Pass'
def setUp(self):
    self.driver = webdriver.Chrome()
    # self.driver = webdriver.PhantomJS(service_args=SERVICE_ARGS)
    self.base_url = "http://selenium-python.readthedocs.io/"
    self.testCaseInfo = TestCaseInfo(id='3', name=self.__str__(), owner='Oliver')
    self.testReport = TestReport()
    LogUtil.create_logger_file(__name__)
    self.testCaseInfo.starttime = common.get_current_time()
    LogUtil.log('Open base url: %s' % self.base_url)
def merge_row(features_1, features_2):
    """
    Merge two feature matrices row-wise, i.e. concatenate two data sets.
    :param features_1:
    :param features_2:
    :return:
    """
    features = vstack([features_1, features_2])
    (row_num, col_num) = features.shape
    LogUtil.log("INFO", "merge row done, shape=(%d,%d)" % (row_num, col_num))
    return features
def sample_col(features, indexs):
    """
    Sample columns of the feature matrix by index.
    :param features:
    :param indexs:
    :return:
    """
    features_sampled = features[:, indexs]
    (row_num, col_num) = features_sampled.shape
    LogUtil.log("INFO", "col sample done, shape=(%d,%d)" % (row_num, col_num))
    return features_sampled
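# Small sketch (made-up matrices) of how the merge/sample helpers compose:
# merge_col stacks feature columns side by side, merge_row stacks instances, and
# sample_row/sample_col pick subsets by index. The .tocsr() calls are needed because
# hstack/vstack return COO matrices, which do not support this kind of indexing.
def _demo_merge_and_sample():
    from scipy.sparse import csr_matrix
    a = csr_matrix([[1., 0.], [0., 2.]])  # 2 instances x 2 features
    b = csr_matrix([[3.], [4.]])          # 2 instances x 1 feature
    both = merge_col(a, b).tocsr()        # shape (2, 3): one extra feature per instance
    stacked = merge_row(a, a).tocsr()     # shape (4, 2): two copies of the data set
    assert sample_row(both, [0]).shape == (1, 3)
    assert sample_col(both, [2]).shape == (2, 1)
    assert stacked.shape == (4, 2)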
def train_sgcn(hidden_size, label_size, n_nodes, sgcn_layer, id_list, all_label, batch_size, model_path,
               step_save_model=50, lr=0.001, epoch=10, window=4, gpu_id=0):
    path = ''
    import tensorflow as tf
    with tf.device('/cpu:0'):
        # with tf.device('/device:GPU:%d' % gpu_id):
        graph = tf.Graph()
        with graph.as_default():
            session_conf = tf.ConfigProto(allow_soft_placement=False, log_device_placement=False)
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                model = SGCN(hidden_size, n_nodes, label_size, sgcn_layer, 0)
                global_step = tf.Variable(0, name="global_step", trainable=False)
                optimizer = tf.train.AdamOptimizer(lr)
                grads_and_vars = optimizer.compute_gradients(model.loss)
                train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
                best_eval_accuracy = 0.0
                saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)
                init = tf.global_variables_initializer()
                sess.run(init)
                for epoch_id in range(epoch):  # renamed loop variable so it no longer shadows the `epoch` argument
                    LogUtil.log("INFO", "epoch:{}".format(epoch_id))
                    LogUtil.log("INFO", "start training: {}".format(datetime.datetime.now()))
                    x_train_class = SGCNData(id_list, all_label, window)
                    slices = x_train_class.generate_batch(batch_size)
                    for step in range(len(slices)):
                        LogUtil.log('INFO', 'Training at step:{} '.format(step))
                        i = slices[step]
                        alias_inputs, A, items, node_masks, targets = x_train_class.get_slice(i)
                        targets_onehot = label_to_onehot(targets, label_size)
                        feed_dict = {
                            model.items: items,
                            model.A: A,
                            model.alias_input: alias_inputs,
                            model.node_masks: node_masks,
                            model.labels: targets_onehot,
                            model.dropout: 0.5
                        }
                        _, step, loss, accuracy = sess.run([train_op, global_step, model.loss, model.acc], feed_dict)
                        current_step = tf.train.global_step(sess, global_step)
                        if current_step % step_save_model == 0:
                            time_str = datetime.datetime.now().isoformat()
                            LogUtil.log('INFO', "{}: Training step {}, loss {:g}, acc {:g}".format(
                                time_str, step, loss, accuracy))
                            if accuracy > best_eval_accuracy:
                                best_eval_accuracy = accuracy
                                path = saver.save(sess, model_path, global_step=current_step)
                                LogUtil.log('INFO', "Saved model checkpoint to {}\n".format(path))
    return path
def generate(config, argv):
    data_name = argv[0]

    word_idf_fp = '%s/words.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(word_idf_fp, 'r') as word_idf_f:
        word_idf = json.load(word_idf_f)
    LogUtil.log("INFO", "load word_idf done, len(word_idf)=%d" % len(word_idf))

    char_idf_fp = '%s/chars.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(char_idf_fp, 'r') as char_idf_f:
        char_idf = json.load(char_idf_f)
    LogUtil.log("INFO", "load char_idf done, len(char_idf)=%d" % len(char_idf))

    # load data set
    if 'offline' == data_name:
        # load offline valid dataset index
        valid_index_off_fp = '%s/%s.offline.index' % (
            config.get('DIRECTORY', 'index_pt'),
            config.get('TITLE_CONTENT_CNN', 'valid_index_offline_fn'))
        valid_index_off = DataUtil.load_vector(valid_index_off_fp, 'int')
        valid_index_off = [num - 1 for num in valid_index_off]
        source_file_path = config.get('DIRECTORY', 'source_pt') + '/question_train_set.txt'
        source_data = load_raw_line_from_file(config, source_file_path, valid_index_off)
        features = valid_index_off
    elif 'online' == data_name:
        source_file_path = config.get('DIRECTORY', 'source_pt') + '/question_eval_set.txt'
        source_data = open(source_file_path, 'r').readlines()
        features = range(len(source_data))
    else:
        source_data = None
        features = None

    id_feature_file_path = '%s/instance_fs_id.%s.smat' % (config.get('DIRECTORY', 'dataset_pt'), data_name)
    feature_file = open(id_feature_file_path, 'w')
    feature_file.write('%d %d\n' % (len(source_data), 1))
    for id_num in features:
        feature = list()
        feature.append(id_num % 100000)
        Feature.save_feature(feature, feature_file)
    feature_file.close()