def data_to_padding_ids(text_list):
    data_process = DataProcess(use_word2cut=True)
    enc_vocab = data_process.read_vocabulary(data_process.enc_vocab_file)
    enc_padding_ids_list = []
    for text in text_list:
        words = data_process.text_cut_object.cut([text.strip()])
        words_list = words[0].strip().split()
        enc_ids = [enc_vocab.get(word, data_process.__UNK__) for word in words_list]
        if len(enc_ids) > data_process.enc_input_length:
            enc_ids = enc_ids[:data_process.enc_input_length]
        enc_length = len(enc_ids)
        enc_padding_ids = []
        enc_padding_ids.extend([0] * (data_process.enc_input_length - enc_length))
        enc_padding_ids.extend([int(enc_ids[enc_length - l - 1]) for l in range(enc_length)])
        enc_padding_ids_list.append(np.array(enc_padding_ids))
    return np.array(enc_padding_ids_list)
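# Usage sketch (not from the original source): assumes the encoder vocabulary file
# and word-segmentation model expected by DataProcess are available; the sample
# sentences are hypothetical placeholders.
if __name__ == "__main__":
    sample_texts = ["sample sentence one", "sample sentence two"]
    padded_ids = data_to_padding_ids(sample_texts)
    print(padded_ids.shape)  # expected: (len(sample_texts), enc_input_length)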
def run1():
    # get data
    data_process = DataProcess()
    X, Y = data_process.get_all_data()
    X_train, Y_train = data_process.get_train_data()
    X_test, Y_test = data_process.get_test_data()
    # build model
    tfidf_nn = TFIDFNNClassifier(X)
    tfidf_nn.fit(X_train, Y_train, batch_size=128, epochs=10)
    tfidf_nn.save_model("../model/tfidf_nn_model_epoch10.h5")
    # convert character to numeric
    X_test = tfidf_nn.convert(X_test)
    # predict and evaluate
    prob = tfidf_nn.predict_prob(X_test)
    auc = tfidf_nn.auc(Y_test, prob)
    pred = tfidf_nn.predict(X_test)
    acc = tfidf_nn.accuracy(Y_test, pred)
    f1 = tfidf_nn.f1(Y_test, pred)
    cm = tfidf_nn.confusion_matrix(Y_test, pred)
    print("the accuracy is : " + str(acc))
    print("the auc is : " + str(auc))
    print("the f1 score is : " + str(f1))
    print("the confusion matrix is : \n")
    print(cm)
def save_pickle_data(filename, pickle_filename):
    data_process = DataProcess(filename)
    dialog_iter = data_process.create_dialogue_iter(data_process.input_file_path)
    index = 0
    with open(pickle_filename, 'wb') as f_handle:
        while True:
            # data -> (context, utterances, target_id)
            data = next(dialog_iter, None)
            if data is None:
                break
            context, utterances, target_id = data
            tokenized_context, _ = data_process.tokenize(context)
            tokenized_utterances, _ = data_process.tokenize(utterances)
            save_data = [tokenized_context, tokenized_utterances]
            pickle.dump(save_data, f_handle)
            index += 1
            if index % 100 == 0:
                print(index)
    print("%s data save complete!" % index)
def run1():
    # get data
    data_process = DataProcess()
    X, Y = data_process.get_all_data()
    X_train, Y_train = data_process.get_train_data()
    X_test, Y_test = data_process.get_test_data()
    # build model
    lstm_classifier = LSTMClassifier(X)
    lstm_classifier.fit(X_train, Y_train, batch_size=128, epochs=1)
    lstm_classifier.save_model("../model/lstm_model_epoch1.h5")
    # convert character to numeric
    X_test = lstm_classifier.convert(X_test)
    # predict and evaluate
    prob = lstm_classifier.predict_prob(X_test)
    auc = lstm_classifier.auc(Y_test, prob)
    pred = lstm_classifier.predict(X_test)
    acc = lstm_classifier.accuracy(Y_test, pred)
    f1 = lstm_classifier.f1(Y_test, pred)
    cm = lstm_classifier.confusion_matrix(Y_test, pred)
    print("the accuracy is : " + str(acc))
    print("the auc is : " + str(auc))
    print("the f1 score is : " + str(f1))
    print("the confusion matrix is : \n")
    print(cm)
def process_factor(self, db, measure, factor, direction, fillna, check_rp, neutralize):
    if check_rp:
        factor_df = self.influx.getDataMultiprocess(
            db, measure, self.start, self.end, ['code', factor, 'report_period'])
        factor_df = DataProcess.check_report_period(factor_df)
    else:
        factor_df = self.influx.getDataMultiprocess(
            db, measure, self.start, self.end, ['code', factor])
    factor_df.index.names = ['date']
    factor_df.reset_index(inplace=True)
    if direction == -1:
        factor_df[factor] = factor_df[factor] * -1
    # fill missing factor values with the industry median
    if fillna == 'median':
        factor_df = pd.merge(factor_df, self.code_range.reset_index(), how='right', on=['date', 'code'])
        factor_df[factor] = factor_df.groupby(['date', 'industry'])[factor].apply(
            lambda x: x.fillna(x.median()))
    # fill missing factor values with zero
    elif fillna == 'zero':
        factor_df = pd.merge(factor_df, self.code_range.reset_index(), how='right', on=['date', 'code'])
        factor_df[factor] = factor_df[factor].fillna(0)
    else:
        factor_df = pd.merge(factor_df, self.code_range.reset_index(), how='inner', on=['date', 'code'])
        factor_df = factor_df.dropna(subset=[factor])
    factor_df.set_index('date', inplace=True)
    # neutralize / standardize
    if neutralize:
        factor_df = DataProcess.neutralize_v2(factor_df, factor, self.risk_exp, [], self.n_jobs)
    else:
        factor_df = factor_df.loc[:, ['code', factor]]
        factor_df = DataProcess.standardize(factor_df, factor, True, self.n_jobs)
    return factor_df
def get_vocabs(filename, type="train"):
    data_process = DataProcess(filename, type)
    dialog_iter = data_process.dialog_iter
    vocab = set()
    index = 0
    while True:
        # data -> (context, utterances, target_id)
        data = next(dialog_iter, None)
        if data is None:
            break
        example_id, speaker, context, utterances, target_id, candidates_id = data
        tokenized_context, _ = data_process.tokenize(context)
        tokenized_utterances, _ = data_process.tokenize(utterances)
        for context_sentence in tokenized_context:
            for i, c_word in enumerate(context_sentence):
                context_sentence[i] = c_word.lower()
            vocab.update(context_sentence)
        for utterances_sentence in tokenized_utterances:
            for i, u_word in enumerate(utterances_sentence):
                utterances_sentence[i] = u_word.lower()
            vocab.update(utterances_sentence)
        index += 1
        if index % 100 == 0:
            print(index, len(vocab))
    return vocab
def run():
    batch_size = 63
    epochs = 5000
    data_process = DataProcess(use_word2cut=False)
    model = build_model()
    documents_length = data_process.get_documents_size(data_process.enc_ids_file, data_process.dec_ids_file)
    if batch_size > documents_length:
        print("ERROR---> the corpus is too small, please add more data")
        return None
    # adaptive learning rate
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=20, min_lr=1e-6, mode='min')
    '''monitor: the quantity to monitor, e.g. val_loss or val_acc
       patience: number of epochs with no improvement (e.g. the loss no longer decreasing)
                 after which training is stopped
       verbose: verbosity mode
       mode: one of 'auto', 'min', 'max'; in 'min' mode training stops when the monitored
             value stops decreasing, in 'max' mode when it stops increasing.'''
    early_stopping = EarlyStopping(monitor='val_loss', patience=50, verbose=2)
    model.fit_generator(generator=generate_batch(batch_size=batch_size),
                        steps_per_epoch=int(documents_length / batch_size) + 5,
                        validation_data=generate_batch(batch_size=batch_size),
                        validation_steps=int(documents_length / batch_size) + 5,
                        epochs=epochs, verbose=1, workers=2,
                        use_multiprocessing=True,
                        callbacks=[reduce_lr, early_stopping])
    model.save_weights("model/seq2seq_model_weights.h5", overwrite=True)
def process_partition(iterator):
    # initialize DataProcess from the config file; DataProcess mainly sets up the hbase client
    conf = config.Config("process.conf")
    dp = DataProcess(conf)
    for line in iterator:
        dp.get_default("aaaaaaaa")
        result_lines = line.split(' ')
        for item in result_lines:
            yield item
def cal_marginal_ROE(self):
    # get net profit
    net_profit = self.influx.getDataMultiprocess(
        'FinancialReport_Gus', 'net_profit_TTM', self.start, self.end,
        ['code', 'report_period', 'net_profit_TTM', 'net_profit_TTM_last1Q',
         'net_profit_TTM_last4Q', 'net_profit_TTM_last5Q'])
    net_profit.index.names = ['date']
    net_profit.reset_index(inplace=True)
    for i in [0, 1, 4, 5]:
        cur_rps = []
        former_rps = []
        for rp in net_profit['report_period'].unique():
            cur_rps.append(rp)
            former_rps.append(DataProcess.get_former_RP(rp, i))
        rp_dict = dict(zip(cur_rps, former_rps))
        net_profit['profit_last{0}Q_rp'.format(i)] = net_profit['report_period'].map(rp_dict)
    net_profit.drop('report_period', axis=1, inplace=True)
    # get net equity
    net_equity = self.influx.getDataMultiprocess(
        'FinancialReport_Gus', 'net_equity', self.start, self.end,
        ['code', 'report_period', 'net_equity', 'net_equity_last4Q', 'net_equity_last8Q'])
    net_equity.index.names = ['date']
    net_equity.reset_index(inplace=True)
    net_equity['former_equity'] = (net_equity['net_equity_last4Q'] + net_equity['net_equity_last8Q']) / 2
    net_equity['later_equity'] = (net_equity['net_equity'] + net_equity['net_equity_last4Q']) / 2
    cur_rps = []
    former_rps = []
    for rp in net_equity['report_period'].unique():
        cur_rps.append(rp)
        former_rps.append(DataProcess.get_former_RP(rp, 4))
    rp_dict = dict(zip(cur_rps, former_rps))
    net_equity['equity_last4Q_rp'] = net_equity['report_period'].map(rp_dict)
    # --------------------------------------------------------------------------------------
    ROE_df = pd.merge(net_profit, net_equity, how='outer', on=['date', 'code'])
    ROE_df = ROE_df.sort_values(['date', 'code', 'report_period'])
    codes = ROE_df['code'].unique()
    split_codes = np.array_split(codes, self.n_jobs)
    with parallel_backend('multiprocessing', n_jobs=self.n_jobs):
        res = Parallel()(delayed(MarginalROE.JOB_factors)(codes, ROE_df, self.db, self.measure)
                         for codes in split_codes)
    print('marginal ROE finish')
    print('-' * 30)
    fail_list = []
    for r in res:
        fail_list.extend(r)
    return fail_list
def classify_data(cls, data, _type=""):
    """ classify the data """
    data_process = DataProcess()
    data = data_process.classification_data(data=data, _type=_type)
    return data
def merge(cls, df1, df2, on, how="left"):
    """ merge two tables into one """
    data_process = DataProcess()
    data = data_process.merge(df1, df2, on=on, how=how)
    return data
def populateButtonClick(self):
    invalid = self.GetInvalidData()
    process = DataProcess(logf=self.logEntry.get(),
                          csvf=self.csvEntry.get(),
                          ontof=self.ontoEntry.get(),
                          periodf=self.periodEntry.get(),
                          invalid=invalid,
                          sender=self)
    process.Execute()
    return True
def new_table(cls, data_table, columns):
    """ build a new table from selected columns of another table """
    data_process = DataProcess()
    data_table = data_process.new_table(data_table, columns)
    return data_table
def get_student_data(cls, clean_data):
    """ group the data by student """
    data_process = DataProcess()
    student_list = data_process.get_student_data(clean_data)
    return student_list
def get_student_list(self):
    ld = Loader()
    data_processing = DataProcess()
    course_initialization = Course_Initialization()
    clean_data = ld.load_csv(course_initialization.config.get_clean_data_csv_path())
    student_list = data_processing.get_student_data(clean_data)
    return student_list
def run_evaluate(self, sess, type, data_path, test_case=1):
    data_process = DataProcess(self.hparams, data_path, type,
                               word2id=self.word2id, test_case=test_case)
    k_list = self.hparams.recall_k_list
    total_examples = 0
    total_correct = np.zeros([len(k_list)], dtype=np.int32)
    total_mrr = 0
    index = 0
    while True:
        batch_data = data_process.get_batch_data(self.hparams.dev_batch_size, 100)
        if batch_data is None:
            break
        (context, _), (utterances, _), _, _, _, example_id, candidates_id = batch_data
        pred_val, _ = sess.run([self.predictions, self.logits],
                               feed_dict=self.make_feed_dict(batch_data, 1.0))
        pred_val = np.asarray(pred_val)
        num_correct, num_examples = evaluate_recall(pred_val, batch_data[2], k_list)
        total_mrr += mean_reciprocal_rank(pred_val, batch_data[2])
        total_examples += num_examples
        total_correct = np.add(total_correct, num_correct)
        if num_correct[5] != self.hparams.dev_batch_size:
            print(example_id, ":", index, num_correct[5])
        index += 1
        if index % 500 == 0:
            accumulated_accuracy = (total_correct / total_examples) * 100
            print("index : ", index, " | ", accumulated_accuracy)
    avg_mrr = total_mrr / (self.hparams.dev_batch_size * index)
    recall_result = ""
    for i in range(len(k_list)):
        recall_result += "Recall@%s : " % k_list[i] + "%.2f%% | " % ((total_correct[i] / total_examples) * 100)
    self._logger.info(recall_result)
    self._logger.info("MRR: %.4f" % avg_mrr)
    return k_list, (total_correct / total_examples) * 100, avg_mrr
def remove_columns(cls, data_table, columns, axis=1):
    """ remove columns from a table """
    data_process = DataProcess()
    data_table = data_process.remove_columns(data_table=data_table, columns=columns, axis=axis)
    return data_table
def rename_columns(cls, data, re_columns, inplase=True):
    """ rename the columns """
    data_process = DataProcess()
    data = data_process.rename_columns(data=data, re_columns=re_columns, inplase=inplase)
    return data
def generate_real_embedding(text_list):
    data_process = DataProcess(use_word2cut=True)
    dec_vocab = data_process.read_vocabulary(data_process.dec_vocab_file)
    dec_padding_ids_list = []
    for text in text_list:
        words = data_process.text_cut_object.cut([text.strip()])
        words_list = words[0].strip().split()
        dec_ids = [dec_vocab.get(word, data_process.__UNK__) for word in words_list]
        if len(dec_ids) + 2 > data_process.dec_output_length:
            dec_ids = dec_ids[:data_process.dec_output_length - 2]
        dec_length = len(dec_ids)
        dec_padding_ids = []
        dec_padding_ids.extend([data_process.__GO__])
        dec_padding_ids.extend([int(dec_ids[l]) for l in range(dec_length)])
        dec_padding_ids.extend([data_process.__EOS__])
        dec_padding_ids.extend([0] * (data_process.dec_output_length - dec_length - 2))
        dec_padding_ids_list.append(np.array(dec_padding_ids))
    padding_ids = np.array(dec_padding_ids_list)
    dec_vec_model = gensim.models.Word2Vec.load(r'model/decoder_vector.m')
    dec_useful_words = list(dec_vec_model.wv.vocab.keys())
    dec_reverse_vec = data_process.read_reverse_vocabulary(data_process.dec_vocab_file)
    all_dec_embedding = []
    for one_padding_ids in padding_ids:
        dec_embedding = []
        for data in one_padding_ids:
            word = dec_reverse_vec[data]
            if word in dec_useful_words:
                word_embedding = dec_vec_model.wv[word]
            elif word == data_process.__VOCAB__[0]:
                word_embedding = np.zeros(data_process.dec_embedding_length)
            else:
                word_embedding = np.array([1.0] * data_process.dec_embedding_length)
            dec_embedding.append(word_embedding)
        all_dec_embedding.append(dec_embedding)
    return np.array(all_dec_embedding)
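# Usage sketch (not from the original source): assumes model/decoder_vector.m and
# the decoder vocabulary file exist; the target sentences are hypothetical.
if __name__ == "__main__":
    target_texts = ["sample reply one", "sample reply two"]
    dec_embeddings = generate_real_embedding(target_texts)
    # expected shape: (len(target_texts), dec_output_length, dec_embedding_length)
    print(dec_embeddings.shape)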
def make_valid_data(filename, write_file_name):
    data_process = DataProcess(filename)
    dialog_iter = data_process.create_dialogue_iter(data_process.input_file_path)
    input_sum_turn = 0
    input_sum_sentence_len = 0
    with open(write_file_name, "w", encoding='utf-8') as f_handle:
        index = 0
        while True:
            index += 1
            # data -> (context, utterances, target_id)
            data = next(dialog_iter, None)
            if data is None:
                break
            speakers, context, utterances, target_id = data
            context_sentence = context[0].split(" __eot__ ")
            f_handle.write("[%d]" % index + "\n")
            sum_sentence_len = 0
            tot_turn = 0
            for i, sentence in enumerate(context_sentence):
                sentence_len = len(nltk.word_tokenize(sentence))
                if len(sentence) == 0:
                    continue
                sentence_string = speakers[i] + " : " + sentence
                sentence_string = str(sentence_len) + "|" + sentence_string
                f_handle.write(sentence_string + "\n")
                sum_sentence_len += sentence_len
                tot_turn += 1
            avg_sentence_len = sum_sentence_len / tot_turn
            sentence_answer = "Answer : " + utterances[target_id[0]] + "\n"
            f_handle.write(sentence_answer)
            f_handle.write("average sentence length : %.3f" % avg_sentence_len + "\n")
            f_handle.write("total turn number : %d" % tot_turn + '\n')
            f_handle.write("-" * 200 + "\n")
            if index % 500 == 0:
                print(index, ":", "avg_sentence_len - %.3f" % avg_sentence_len, "tot_turn - %d" % tot_turn)
            input_sum_turn += tot_turn
            input_sum_sentence_len += avg_sentence_len
        f_handle.write("average sentence length %.3f" % (input_sum_sentence_len / index))
        f_handle.write("average turn length %.3f" % (input_sum_turn / index))
def monitor(mail_user, mail_pass, mail_to, mail_host):
    before = time.time()
    print("price monitor start.")
    dataprocess = DataProcess('test.db', 'goods')
    send_message = SendMessage()
    ret = dataprocess.sync_with_csv('in.csv')
    if ret is False:
        print("sync with csv file error.")
        return False
    # dataprocess.add_from_csv('in.csv')
    mail_content = ""
    html_parse = HtmlParse()
    ret, goods_data = dataprocess.get_goods()
    if ret is False:
        html_parse.driver.quit()
        return
    for x in goods_data:
        url = x['url']
        price = x['price']
        ret, data = html_parse.get_goods_data(url)
        if ret is False:
            continue
        price_now, goods_name = data
        print("price_now:" + str(price_now))
        # check the stored price first so a missing price cannot break the comparison
        if price is None or price <= 0 or price_now < price:
            ret = dataprocess.update_good(
                url, price_now, goods_name,
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
            if ret is False:
                continue
            mail_content = mail_content + goods_name + "\nprice dropped to: " + str(price_now) + "\n"
    if len(mail_content) != 0:
        mail_sub = "Price drop notification"
        mailto_list = [mail_to]
        mail_port = 465
        send_message.mail_init(mail_host, mail_user, mail_pass, mail_port)
        send_message.send_mail(mailto_list, mail_sub, mail_content)
    else:
        print("No goods have dropped in price.")
    html_parse.quit()
    # dataprocess.export_csv("out.csv")
    after = time.time()
    print('Thread ended, elapsed time: %.2f' % (after - before))
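# Usage sketch (not from the original source): the mail account values below are
# placeholders, and test.db / in.csv are assumed to exist next to the script.
if __name__ == "__main__":
    monitor(mail_user="user@example.com",
            mail_pass="app-specific-password",
            mail_to="notify@example.com",
            mail_host="smtp.example.com")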
def filter_data(cls, origin_log_data, field, field_value=''):
    """ filter rows by the value of a column """
    data_process = DataProcess()
    filter_data_done = data_process.filter_data(origin_log_data=origin_log_data,
                                                field=field,
                                                field_value=field_value)
    return filter_data_done
def test_get_min_spread_from_file_error_on_parse(self, mock_get_data, mock_log, obj_args):
    data_obj = DataProcess(*obj_args)
    mock_get_data.return_value = (
        ' Team P W L D F A Pts\n'
        ' 1. Arsenal 38 26 9 3 79 - 36 87\n'
    )
    data_obj.get_min_spread_from_file()
    mock_get_data.assert_called_once()
    mock_log.assert_called_once_with(
        'Error on parsing the data - please check '
        'properties of DataProcess creation.',
        exc_info=True
    )
def __init__(self, conf, model_dir=".cache/model"):
    self.conf = conf
    self.data_processor = DataProcess()
    self.models = {}
    self.model_dir = model_dir
    con = self.data_processor.connect_db(conf.db_host, conf.db_database, conf.db_user, conf.db_pass)
    classes = self.data_processor.get_big_class(con)
    print(classes)
    for index, cls in classes.iterrows():
        system = cls['business_system_code']
        subclass = cls['rule_type_code']
        self.init(system, subclass)
def uniq_values(self):
    _translate = QtCore.QCoreApplication.translate
    for f in self.files:
        filename = os.path.basename(f)
        try:
            data_proceser = DataProcess(f)
            datas = data_proceser.LB_data()
            if self.row_count == 0:
                try:
                    for j, d in enumerate(datas):
                        item = self.ResultTable.verticalHeaderItem(0)
                        item.setText(_translate("Form", filename))
                        item = self.ResultTable.item(0, j)
                        item.setText(_translate("Form", self.format_value(d)))
                        item = self.ResultTable.item(0, j + 1)
                        item.setText(_translate("Form", '{}-{}'.format(config.B_low, config.B_high)))
                except:
                    for j, d in enumerate(datas):
                        self.ResultTable.setItem(
                            0, j, QtWidgets.QTableWidgetItem(self.format_value(d)))
                        self.ResultTable.setItem(
                            0, j + 1,
                            QtWidgets.QTableWidgetItem('{}-{}'.format(config.B_low, config.B_high)))
                    self.ResultTable.setVerticalHeaderItem(
                        0, QtWidgets.QTableWidgetItem(filename))
            else:
                rows = self.ResultTable.rowCount()
                self.ResultTable.insertRow(rows)
                for j, d in enumerate(datas):
                    self.ResultTable.setItem(
                        rows, j, QtWidgets.QTableWidgetItem(self.format_value(d)))
                    self.ResultTable.setItem(
                        rows, j + 1,
                        QtWidgets.QTableWidgetItem('{}-{}'.format(config.B_low, config.B_high)))
                self.ResultTable.setVerticalHeaderItem(
                    rows, QtWidgets.QTableWidgetItem(filename))
            self.row_count += 1
        except Exception as err:
            raise err
def suar_process(data_source, data_annotation_method, start_index, end_index, province_list):
    dp = DataProcess(
        '../data_preprocessed/suar/',
        data_annotation_method,
        data_source,
        'https://www.sciencedirect.com/science/article/pii/S187705091832163X',
        'suar', {}, {}, 0, 'corpus')
    for f in range(start_index, end_index):
        dp.save_file(
            data_source + '_' + str(f) + '.tsv',
            dp.preprocess(raw_folder + str(f) + '.txt', '', ','.join(province_list),
                          'Saudi', '', header=None))
def run(self):
    start_time = datetime.datetime.now()
    self.initialize_strategy()
    # keep only stocks in the bank industry (CITIC classification)
    self.code_range = self.code_range.loc[self.code_range['industry'] == '银行(中信)']
    self.code_range.reset_index(inplace=True)
    overall_factor = self.factors_combination()
    selection = self.select_codes(overall_factor)
    selection.to_csv(self.folder_dir + 'code_selection.csv', encoding='gbk')
    bm_stk_wgt = self.get_next_bm_stk_wgt()
    bm_stk_wgt.to_csv(self.folder_dir + 'bm_wgt.csv', encoding='gbk')
    # --------------------------backtest--------------------------------
    bt_start = selection.index[0].strftime('%Y%m%d')
    bt_end = bm_stk_wgt.index[-1].strftime('%Y%m%d')
    QE = BacktestEngine(self.strategy_name, bt_start, bt_end, self.adj_interval,
                        self.benchmark, stock_capital=self.capital)
    pvs = []
    portfolio_value = QE.run(selection, bt_start, bt_end)
    portfolio_value = portfolio_value.loc[:, ['TotalValue']]
    portfolio_value.rename(columns={'TotalValue': 'AlphaBank'}, inplace=True)
    pvs.append(portfolio_value)
    QE.stk_portfolio.reset_portfolio(self.capital)
    portfolio_value = QE.run(bm_stk_wgt, bt_start, bt_end)
    portfolio_value = portfolio_value.loc[:, ['TotalValue']]
    portfolio_value.rename(columns={'TotalValue': 'BmBank'}, inplace=True)
    pvs.append(portfolio_value)
    banks_comparation = pd.concat(pvs, axis=1)
    banks_comparation['AccumAlpha'] = \
        DataProcess.calc_accum_alpha(banks_comparation['AlphaBank'], banks_comparation['BmBank']) - 1
    banks_comparation.to_csv(self.folder_dir + 'banks_comaration.csv', encoding='gbk')
    self.logger.info('Bank Comparation:')
    self.logger.info('-ANN_Alpha: %f' % DataProcess.calc_alpha_ann_return(
        banks_comparation['AlphaBank'], banks_comparation['BmBank']))
    MDD, MDD_period = DataProcess.calc_alpha_max_draw_down(
        banks_comparation['AlphaBank'], banks_comparation['BmBank'])
    self.logger.info('-Alpha_MDD: %f' % MDD)
    self.logger.info('-Alpha_MDD period: %s - %s' % (MDD_period[0], MDD_period[1]))
    self.logger.info('-Alpha_sharpe: %f' % DataProcess.calc_alpha_sharpe(
        banks_comparation['AlphaBank'], banks_comparation['BmBank']))
    print('Time used:', datetime.datetime.now() - start_time)
def predict_text(model, enc_embedding):
    data_process = DataProcess(use_word2cut=False)
    dec_vec_model = gensim.models.Word2Vec.load(r'model/decoder_vector.m')
    dec_useful_words = tuple(dec_vec_model.wv.vocab.keys())
    prediction = model.predict_on_batch(enc_embedding)
    prediction_words_list = []
    for elem in prediction:
        prediction_words = []
        for vec in elem:
            dec_dis_list = []
            mse = calculate_mse(vec, np.zeros(data_process.dec_embedding_length))
            dec_dis_list.append(mse)
            for dec_word in dec_useful_words:
                mse = calculate_mse(vec, dec_vec_model.wv[dec_word])
                dec_dis_list.append(mse)
            index = np.argmin(dec_dis_list)
            if index == 0:
                word = data_process.__VOCAB__[0]
            else:
                word = dec_useful_words[index - 1]
            prediction_words.append(word)
        prediction_words_list.append(prediction_words)
    return prediction_words_list
def predict_one_text(model, enc_embedding):
    data_process = DataProcess(use_word2cut=False)
    dec_vec_model = gensim.models.Word2Vec.load(r'model/decoder_vector.m')
    dec_useful_words = list(dec_vec_model.wv.vocab.keys())
    prediction = model.predict(enc_embedding, verbose=0)
    prediction_words = []
    for vec in prediction[0]:
        dec_dis_list = []
        dec_dis = np.sqrt(np.sum(np.square(np.zeros(data_process.dec_embedding_length) - vec)))
        dec_dis_list.append(dec_dis)
        for dec_word in dec_useful_words:
            dec_dis = np.sqrt(np.sum(np.square(dec_vec_model.wv[dec_word] - vec)))
            dec_dis_list.append(dec_dis)
        index = np.argmin(dec_dis_list)
        if index == 0:
            word = data_process.__VOCAB__[0]
        else:
            word = dec_useful_words[index - 1]
        prediction_words.append(word)
    return prediction_words
def print_score(model, enc_embedding):
    data_process = DataProcess(use_word2cut=False)
    dec_vec_model = gensim.models.Word2Vec.load(r'model/decoder_vector.m')
    dec_useful_words = list(dec_vec_model.wv.vocab.keys())
    prediction = model.predict(enc_embedding, verbose=0)
    score_words = []
    for vec in prediction[0]:
        dec_sum = 0
        dec_dis_list = []
        dec_dis = np.sqrt(np.sum(np.square(np.zeros(data_process.dec_embedding_length) - vec)))
        dec_dis_list.append(dec_dis)
        dec_sum += dec_dis
        for dec_word in dec_useful_words:
            dec_dis = np.sqrt(np.sum(np.square(dec_vec_model.wv[dec_word] - vec)))
            dec_dis_list.append(dec_dis)
            dec_sum += dec_dis
        # normalize the distances so the scores at each position sum to 1
        score_words.append(np.array(dec_dis_list) / dec_sum)
    print(score_words)
import itertools
import re

from data_process import DataProcess
from feature_extraction import FeatureExtraction
from evaluation.svm import EvalSVM
from evaluation.knn import EvalKnn
from evaluation.tree import EvalTree
from evaluation.nb import EvalNB
from evaluation.logReg import EvalLogReg

# Init the data process and feature extraction object
data_process = DataProcess()
feature_extraction = FeatureExtraction()

data_content, data_lable = data_process.load_data('dataset/5000_seq.csv')
data_process.extract_n_p_total(data_lable)

# preprocess data
processed_data = data_process.pre_process(data_content)
processed_data = data_process.lemmatizer(processed_data)
# pprint (processed_data)

# vectorizer data
vectorized_data = feature_extraction.tfidf_vectorizer(processed_data)