# nontopic_2_topic: test every topic with a classifier trained on the non-topic data.
#
# Loads the non-topic data with readNonTopicText() and the per-topic data with
# readTopicData(). For each (topic, data) pair, the non-topic 'text'/'label'
# entries are the training set and the topic's own 'text'/'label' entries are
# the test set passed to classificationTest(); the result is stored in resDict
# under the topic key. Returns resDict.
#
# Uses dict.iteritems(), i.e. this is Python 2 code.
# NOTE(review): `saveAddr` is not defined inside this function — presumably a
#   module-level name; confirm it exists.
# NOTE(review): saveResult() receives `res` (one topic's result), not `resDict`
#   — confirm that is intended.
def nontopic_2_topic(): nonTopicData = readNonTopicText() resDict = {} topicData = readTopicData() for k, v in topicData.iteritems(): test_set, test_label = v['text'], v['label'] train_set, train_label = nonTopicData['text'], nonTopicData['label'] res = classificationTest(train_set, train_label, test_set, test_label) resDict[k] = res saveResult(res, saveAddr) return resDict
# topic_2_topic: cross-topic evaluation over every ordered pair of topics.
#
# Loads the per-topic data with readTopicData() and iterates it in two nested
# loops, so each topic is also paired with itself. For each pair (k1, k2),
# csv_to_train_test(v1, v2, ratio=4, times=10) produces the train/test sets and
# labels, classificationTest() is run on them, and the result is stored in
# resDict under the key k1 + '_' + k2. Returns resDict.
#
# Uses dict.iteritems(), i.e. this is Python 2 code.
# NOTE(review): `saveAddr` is not defined inside this function — presumably a
#   module-level name; confirm it exists.
# NOTE(review): saveResult() receives `res` (a single pair's result), not
#   `resDict` — confirm that is intended.
def topic_2_topic(): resDict = {} topicData = readTopicData() for k1, v1 in topicData.iteritems(): for k2, v2 in topicData.iteritems(): train_set, train_label, test_set, test_label = csv_to_train_test( v1, v2, ratio=4, times=10) res = classificationTest(train_set, train_label, test_set, test_label) resDict[k1 + '_' + k2] = res saveResult(res, saveAddr) return resDict
def run_indomain():
    """Run the non-topic in-domain test several times and save the mean result.

    Reads '../data/non_topic/7500_nontopicTrain.txt' with readNonTopicText().
    Each of the ``count`` (10) rounds calls csv_to_train_test() with the
    non-topic data as both arguments (ratio=10, times=10), evaluates a fresh
    MultinomialNB through classificationTest(), and adds the returned values
    to a 7-slot accumulator.

    The mean over all rounds is printed and written with saveResult() to
    '../data/result/<timestamp>_non2nonres.txt' as
    ``{'non2non': {'_avgRes_<count>': mean}}``.

    Returns:
        None.
    """
    nonTopicData = readNonTopicText(
        addr='../data/non_topic/7500_nontopicTrain.txt')
    print('nontopic indomain test')

    count = 10
    # The accumulator has 7 slots; presumably classificationTest returns
    # 7 numeric metrics per run — confirm against its definition.
    total = np.zeros(7)
    for _ in xrange(count):
        train_set, train_label, test_set, test_label = csv_to_train_test(
            nonTopicData, nonTopicData, ratio=10, times=10)
        total += np.array(
            classificationTest(train_set,
                               train_label,
                               test_set,
                               test_label,
                               classifier=MultinomialNB()))

    # Store the mean, not the running sum, so the saved value agrees with
    # both the printed value and the 'avgRes' key it is stored under.
    average = total / count
    print(average)

    datestr = datetime.datetime.now().strftime('%y_%m_%d_%H_%M_%S')
    allres = {'non2non': {'_avgRes_{}'.format(str(count)): average}}
    saveResult(allres,
               save_addr='../data/result/' + datestr + '_non2nonres.txt')
} allres = { 'resDict_non2topic': resDict_non2topic, 'resDict_mix': resDict_mix, 'resDict_transfer': resDict_transfer, 'topicAccuRes': topicAccuRes, 'avgRes': avgRes } print topicAccuRes.keys() for line in [[x[i] for x in topicAccuRes.values()] for i in range(0, len(topicAccuRes.values()[0]))]: print line print avgRes if not os.path.isdir(topic_addr + '/result'): os.mkdir(topic_addr + '/result') saveResult(allres, save_addr=topic_addr + '/result/' + datestr + '_allres.txt') #print resDict_transfer #saveResult(resDict_non2topic,saveAddr = topic_addr+'/result/'+'non2topic.txt') #saveResult(resDict_mix,saveAddr = topic_addr+'/result/'+str(size)+'_TASC_topic_mix.txt') #saveResult(resDict_transfer,saveAddr = topic_addr+'/result/'+str(size)+'_TASC_2_topic2.txt') #---------------------------- # k='twitter' # v=topicData[k] # resDict_tasc = {} # print 'test in topic "{}"'.format(k) # selected_num = len(v) * 4 # shortlist_num = selected_num * 4 # selected_instances = tasc.get_instance_TASC(k, v, selected_num, shortlist_num) # print 'len(selected_instances): ', len(selected_instances) # test_set, test_label = v['text'], v['label']
def main_expt(
        topic_addr='./data/topic/final',
        non_addr='./data/non_topic/nontopicTrain.txt',
        size=1500,
        instance_addr='./data/out_domain/10000_review_no3.txt.gz',
        vecModel_addr='./data/word_vector_data/word2vec_glove.twitter.27B.100d.txt'
):
    """Run the non-topic, mix and transfer experiments for every topic.

    Three passes are made over the topics returned by
    readTopicData(topic_addr); each pass tests on the topic's own
    'text'/'label' data with a fresh MultinomialNB:

    * non2topic: train on the non-topic data only.
    * mix: train on the non-topic data concatenated with TASC-selected
      instances (lowFreqK=10).
    * transfer: train on the TASC-selected instances only (lowFreqK=10).

    The per-topic results, a per-topic table of each result's first value and
    the per-metric averages are printed and then written with saveResult() to
    ``topic_addr + '/result/<timestamp>_allres.txt'``. The result directory is
    created when missing.

    Args:
        topic_addr: Passed to readTopicData(); also the parent of the
            'result' output directory.
        non_addr: Passed to readNonTopicText() as ``addr``.
        size: Lower bound used when sizing the TASC selection; the number of
            requested instances is ``max(len(topic_data), size) * 5``.
        instance_addr: Forwarded to the TASC constructor.
        vecModel_addr: Forwarded to the TASC constructor.

    Returns:
        None.
    """
    topicData = readTopicData(topic_addr)

    # ------------ non to topic
    nonTopicData = readNonTopicText(addr=non_addr)
    print('nontopic test')
    resDict_non2topic = {}
    for k, v in topicData.iteritems():
        print('test in topic "{}"'.format(k))
        resDict_non2topic[k] = classificationTest(nonTopicData['text'],
                                                  nonTopicData['label'],
                                                  v['text'],
                                                  v['label'],
                                                  classifier=MultinomialNB())

    # When running this from a console, set the paths explicitly: the
    # console's default working directory is the project root, not the
    # directory containing this .py file.
    tasc = TASC(instance_addr=instance_addr, vecModel_addr=vecModel_addr)

    def select_instances(topic, data):
        # Ask for five instances per element of `data`, but never fewer than
        # 5 * size; the shortlist passed to TASC is twice that number.
        selected_num = max(len(data), size) * 5
        shortlist_num = selected_num * 2
        picked = tasc.get_instance_TASC(topic, data, selected_num,
                                        shortlist_num)
        # The doubled space reproduces the output of the two-argument
        # Python 2 print statement.
        print('len(selected_instances):  {}'.format(len(picked)))
        return picked

    # NOTE(review): the mix and transfer passes request the same selection for
    # each topic; if get_instance_TASC is deterministic the result could be
    # reused instead of recomputed — confirm before changing.
    print('mix test')
    resDict_mix = {}
    for k, v in topicData.iteritems():
        print('test in topic "{}"'.format(k))
        selected_instances = select_instances(k, v)
        train_set = pd.concat(
            [nonTopicData['text'], selected_instances['text']])
        train_label = pd.concat(
            [nonTopicData['label'], selected_instances['label']])
        resDict_mix[k] = classificationTest(train_set,
                                            train_label,
                                            v['text'],
                                            v['label'],
                                            lowFreqK=10,
                                            classifier=MultinomialNB())

    print('transfer test')
    resDict_transfer = {}
    for k, v in topicData.iteritems():
        print('test in topic "{}"'.format(k))
        selected_instances = select_instances(k, v)
        resDict_transfer[k] = classificationTest(selected_instances['text'],
                                                 selected_instances['label'],
                                                 v['text'],
                                                 v['label'],
                                                 lowFreqK=10,
                                                 classifier=MultinomialNB())

    print('resDict_non2topic')
    print(resDict_non2topic)
    print('resDict_transfer')
    print(resDict_transfer)
    print('resDict_mix')
    print(resDict_mix)

    datestr = datetime.datetime.now().strftime('%y_%m_%d_%H_%M_%S')

    # Index every column by the same topic list so that row i always refers to
    # the same topic, independent of each dict's own iteration order.
    topics = list(resDict_non2topic)
    # Element 0 of each result is presumably the accuracy (going by the
    # 'topicAccuRes' name) — confirm against classificationTest.
    topicAccuRes = {
        '*topic': topics,
        'resDict_non2topic': [resDict_non2topic[t][0] for t in topics],
        'resDict_mix': [resDict_mix[t][0] for t in topics],
        'resDict_transfer': [resDict_transfer[t][0] for t in topics]
    }

    def getAVGRes(x):
        # Average each of the 7 result positions over all topics.
        return [
            sum([v[i] for v in x.values()]) / len(topicData)
            for i in range(0, 7)
        ]

    avgRes = {
        'resDict_non2topic': getAVGRes(resDict_non2topic),
        'resDict_mix': getAVGRes(resDict_mix),
        'resDict_transfer': getAVGRes(resDict_transfer)
    }
    allres = {
        'resDict_non2topic': resDict_non2topic,
        'resDict_mix': resDict_mix,
        'resDict_transfer': resDict_transfer,
        'topicAccuRes': topicAccuRes,
        'avgRes': avgRes
    }

    # Print the table header (column names) followed by one row per topic.
    print(topicAccuRes.keys())
    columns = topicAccuRes.values()
    for row_index in range(len(topics)):
        print([column[row_index] for column in columns])
    print(avgRes)

    result_dir = topic_addr + '/result'
    if not os.path.isdir(result_dir):
        os.mkdir(result_dir)
    saveResult(allres, save_addr=result_dir + '/' + datestr + '_allres.txt')
import datetime
import os

import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB

from experiment.expt_util import (readTopicData, readNonTopicText,
                                  csv_to_train_test, classificationTest,
                                  saveResult)

# Non-topic in-domain experiment, executed at import time.
# The non-topic data is split by csv_to_train_test (ratio=4, times=10) `count`
# times; each split is evaluated with a fresh MultinomialNB, and the mean of
# the returned values is printed and saved under '../data/result/'.
nonTopicData = readNonTopicText(
    addr='../data/non_topic/7500_nontopicTrain.txt')
print('nontopic indomain test')
resDict_non2topic = {}

count = 10
# Accumulator for the values returned by classificationTest (7 slots).
res = np.zeros(7)
for i in xrange(count):
    train_set, train_label, test_set, test_label = csv_to_train_test(
        nonTopicData, nonTopicData, ratio=4, times=10)
    res += np.array(
        classificationTest(train_set,
                           train_label,
                           test_set,
                           test_label,
                           classifier=MultinomialNB()))

# Turn the accumulated sum into the mean over all rounds.
res /= count
print(res)

datestr = datetime.datetime.now().strftime('%y_%m_%d_%H_%M_%S')
allres = {'non2non': {'avgRes_{}'.format(str(count)): res}}
saveResult(allres,
           save_addr='../data/result/' + datestr + '_non2nonRes.txt')
def run(instance_addr='./data/out_domain/all_review_no3.txt.gz',
        vecModel_addr='./data/word_vector_data/word2vec_glove.twitter.27B.100d.txt',
        para_w=[1, 1, 1],
        save_addr='./data/result/TASC_2_topic.txt',
        topic_addr='./data/topic/final'):
    """Run the non-topic, mix and transfer experiments and save each result.

    Topics come from readTopicData_final(); the non-topic data is read from
    './data/non_topic/nontopicTrain.txt'. Every pass tests on the topic's own
    'text'/'label' data with a fresh MultinomialNB and lowFreqK=10:

    * non2topic: train on the non-topic data only; saved to
      ``topic_addr + '/result/non2topic.txt'``.
    * mix: train on the non-topic data concatenated with TASC-selected
      instances; saved to
      ``topic_addr + '/result/all/<size>_TASC_topic_mix.txt'``.
    * transfer: train on the TASC-selected instances only; saved to
      ``topic_addr + '/result/all/<size>_TASC_topic_transfer.txt'``.

    Args:
        instance_addr: Forwarded to the TASC constructor.
        vecModel_addr: Forwarded to the TASC constructor.
        para_w: Not referenced in the body. The list default is kept for
            interface compatibility and is never mutated here.
        save_addr: Not referenced in the body; output paths are derived from
            ``topic_addr``.
        topic_addr: Base directory for the result files. It is not passed to
            the topic reader.

    Returns:
        None.
    """
    topicData = readTopicData_final()

    # ------------ non to topic
    nonTopicData = readNonTopicText(addr='./data/non_topic/nontopicTrain.txt')
    resDict_non2topic = {}
    for k, v in topicData.iteritems():
        print('test in topic "{}"'.format(k))
        resDict_non2topic[k] = classificationTest(nonTopicData['text'],
                                                  nonTopicData['label'],
                                                  v['text'],
                                                  v['label'],
                                                  lowFreqK=10,
                                                  classifier=MultinomialNB())
    saveResult(resDict_non2topic,
               save_addr=topic_addr + '/result/' + 'non2topic.txt')

    # ------------ transfer
    # When running this from a console, set the paths explicitly: the
    # console's default working directory is the project root, not the
    # directory containing this .py file.
    tasc = TASC(instance_addr=instance_addr, vecModel_addr=vecModel_addr)
    size = 1200

    def select_instances(topic, data):
        # Ask for five instances per element of `data`, but never fewer than
        # 5 * size; the shortlist passed to TASC is twice that number.
        selected_num = max(len(data), size) * 5
        shortlist_num = selected_num * 2
        picked = tasc.get_instance_TASC(topic, data, selected_num,
                                        shortlist_num)
        # The doubled space reproduces the output of the two-argument
        # Python 2 print statement.
        print('len(selected_instances):  {}'.format(len(picked)))
        return picked

    # Mix: non-topic data plus the TASC-selected instances.
    resDict_tasc_mix = {}
    for k, v in topicData.iteritems():
        print('test in topic "{}"'.format(k))
        selected_instances = select_instances(k, v)
        train_set = pd.concat(
            [nonTopicData['text'], selected_instances['text']])
        train_label = pd.concat(
            [nonTopicData['label'], selected_instances['label']])
        resDict_tasc_mix[k] = classificationTest(train_set,
                                                 train_label,
                                                 v['text'],
                                                 v['label'],
                                                 lowFreqK=10,
                                                 classifier=MultinomialNB())
    # The file name must say "mix" so it matches the training data used above.
    saveResult(resDict_tasc_mix,
               save_addr=topic_addr + '/result/all/' + str(size) +
               '_TASC_topic_mix.txt')

    # ----------- transfer only: TASC-selected instances as the training set.
    resDict_tasc_trans = {}
    for k, v in topicData.iteritems():
        print('test in topic "{}"'.format(k))
        selected_instances = select_instances(k, v)
        resDict_tasc_trans[k] = classificationTest(selected_instances['text'],
                                                   selected_instances['label'],
                                                   v['text'],
                                                   v['label'],
                                                   lowFreqK=10,
                                                   classifier=MultinomialNB())
    # The file name must say "transfer" so it matches the training data used.
    saveResult(resDict_tasc_trans,
               save_addr=topic_addr + '/result/all/' + str(size) +
               '_TASC_topic_transfer.txt')

    print(resDict_non2topic)
    print(resDict_tasc_trans)
    print(resDict_tasc_mix)