def TASC_2_topic(
        instance_addr='../data/out_domain/10000_review_no3.txt.gz',
        vecModel_addr='../data/word_vector_data/word2vec_glove.twitter.27B.100d.txt',
        para_w=None,
        saveAddr='../data/result/TASC_2_topic.txt',
        topic_addr='../data/topic'):
    """Evaluate every topic with a classifier trained on TASC-selected instances.

    For each topic returned by ``readTopicData(topic_addr)``, instances are
    requested from ``TASC.get_instance_TASC``, a ``MultinomialNB`` classifier
    is trained on their ``'text'``/``'label'`` columns and tested against the
    topic's own ``'text'``/``'label'`` columns via ``classificationTest``.

    Args:
        instance_addr: Forwarded to ``TASC`` as ``instance_addr``.
        vecModel_addr: Forwarded to ``TASC`` as ``vecModel_addr``.
        para_w: Forwarded to ``TASC`` as ``para_w``. ``None`` (the default)
            means ``[1, 1, 1]``; a fresh list is built per call so the
            default is never shared between calls.
        saveAddr: Where to save the results; an empty string disables both
            the printing and the saving of the result dict.
        topic_addr: Forwarded to ``readTopicData``.

    Returns:
        dict mapping each topic key to the value returned by
        ``classificationTest`` for that topic.
    """
    if para_w is None:
        para_w = [1, 1, 1]

    topicData = readTopicData(topic_addr)
    tasc = TASC(instance_addr=instance_addr,
                vecModel_addr=vecModel_addr,
                para_w=para_w)

    resDict = {}
    for k, v in topicData.iteritems():
        print('test for topic ' + k)
        # Request four times the topic size, but never fewer than 400.
        selected_num = len(v) * 4 if len(v) > 100 else 400
        shortlist_num = selected_num * 4
        selected_instances = tasc.get_instance_TASC(k, v, selected_num,
                                                    shortlist_num)
        test_set, test_label = v['text'], v['label']
        train_set = selected_instances['text']
        train_label = selected_instances['label']
        resDict[k] = classificationTest(train_set, train_label,
                                        test_set, test_label,
                                        lowFreqK=2,
                                        classifier=MultinomialNB())

    if saveAddr != '':
        print(resDict)
        saveResult(resDict, saveAddr)
    return resDict
def TASC_2_topic(saveAddr='', para_w=None):
    """Evaluate every topic with a classifier trained on TASC-selected instances.

    For each topic returned by ``readTopicData()``, four times the topic's
    length is requested from ``TASC.get_instance_TASC`` and the selected
    ``'text'``/``'label'`` columns are used as training data for
    ``classificationTest`` against the topic's own data.

    Args:
        saveAddr: Where to save the results; an empty string (the default)
            disables saving.
        para_w: Forwarded to ``TASC`` as ``para_w``. ``None`` (the default)
            means ``[1, 1, 1]``; a fresh list is built per call so the
            default is never shared between calls.

    Returns:
        dict mapping each topic key to the value returned by
        ``classificationTest`` for that topic.
    """
    if para_w is None:
        para_w = [1, 1, 1]

    tasc = TASC(para_w=para_w)
    resDict = {}
    topicData = readTopicData()
    for k, v in topicData.iteritems():
        selected_num = len(v) * 4
        shortlist_num = selected_num * 4
        selected_instances = tasc.get_instance_TASC(k, v, selected_num,
                                                    shortlist_num)
        test_set, test_label = v['text'], v['label']
        train_set = selected_instances['text']
        train_label = selected_instances['label']
        resDict[k] = classificationTest(train_set, train_label,
                                        test_set, test_label)

    if saveAddr != '':
        # Save the whole per-topic mapping, not just one topic's result.
        saveResult(resDict, saveAddr)
    return resDict
def mixdomain_2_topic(saveAddr=''):
    """Evaluate every topic with TASC-selected plus non-topic training data.

    For each topic returned by ``readTopicData()``, the instances selected by
    ``TASC.get_instance_TASC`` are concatenated row-wise with the data from
    ``readNonTopicText()``; the combined ``'text'``/``'label'`` columns train
    the classifier that ``classificationTest`` evaluates on the topic.

    Args:
        saveAddr: Where to save the results; an empty string (the default)
            disables saving. This name was previously read inside the
            function without being defined there.

    Returns:
        dict mapping each topic key to the value returned by
        ``classificationTest`` for that topic.
    """
    nonTopicData = readNonTopicText()
    tasc = TASC()
    resDict = {}
    topicData = readTopicData()
    for k, v in topicData.iteritems():
        selected_num = len(v) * 4
        shortlist_num = selected_num * 4
        selected_instances = tasc.get_instance_TASC(k, v, selected_num,
                                                    shortlist_num)
        # Mix the selected instances with the non-topic data.
        selected_instances = pd.concat([selected_instances, nonTopicData],
                                       axis=0)
        test_set, test_label = v['text'], v['label']
        train_set = selected_instances['text']
        train_label = selected_instances['label']
        resDict[k] = classificationTest(train_set, train_label,
                                        test_set, test_label)

    if saveAddr != '':
        saveResult(resDict, saveAddr)
    return resDict
for k, v in topicData.iteritems(): print 'test in topic "{}"'.format(k) test_set, test_label = v['text'], v['label'] train_set, train_label = nonTopicData['text'], nonTopicData['label'] res = classificationTest(train_set, train_label, test_set, test_label, classifier=MultinomialNB()) resDict_non2topic[k] = res # ------------ transfer # 这里放在console执行时,要显示设置路径。console设置的默认当前路径是项目所在路径而不是py文件路径 tasc = TASC( instance_addr='../data/out_domain/10000_review_no3.txt.gz', vecModel_addr='../data/word_vector_data/word2vec_glove.twitter.27B.100d.txt' ) # tasc.get_instance_TASC('apple',topicData['pure_dealed2016all'],10000,15000) from sklearn.svm import SVC from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression from sklearn.naive_bayes import MultinomialNB resDict_mix = {} size = 1500 for k, v in topicData.iteritems(): print 'test in topic "{}"'.format(k) selected_num = len(v) * 5 if len(v) > size else size * 5 shortlist_num = selected_num * 2 selected_instances = tasc.get_instance_TASC(k, v, selected_num,
def main_expt(
        topic_addr='./data/topic/final',
        non_addr='./data/non_topic/nontopicTrain.txt',
        size=1500,
        instance_addr='./data/out_domain/10000_review_no3.txt.gz',
        vecModel_addr='./data/word_vector_data/word2vec_glove.twitter.27B.100d.txt'
):
    """Run the three training-data experiments per topic and save a summary.

    Every topic from ``readTopicData(topic_addr)`` is used as a test set for
    a ``MultinomialNB`` classifier trained on:

    * ``resDict_non2topic`` - the non-topic data only;
    * ``resDict_mix`` - the non-topic data plus TASC-selected instances;
    * ``resDict_transfer`` - the TASC-selected instances only.

    The raw results, a per-topic table of each result's first entry and the
    per-experiment averages of the first seven entries are printed and then
    saved to ``<topic_addr>/result/<timestamp>_allres.txt``.

    Args:
        topic_addr: Forwarded to ``readTopicData``; also the parent of the
            ``result`` directory, which is created if missing.
        non_addr: Forwarded to ``readNonTopicText`` as ``addr``.
        size: Floor on the topic size used when computing how many instances
            to request from TASC (``5 * max(len(topic), size)``).
        instance_addr: Forwarded to ``TASC`` as ``instance_addr``.
        vecModel_addr: Forwarded to ``TASC`` as ``vecModel_addr``.
    """
    topicData = readTopicData(topic_addr)

    # ------------ non to topic
    nonTopicData = readNonTopicText(addr=non_addr)
    print('nontopic test')
    resDict_non2topic = {}
    for k, v in topicData.iteritems():
        print('test in topic "{}"'.format(k))
        test_set, test_label = v['text'], v['label']
        train_set, train_label = nonTopicData['text'], nonTopicData['label']
        resDict_non2topic[k] = classificationTest(
            train_set, train_label, test_set, test_label,
            classifier=MultinomialNB())

    # When running this from a console, set the paths explicitly: the
    # console's default working directory is the project directory, not the
    # directory containing this .py file.
    tasc = TASC(instance_addr=instance_addr, vecModel_addr=vecModel_addr)

    print('mix test')
    resDict_mix = {}
    for k, v in topicData.iteritems():
        print('test in topic "{}"'.format(k))
        selected_num = len(v) * 5 if len(v) > size else size * 5
        shortlist_num = selected_num * 2
        selected_instances = tasc.get_instance_TASC(k, v, selected_num,
                                                    shortlist_num)
        print('%s %s' % ('len(selected_instances): ', len(selected_instances)))
        test_set, test_label = v['text'], v['label']
        train_set = pd.concat([nonTopicData['text'],
                               selected_instances['text']])
        train_label = pd.concat([nonTopicData['label'],
                                 selected_instances['label']])
        res = classificationTest(train_set, train_label,
                                 test_set, test_label,
                                 lowFreqK=10, classifier=MultinomialNB())
        # Must be stored: the summary tables below read resDict_mix.
        resDict_mix[k] = res

    print('transfer test')
    resDict_transfer = {}
    for k, v in topicData.iteritems():
        print('test in topic "{}"'.format(k))
        selected_num = len(v) * 5 if len(v) > size else size * 5
        shortlist_num = selected_num * 2
        selected_instances = tasc.get_instance_TASC(k, v, selected_num,
                                                    shortlist_num)
        print('%s %s' % ('len(selected_instances): ', len(selected_instances)))
        test_set, test_label = v['text'], v['label']
        train_set = selected_instances['text']
        train_label = selected_instances['label']
        resDict_transfer[k] = classificationTest(
            train_set, train_label, test_set, test_label,
            lowFreqK=10, classifier=MultinomialNB())

    print('resDict_non2topic')
    print(resDict_non2topic)
    print('resDict_transfer')
    print(resDict_transfer)
    print('resDict_mix')
    print(resDict_mix)

    datestr = datetime.datetime.now().strftime('%y_%m_%d_%H_%M_%S')

    # Look every column up by topic key so that row i refers to the same
    # topic in all columns, independent of each dict's iteration order.
    topics = list(resDict_non2topic)
    topicAccuRes = {
        '*topic': topics,
        'resDict_non2topic': [resDict_non2topic[t][0] for t in topics],
        'resDict_mix': [resDict_mix[t][0] for t in topics],
        'resDict_transfer': [resDict_transfer[t][0] for t in topics]
    }

    def getAVGRes(x):
        # Average, over topics, of each of the first seven entries of a
        # result (assumes every result has at least seven entries).
        return [
            sum([v[i] for v in x.values()]) / len(topicData)
            for i in range(0, 7)
        ]

    avgRes = {
        'resDict_non2topic': getAVGRes(resDict_non2topic),
        'resDict_mix': getAVGRes(resDict_mix),
        'resDict_transfer': getAVGRes(resDict_transfer)
    }
    allres = {
        'resDict_non2topic': resDict_non2topic,
        'resDict_mix': resDict_mix,
        'resDict_transfer': resDict_transfer,
        'topicAccuRes': topicAccuRes,
        'avgRes': avgRes
    }

    # Header row followed by one row per topic; keys() and values() of the
    # same unmodified dict iterate in matching order.
    print(list(topicAccuRes.keys()))
    for row in zip(*topicAccuRes.values()):
        print(list(row))
    print(avgRes)

    result_dir = topic_addr + '/result'
    if not os.path.isdir(result_dir):
        os.mkdir(result_dir)
    saveResult(allres,
               save_addr=result_dir + '/' + datestr + '_allres.txt')
from final.feature_handle import Source from sentiment_classify_method import ngram_sa_method from experiment.expl_util import readTopicData, readNonTopicText, csv_to_train_test, classificationTest, saveResult topicData = readTopicData() # ------------ non to topic nonTopicData = readNonTopicText() resDict_non2topic = {} for k, v in topicData.iteritems(): print 'test in topic "{}"'.format(k) test_set, test_label = v['text'], v['label'] train_set, train_label = nonTopicData['text'], nonTopicData['label'] res = classificationTest(train_set, train_label, test_set, test_label) resDict_non2topic[k] = res # ------------ transfer tasc = TASC( ) # tasc.get_instance_TASC('apple',topicData['pure_dealed2016all'],10000,15000) from sklearn.svm import SVC from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression from sklearn.naive_bayes import MultinomialNB resDict_tasc1 = {} for k, v in topicData.iteritems(): print 'test in topic "{}"'.format(k) selected_num = len(v) * 4 if len(v) > 1000 else 4000 shortlist_num = selected_num * 4 selected_instances = tasc.get_instance_TASC(k, v, selected_num, shortlist_num) print 'len(selected_instances): ', len(selected_instances) test_set, test_label = v['text'], v['label'] train_set, train_label = selected_instances['text'], selected_instances[
def run(instance_addr='./data/out_domain/all_review_no3.txt.gz',
        vecModel_addr='./data/word_vector_data/word2vec_glove.twitter.27B.100d.txt',
        para_w=[1, 1, 1],
        save_addr='./data/result/TASC_2_topic.txt',
        topic_addr='./data/topic/final'):
    """Run the non-topic, mix and transfer experiments and save each result.

    Every topic from ``readTopicData_final()`` is used as a test set for a
    ``MultinomialNB`` classifier trained on:

    * the non-topic data only, saved to ``<topic_addr>/result/non2topic.txt``;
    * the non-topic data plus TASC-selected instances, saved to
      ``<topic_addr>/result/all/<size>_TASC_topic_mix.txt``;
    * the TASC-selected instances only, saved to
      ``<topic_addr>/result/all/<size>_TASC_topic_transfer.txt``.

    Args:
        instance_addr: Forwarded to ``TASC`` as ``instance_addr``.
        vecModel_addr: Forwarded to ``TASC`` as ``vecModel_addr``.
        para_w: Currently unused; kept so existing calls stay valid.
            NOTE(review): possibly meant to be forwarded to ``TASC`` - confirm.
        save_addr: Currently unused; kept so existing calls stay valid.
        topic_addr: Parent of the ``result`` directories that receive the
            output files. The topic data itself is read by
            ``readTopicData_final()`` without arguments.
    """
    import os  # local import: only needed for the directory check below

    result_dir = topic_addr + '/result/'
    all_dir = topic_addr + '/result/all/'
    # Create the output directories up front so that a missing directory
    # cannot make a save fail after the experiments have already run.
    if not os.path.isdir(all_dir):
        os.makedirs(all_dir)

    topicData = readTopicData_final()

    # ------------ non to topic
    nonTopicData = readNonTopicText(addr='./data/non_topic/nontopicTrain.txt')
    resDict_non2topic = {}
    for k, v in topicData.iteritems():
        print('test in topic "{}"'.format(k))
        test_set, test_label = v['text'], v['label']
        train_set, train_label = nonTopicData['text'], nonTopicData['label']
        resDict_non2topic[k] = classificationTest(
            train_set, train_label, test_set, test_label,
            lowFreqK=10, classifier=MultinomialNB())
    saveResult(resDict_non2topic, save_addr=result_dir + 'non2topic.txt')

    # ------------ transfer
    # When running this from a console, set the paths explicitly: the
    # console's default working directory is the project directory, not the
    # directory containing this .py file.
    tasc = TASC(instance_addr=instance_addr, vecModel_addr=vecModel_addr)

    size = 1200

    # Mix: non-topic data plus the TASC-selected instances.
    resDict_tasc_mix = {}
    for k, v in topicData.iteritems():
        print('test in topic "{}"'.format(k))
        selected_num = len(v) * 5 if len(v) > size else size * 5
        shortlist_num = selected_num * 2
        selected_instances = tasc.get_instance_TASC(k, v, selected_num,
                                                    shortlist_num)
        print('%s %s' % ('len(selected_instances): ', len(selected_instances)))
        test_set, test_label = v['text'], v['label']
        train_set = pd.concat([nonTopicData['text'],
                               selected_instances['text']])
        train_label = pd.concat([nonTopicData['label'],
                                 selected_instances['label']])
        res = classificationTest(train_set, train_label,
                                 test_set, test_label,
                                 lowFreqK=10, classifier=MultinomialNB())
        # Must be stored: this dict is saved and printed below.
        resDict_tasc_mix[k] = res
    # The mix results go to the "mix" file (the two file names were swapped).
    saveResult(resDict_tasc_mix,
               save_addr=all_dir + str(size) + '_TASC_topic_mix.txt')

    # Transfer: the TASC-selected instances only.
    resDict_tasc_trans = {}
    for k, v in topicData.iteritems():
        print('test in topic "{}"'.format(k))
        selected_num = len(v) * 5 if len(v) > size else size * 5
        shortlist_num = selected_num * 2
        selected_instances = tasc.get_instance_TASC(k, v, selected_num,
                                                    shortlist_num)
        print('%s %s' % ('len(selected_instances): ', len(selected_instances)))
        test_set, test_label = v['text'], v['label']
        train_set = selected_instances['text']
        train_label = selected_instances['label']
        resDict_tasc_trans[k] = classificationTest(
            train_set, train_label, test_set, test_label,
            lowFreqK=10, classifier=MultinomialNB())
    saveResult(resDict_tasc_trans,
               save_addr=all_dir + str(size) + '_TASC_topic_transfer.txt')

    print(resDict_non2topic)
    print(resDict_tasc_trans)
    print(resDict_tasc_mix)