def evaluate(detail=False, d_type='re'):
    # Select the dataset by domain: 're' -> Restaurants, anything else -> LapTops.
    if d_type == 're':
        d_name = 'Restaurants'
    else:
        d_name = 'LapTops'

    print('Loading and preprocessing the training set')
    train_corpus = preProcessing.loadXML(
        '../../data/ABSA_2014_origin/%s_Train_v2.xml' % d_name)
    train_bio = preProcessing.createBIOClass(train_corpus.corpus, d_type)
    dep_path = 'dependences/%s_train.dep' % d_type
    train_bio.createDependenceFeature(dep_path)
    train_X, train_Y = train_bio.getFeaturesAndLabels()

    print('Loading and preprocessing the test set')
    test_corpus = preProcessing.loadXML(
        '../../data/ABSA_2014_origin/%s_Test_Data_phaseB.xml' % d_name)
    test_bio = preProcessing.createBIOClass(test_corpus.corpus, d_type)
    dep_path = 'dependences/%s_test.dep' % d_type
    test_bio.createDependenceFeature(dep_path)
    test_X, test_Y = test_bio.getFeaturesAndLabels()

    # Gold aspect-term offsets, used below for the SemEval-style evaluation.
    true_offsets = []
    for i in range(len(test_bio.instances)):
        offset = [a.offset for a in test_bio.instances[i].aspect_terms]
        true_offsets.append(offset)
    origin_text_test = test_bio.origin_texts

    # Train and apply the CRF tagger on the BIO-formatted features.
    train_X, test_X = crfFormat_X(train_X, test_X)
    train_CRF(train_X, train_Y)
    predict_Y, tagger = tag_CRF(test_X)
    report = report_CRF(test_Y, predict_Y)
    print('\n-------- Results report (BIO level) ---------')
    print(report)

    if detail:
        print('\n-------- Other key information (BIO level) ---------')
        info = tagger.info()
        print('Most likely state transitions:')
        print_transitions(Counter(info.transitions).most_common(10))
        print('\nLeast likely state transitions:')
        print_transitions(Counter(info.transitions).most_common()[-10:])
        print('\nStrongest feature associations:')
        print_state_features(Counter(info.state_features).most_common(10))
        print('\nWeakest feature associations:')
        print_state_features(Counter(info.state_features).most_common()[-10:])

    # Recover aspect-term strings and character offsets from the predicted tag sequences.
    all_terms = []
    for i in range(len(origin_text_test)):
        all_terms.append(getTermsFromYSeq(predict_Y[i], origin_text_test[i]))
    all_offsets = []
    for i in range(len(origin_text_test)):
        all_offsets.append(getOffestFromText(all_terms[i], origin_text_test[i]))

    print('\n-------- SemEval-style report ---------')
    semEvalValidate(all_offsets, true_offsets, b=1)
    return all_terms, all_offsets, origin_text_test, true_offsets

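# A minimal usage sketch, not part of the original code: assuming the data files and
# dependency files referenced above are in place, the extraction pipeline could be run
# for both domains like this. The value 'lt' is an assumption; any value other than
# 're' selects the LapTops branch.
def runExtractionDemo():
    for domain in ('re', 'lt'):
        terms, offsets, texts, gold = evaluate(detail=True, d_type=domain)
        print('%s: predicted aspect terms for %d sentences' % (domain, len(texts)))
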
def createFeatureFileForPol(inputpath, deppath, outpath, context, d_type):
    print('Loading data')
    corpus = preProcessing.loadXML(inputpath)
    dep_list = preProcessing.loadDependenceInformation(deppath)
    instances = corpus.corpus

    aspectTermList = []
    texts_list = []
    aspectContextList = []

    bio_entity = entity.BIO_Entity(instances, d_type)
    bio_entity.createPOSTags()
    bio_entity.createLemm()

    print('Collecting basic information')
    for i in range(len(bio_entity.texts)):
        texts_list.append(bio_entity.texts[i])
        aspectTermList.append(bio_entity.instances[i].aspect_terms)

    # Build a plain and a dependency-based context window around every aspect term.
    for i in range(len(texts_list)):
        for term in aspectTermList[i]:
            aspectContext = AspectContext()
            aspectContext.createBasic(term.term, term.pol, context)
            aspectContext.createContext(texts_list[i].split(' '))
            aspectContext.createDepContext(dep_list[i])
            if aspectContext.isvalid:
                aspectContextList.append(aspectContext)

    print('Splitting finished, saving')
    with open(outpath, 'wb') as f:
        pickle.dump(aspectContextList, f)
    print('Done.')

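# A minimal usage sketch, not part of the original code: the input and dependency paths
# are taken from evaluate() above, while the output path and the context window size of
# 5 tokens are assumptions for illustration only.
def createRestaurantPolFeaturesDemo():
    createFeatureFileForPol(
        inputpath='../../data/ABSA_2014_origin/Restaurants_Train_v2.xml',
        deppath='dependences/re_train.dep',
        outpath='features/re_train_pol.pkl',  # assumed output location
        context=5,                            # assumed window size
        d_type='re')
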
def baselineExamForResturant():
    corpus = preProcessing.loadXML(
        '../../data/ABSA_2014_origin/Restaurants_Train_v2.xml')

    print('Splitting the dataset')
    train, seen = corpus.split()
    corpus.write_out('tmp/train.xml', train, short=False)
    traincorpus = entity.Corpus(
        ET.parse('tmp/train.xml').getroot().findall('sentence'))
    corpus.write_out('tmp/test.gold.xml', seen, short=False)
    seen = entity.Corpus(
        ET.parse('tmp/test.gold.xml').getroot().findall('sentence'))
    # Re-write the held-out sentences with the default settings to produce the
    # 'unseen' input that the baseline tagger is run on.
    corpus.write_out('tmp/test.xml', seen.corpus)
    unseen = entity.Corpus(
        ET.parse('tmp/test.xml').getroot().findall('sentence'))

    print('Collecting statistics and extracting aspect terms')
    b1 = BaselineAspectExtractor(traincorpus)
    predicted = b1.tag(unseen.corpus)
    corpus.write_out('tmp/test.predicted-aspect.xml', predicted, short=False)
    print(
        'P = %f -- R = %f -- F1 = %f (#correct: %d, #retrieved: %d, #relevant: %d)'
        % Evaluate(seen.corpus, predicted).aspect_extraction())

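# Usage sketch, not part of the original code: the baseline run is self-contained apart
# from needing a 'tmp/' directory for its intermediate XML files, which is assumed here
# and created if missing.
def runBaselineDemo():
    import os
    if not os.path.exists('tmp'):
        os.makedirs('tmp')  # the baseline writes its intermediate XML files here
    baselineExamForResturant()
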
def trainingEmbedding(vector_len=150, d_type='re', add_extra=False):
    # Select the domain-specific corpus and the optional extra raw-review corpus.
    if d_type == 're':
        d_name = 'Restaurants'
        extraFile = 'data/extra/yelp/Restaurants_Raw.csv'
    else:
        d_name = 'LapTops'
        extraFile = 'data/extra/amzon/LapTops_Raw.csv'

    print('------ Training Word2Vec on the %s data ------' % d_name)
    train_corpus = preProcessing.loadXML(
        'data/origin/ABSA_2014_origin/%s_Train_v2.xml' % d_name)
    test_corpus = preProcessing.loadXML(
        'data/origin/ABSA_2014_origin/%s_Test_Data_PhaseA.xml' % d_name)
    print('Datasets merged')

    # Train on the concatenation of the training and test sentences.
    corpus = train_corpus.corpus + test_corpus.corpus
    del train_corpus
    del test_corpus

    bio_entity = entity.BIO_Entity(corpus, d_type)
    texts = bio_entity.texts

    if add_extra:
        print('Adding extra corpus: %s' % extraFile)
        extra_csv = pandas.read_csv(extraFile)
        extra_texts = list(extra_csv['text'])
        texts = texts + extra_texts
        del extra_csv
        del extra_texts
        print('Extra corpus loaded')

    # The fourth argument of trainingForWords appears to toggle skip-gram (1) vs. CBOW (0).
    print('Creating WordEmbedding')
    trainingForWords(texts, vector_len, 'model/%s.w2v' % d_name, 1, 0)
    print('Creating WordEmbedding_CBOW')
    trainingForWords(texts, vector_len, 'model/%s.w2v_cbow' % d_name, 0, 0)

def trainingEmbedding(vector_len=150, d_type='re', add_extra=False,
                      add_word=False, add_Lemm=False, add_POS=False):
    # Select the domain-specific corpora and the optional extra corpora.
    if d_type == 're':
        d_name = 'Restaurants'
        extraFile = '../../data/extra/yelp/Restaurants_Raw.csv'
        extraLemmFile = '../../data/extra/yelp/Restaurants_Lemm.csv'
        extraWPFile = '../../data/extra/yelp/Restaurants_WP.csv'
    else:
        d_name = 'LapTops'
        extraFile = '../../data/extra/amzon/LapTops_Raw.csv'
        # NOTE: the original code pointed these two at the Restaurants files, which
        # looks like a copy-paste slip; the LapTops counterparts are assumed here.
        extraLemmFile = '../../data/extra/amzon/LapTops_Lemm.csv'
        extraWPFile = '../../data/extra/amzon/LapTops_WP.csv'

    print('------ Training Word2Vec on the %s data ------' % d_name)
    train_corpus = preProcessing.loadXML(
        '../../data/ABSA_2014_origin/%s_Train_v2.xml' % d_name)
    test_corpus = preProcessing.loadXML(
        '../../data/ABSA_2014_origin/%s_Test_Data_PhaseA.xml' % d_name)
    print('Datasets merged')

    # Train on the concatenation of the training and test sentences.
    corpus = train_corpus.corpus + test_corpus.corpus
    del train_corpus
    del test_corpus

    bio_entity = entity.BIO_Entity(corpus, d_type)
    bio_entity.createPOSTags()
    bio_entity.createLemm()
    bio_entity.createWordPOS()
    texts = bio_entity.texts

    if add_word:
        if add_extra:
            print('Adding extra corpus: %s' % extraFile)
            extra_csv = pandas.read_csv(extraFile, encoding='gbk')
            extra_texts = list(extra_csv['text'])
            texts = texts + extra_texts
            del extra_csv
            del extra_texts
            print('Extra corpus loaded')
        print('Creating WordEmbedding')
        trainingForWords(texts, vector_len,
                         'embeddingModels/%s.w2v' % d_name, 1, 0)
        print('Creating WordEmbedding_CBOW')
        trainingForWords(texts, vector_len,
                         'embeddingModels/%s.w2v_cbow' % d_name, 0, 0)

    if add_Lemm:
        # Lemma sequences are re-joined into space-separated strings before training.
        vectors = [' '.join(lemm) for lemm in bio_entity.lemm_tags]
        if add_extra:
            print('Adding extra lemma corpus: %s' % extraLemmFile)
            extra_csv = pandas.read_csv(extraLemmFile, encoding='gbk')
            extra_vectors = list(extra_csv['lemm'])
            vectors = vectors + extra_vectors
            del extra_csv
            del extra_vectors
            print('Extra corpus loaded')
        print('Creating LemmEmbedding')
        trainingForWords(vectors, vector_len,
                         'embeddingModels/%s.l2v' % d_name, 1, 0)
        print('Creating LemmEmbedding_CBOW')
        trainingForWords(vectors, vector_len,
                         'embeddingModels/%s.l2v_cbow' % d_name, 0, 0)

    if add_POS:
        # Word+POS tokens are re-joined into space-separated strings before training.
        vectors = [' '.join(wordpos) for wordpos in bio_entity.word_pos]
        if add_extra:
            print('Adding extra word-POS corpus: %s' % extraWPFile)
            extra_csv = pandas.read_csv(extraWPFile, encoding='gbk')
            extra_vectors = list(extra_csv['wordpos'])
            vectors = vectors + extra_vectors
            del extra_csv
            del extra_vectors
            print('Extra corpus loaded')
        print('Creating WPEmbedding')
        trainingForWords(vectors, vector_len,
                         'embeddingModels/%s.p2v' % d_name, 1, 0)
        print('Creating WPEmbedding_CBOW')
        trainingForWords(vectors, vector_len,
                         'embeddingModels/%s.p2v_cbow' % d_name, 0, 0)

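# Usage sketch, not part of the original code: train all three embedding variants
# (word, lemma, word+POS) for the Restaurants domain, folding in the extra Yelp reviews,
# with 150-dimensional vectors. The flag combination is an assumed example.
def trainRestaurantEmbeddingsDemo():
    trainingEmbedding(vector_len=150, d_type='re', add_extra=True,
                      add_word=True, add_Lemm=True, add_POS=True)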