import pickle

import pandas

# preProcessing and entity are this project's own modules.
import preProcessing
import entity

# NOTE: AspectContext and trainingForWords are used below but their defining
# module is not shown in this section; import them from wherever the project
# defines them.


def createFeatureFileForPol(inputpath, deppath, outpath, context, d_type):
    print('Loading data')
    corpus = preProcessing.loadXML(inputpath)
    dep_list = preProcessing.loadDependenceInformation(deppath)
    instances = corpus.corpus
    aspectTermList = []
    texts_list = []
    aspectContextList = []
    bio_entity = entity.BIO_Entity(instances, d_type)
    bio_entity.createPOSTags()
    bio_entity.createLemm()
    print('Gathering basic information')
    for i in range(len(bio_entity.texts)):
        texts_list.append(bio_entity.texts[i])
        aspectTermList.append(bio_entity.instances[i].aspect_terms)
    for i in range(len(texts_list)):
        for term in aspectTermList[i]:
            # Build one AspectContext per aspect term: surface form, polarity
            # label, a window of `context` surrounding words, and the
            # dependency context of the sentence.
            aspectContext = AspectContext()
            aspectContext.createBasic(term.term, term.pol, context)
            aspectContext.createContext(texts_list[i].split(' '))
            aspectContext.createDepContext(dep_list[i])
            if aspectContext.isvalid:
                aspectContextList.append(aspectContext)
    print('Splitting finished, saving')
    with open(outpath, 'wb') as f:
        pickle.dump(aspectContextList, f)
    print('Done.')
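
# Example invocation (a sketch; the file paths and the context window size of
# 5 are illustrative assumptions, not values taken from the original code):
#
#   createFeatureFileForPol(
#       'data/origin/ABSA_2014_origin/Restaurants_Train_v2.xml',
#       'data/dep/Restaurants_Train.dep',
#       'features/Restaurants_pol.pkl',
#       context=5,
#       d_type='re')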

def createBIOClass(instances, d_type):
    """Build a BIO_Entity with BIO tags, POS tags, lemmas and W2V clusters."""
    bio_entity = entity.BIO_Entity(instances, d_type)
    bio_entity.createBIOTags()
    bio_entity.createPOSTags()
    bio_entity.createLemm()
    bio_entity.createW2VCluster()
    return bio_entity
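
# Example (a sketch, not from the original source; the XML path is an
# assumption): build a fully tagged BIO_Entity from a loaded corpus.
#
#   corpus = preProcessing.loadXML(
#       'data/origin/ABSA_2014_origin/Restaurants_Train_v2.xml')
#   bio = createBIOClass(corpus.corpus, 're')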

def trainingEmbedding(vector_len=150, d_type='re', add_extra=False):
    if d_type == 're':
        d_name = 'Restaurants'
        extraFile = 'data/extra/yelp/Restaurants_Raw.csv'
    else:
        d_name = 'LapTops'
        extraFile = 'data/extra/amzon/LapTops_Raw.csv'
    print('------ Training Word2Vec on the %s data ------' % d_name)
    train_corpus = preProcessing.loadXML(
        'data/origin/ABSA_2014_origin/%s_Train_v2.xml' % d_name)
    test_corpus = preProcessing.loadXML(
        'data/origin/ABSA_2014_origin/%s_Test_Data_PhaseA.xml' % d_name)
    corpus = train_corpus.corpus + test_corpus.corpus
    print('Datasets merged')
    del train_corpus
    del test_corpus
    bio_entity = entity.BIO_Entity(corpus, d_type)
    texts = bio_entity.texts
    if add_extra:
        print('Adding extra corpus: %s' % extraFile)
        extra_csv = pandas.read_csv(extraFile)
        extra_texts = list(extra_csv['text'])
        texts = texts + extra_texts
        del extra_csv
        del extra_texts
        print('Extra corpus loaded')
    print('Building WordEmbedding (skip-gram)')
    trainingForWords(texts, vector_len, 'model/%s.w2v' % d_name, 1, 0)
    print('Building WordEmbedding (CBOW)')
    trainingForWords(texts, vector_len, 'model/%s.w2v_cbow' % d_name, 0, 0)

# Extended variant of trainingEmbedding that can also train lemma and
# word+POS embeddings. NOTE: if both definitions live in the same module,
# this one shadows the simpler version above.
def trainingEmbedding(vector_len=150, d_type='re', add_extra=False,
                      add_word=False, add_Lemm=False, add_POS=False):
    if d_type == 're':
        d_name = 'Restaurants'
        extraFile = '../../data/extra/yelp/Restaurants_Raw.csv'
        extraLemmFile = '../../data/extra/yelp/Restaurants_Lemm.csv'
        extraWPFile = '../../data/extra/yelp/Restaurants_WP.csv'
    else:
        d_name = 'LapTops'
        extraFile = '../../data/extra/amzon/LapTops_Raw.csv'
        # The original pointed these two at the yelp Restaurants files,
        # almost certainly a copy-paste slip; corrected to the LapTops files.
        extraLemmFile = '../../data/extra/amzon/LapTops_Lemm.csv'
        extraWPFile = '../../data/extra/amzon/LapTops_WP.csv'
    print('------ Training Word2Vec on the %s data ------' % d_name)
    train_corpus = preProcessing.loadXML(
        '../../data/ABSA_2014_origin/%s_Train_v2.xml' % d_name)
    test_corpus = preProcessing.loadXML(
        '../../data/ABSA_2014_origin/%s_Test_Data_PhaseA.xml' % d_name)
    corpus = train_corpus.corpus + test_corpus.corpus
    print('Datasets merged')
    del train_corpus
    del test_corpus
    bio_entity = entity.BIO_Entity(corpus, d_type)
    bio_entity.createPOSTags()
    bio_entity.createLemm()
    bio_entity.createWordPOS()
    texts = bio_entity.texts
    if add_word:
        if add_extra:
            print('Adding extra corpus: %s' % extraFile)
            extra_csv = pandas.read_csv(extraFile, encoding='gbk')
            extra_texts = list(extra_csv['text'])
            texts = texts + extra_texts
            del extra_csv
            del extra_texts
            print('Extra corpus loaded')
        print('Building WordEmbedding (skip-gram)')
        trainingForWords(texts, vector_len,
                         'embeddingModels/%s.w2v' % d_name, 1, 0)
        print('Building WordEmbedding (CBOW)')
        trainingForWords(texts, vector_len,
                         'embeddingModels/%s.w2v_cbow' % d_name, 0, 0)
    if add_Lemm:
        vectors = [' '.join(lemm) for lemm in bio_entity.lemm_tags]
        if add_extra:
            print('Adding extra lemma corpus: %s' % extraLemmFile)
            extra_csv = pandas.read_csv(extraLemmFile, encoding='gbk')
            extra_vectors = list(extra_csv['lemm'])
            vectors = vectors + extra_vectors
            del extra_csv
            del extra_vectors
            print('Extra corpus loaded')
        print('Building LemmEmbedding (skip-gram)')
        trainingForWords(vectors, vector_len,
                         'embeddingModels/%s.l2v' % d_name, 1, 0)
        print('Building LemmEmbedding (CBOW)')
        trainingForWords(vectors, vector_len,
                         'embeddingModels/%s.l2v_cbow' % d_name, 0, 0)
    if add_POS:
        vectors = [' '.join(wordpos) for wordpos in bio_entity.word_pos]
        if add_extra:
            print('Adding extra word+POS corpus: %s' % extraWPFile)
            extra_csv = pandas.read_csv(extraWPFile, encoding='gbk')
            extra_vectors = list(extra_csv['wordpos'])
            vectors = vectors + extra_vectors
            del extra_csv
            del extra_vectors
            print('Extra corpus loaded')
        print('Building WPEmbedding (skip-gram)')
        trainingForWords(vectors, vector_len,
                         'embeddingModels/%s.p2v' % d_name, 1, 0)
        print('Building WPEmbedding (CBOW)')
        trainingForWords(vectors, vector_len,
                         'embeddingModels/%s.p2v_cbow' % d_name, 0, 0)
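
# trainingForWords is called throughout this file but not defined in this
# section. Below is a minimal sketch of what it plausibly does, built on
# gensim's Word2Vec. The reading of the last two positional arguments (taken
# here as the skip-gram flag `sg` and the hierarchical-softmax flag `hs`) and
# the body itself are assumptions, not the project's confirmed implementation;
# skip this sketch if the real helper is importable.
def trainingForWords(texts, vector_len, model_path, sg, hs):
    """Train Word2Vec on whitespace-tokenized texts and save the model."""
    from gensim.models import Word2Vec
    sentences = [t.split(' ') for t in texts]
    model = Word2Vec(sentences,
                     vector_size=vector_len,  # named `size` in gensim < 4.0
                     sg=sg,                   # 1 = skip-gram, 0 = CBOW
                     hs=hs,                   # 1 = hierarchical softmax
                     workers=4)
    model.save(model_path)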