Beispiel #1
0
def createFeatureFileForPol(inputpath,deppath,outpath,context,d_type):
    """Build one AspectContext per aspect term and pickle the valid ones.

    The corpus is loaded from ``inputpath`` and the dependency information
    from ``deppath``.  For every aspect term of every sentence an
    ``AspectContext`` is created; those whose ``isvalid`` attribute compares
    equal to ``True`` are collected and the resulting list is pickled to
    ``outpath``.

    Args:
        inputpath: Passed to ``preProcessing.loadXML``.
        deppath: Passed to ``preProcessing.loadDependenceInformation``; the
            result is indexed by sentence position.
        outpath: File that receives the pickled list (opened in ``'wb'`` mode).
        context: Forwarded unchanged to ``AspectContext.createBasic``.
        d_type: Forwarded unchanged to ``entity.BIO_Entity``.
    """
    print('加载数据')
    loaded=preProcessing.loadXML(inputpath)
    dependencies=preProcessing.loadDependenceInformation(deppath)

    tagger=entity.BIO_Entity(loaded.corpus,d_type)
    tagger.createPOSTags()
    tagger.createLemm()

    print('获取基础信息')
    sentences=[]
    term_groups=[]
    for idx,sentence in enumerate(tagger.texts):
        sentences.append(sentence)
        term_groups.append(tagger.instances[idx].aspect_terms)

    kept=[]
    for idx,(sentence,terms) in enumerate(zip(sentences,term_groups)):
        for term in terms:
            candidate=AspectContext()
            candidate.createBasic(term.term,term.pol,context)
            # Split once per term so each context receives its own token list.
            candidate.createContext(sentence.split(' '))
            # Looked up per term: sentences without terms never touch this.
            candidate.createDepContext(dependencies[idx])
            if candidate.isvalid==True:
                kept.append(candidate)

    print('切分完成,开始保存')
    with open(outpath,'wb') as sink:
        pickle.dump(kept,sink)
    print('完成。')
Beispiel #2
0
def createBIOClass(instances,d_type):
    """Create a ``BIO_Entity`` and run its feature-building methods.

    Calls ``createBIOTags``, ``createPOSTags``, ``createLemm`` and
    ``createW2VCluster`` on the new object, in that order.  Whether later
    steps depend on earlier ones is not visible here, so the order is kept.

    Args:
        instances: Forwarded unchanged to ``entity.BIO_Entity``.
        d_type: Forwarded unchanged to ``entity.BIO_Entity``.

    Returns:
        The ``BIO_Entity`` after all four methods have been called.
    """
    tagged=entity.BIO_Entity(instances,d_type)

    tagged.createBIOTags()
    tagged.createPOSTags()
    tagged.createLemm()
    tagged.createW2VCluster()

    return tagged
Beispiel #3
0
def trainingEmbedding(vector_len=150, d_type='re', add_extra=False):
    """Train two word-embedding models on the ABSA 2014 train + test texts.

    The train and test XML files of the selected domain are loaded, their
    corpora concatenated, and the texts exposed by ``entity.BIO_Entity`` are
    handed to ``trainingForWords`` twice: once for ``model/<domain>.w2v`` and
    once for ``model/<domain>.w2v_cbow``.

    Args:
        vector_len: Forwarded to ``trainingForWords`` as its second argument
            (presumably the embedding dimensionality -- confirm there).
        d_type: ``'re'`` selects the Restaurants data; any other value
            selects the LapTops data.
        add_extra: When truthy, the ``text`` column of the domain's extra
            CSV file is appended to the training texts.

    Returns:
        None.  The models are written by ``trainingForWords``.
    """
    if d_type == 're':
        d_name = 'Restaurants'
        extraFile = 'data/extra/yelp/Restaurants_Raw.csv'
    else:
        d_name = 'LapTops'
        extraFile = 'data/extra/amzon/LapTops_Raw.csv'

    print('------训练%s数据的Word2Vec------' % d_name)
    train_corpus = preProcessing.loadXML(
        'data/origin/ABSA_2014_origin/%s_Train_v2.xml' % d_name)
    test_corpus = preProcessing.loadXML(
        'data/origin//ABSA_2014_origin/%s_Test_Data_PhaseA.xml' % d_name)
    print('数据集合并完成')

    corpus = train_corpus.corpus + test_corpus.corpus

    # Drop the loader objects early; only the merged corpus is needed.
    del train_corpus
    del test_corpus

    bio_entity = entity.BIO_Entity(corpus, d_type)
    texts = bio_entity.texts

    if add_extra:
        print('添加额外语料:%s' % extraFile)
        extra_csv = pandas.read_csv(extraFile)
        # Build a new list instead of extending bio_entity.texts in place.
        texts = texts + list(extra_csv['text'])
        del extra_csv
        print('额外语料加载完成')

    # The fourth argument differs between the two runs (1, then 0 for the
    # "_cbow" model) -- presumably a skip-gram/CBOW switch; confirm in
    # trainingForWords.  The trailing 0 is passed through unchanged.
    print('创建WordEmbedding')
    trainingForWords(texts, vector_len, 'model/%s.w2v' % d_name, 1, 0)
    print('创建WordEmbedding_CBOW')
    trainingForWords(texts, vector_len, 'model/%s.w2v_cbow' % d_name, 0, 0)
Beispiel #4
0
def _append_extra_column(base, csv_path, column, label):
    """Return ``base`` plus the values of ``column`` read from ``csv_path``.

    The CSV is read with ``encoding='gbk'``.  ``label`` is only used in the
    progress message (``''``, ``'Lemm'`` or ``'WP'``).  ``base`` itself is
    not modified; a new list is returned.
    """
    print('添加额外%s语料:%s' % (label, csv_path))
    extra_csv = pandas.read_csv(csv_path, encoding='gbk')
    combined = base + list(extra_csv[column])
    print('额外语料加载完成')
    return combined


def _train_model_pair(sentences, vector_len, model_path, label):
    """Call ``trainingForWords`` twice: for ``model_path`` and its ``_cbow`` twin.

    The fourth argument is 1 for the first run and 0 for the ``_cbow`` run
    (presumably a skip-gram/CBOW switch -- confirm in ``trainingForWords``);
    the trailing 0 is passed through unchanged.
    """
    print('创建%sEmbedding' % label)
    trainingForWords(sentences, vector_len, model_path, 1, 0)
    print('创建%sEmbedding_CBOW' % label)
    trainingForWords(sentences, vector_len, model_path + '_cbow', 0, 0)


def trainingEmbedding(vector_len=150,
                      d_type='re',
                      add_extra=False,
                      add_word=False,
                      add_Lemm=False,
                      add_POS=False):
    """Train word, lemma and/or word-POS embeddings for one ABSA 2014 domain.

    The train and test XML files of the selected domain are loaded and
    merged, a ``BIO_Entity`` is built on the merged corpus, and for each
    enabled flag a pair of models is trained under ``embeddingModels/``:

    * ``add_word``: ``<domain>.w2v`` / ``.w2v_cbow`` from ``bio_entity.texts``
    * ``add_Lemm``: ``<domain>.l2v`` / ``.l2v_cbow`` from the space-joined
      ``bio_entity.lemm_tags``
    * ``add_POS``:  ``<domain>.p2v`` / ``.p2v_cbow`` from the space-joined
      ``bio_entity.word_pos``

    Args:
        vector_len: Forwarded to ``trainingForWords`` as its second argument
            (presumably the embedding dimensionality -- confirm there).
        d_type: ``'re'`` selects the Restaurants data; any other value
            selects the LapTops data.
        add_extra: When truthy, the matching column of the extra CSV file
            (``text``, ``lemm`` or ``wordpos``) is appended to each enabled
            training set.
        add_word: When truthy, train the word models.
        add_Lemm: When truthy, train the lemma models.
        add_POS: When truthy, train the word-POS models.

    Returns:
        None.  The models are written by ``trainingForWords``.
    """
    if d_type == 're':
        d_name = 'Restaurants'
        extraFile = '../../data/extra/yelp/Restaurants_Raw.csv'
        extraLemmFile = '../../data/extra/yelp/Restaurants_Lemm.csv'
        extraWPFile = '../../data/extra/yelp/Restaurants_WP.csv'
    else:
        d_name = 'LapTops'
        extraFile = '../../data/extra/amzon/LapTops_Raw.csv'
        # NOTE(review): the two paths below point at the *Restaurants* files
        # even though this is the LapTops branch.  This looks like a
        # copy-paste slip, but the laptop equivalents are not visible from
        # here, so the paths are left unchanged -- confirm and fix if needed.
        extraLemmFile = '../../data/extra/yelp/Restaurants_Lemm.csv'
        extraWPFile = '../../data/extra/yelp/Restaurants_WP.csv'

    print('------训练%s数据的Word2Vec------' % d_name)
    train_corpus = preProcessing.loadXML(
        '../../data/ABSA_2014_origin/%s_Train_v2.xml' % d_name)
    test_corpus = preProcessing.loadXML(
        '../../data/ABSA_2014_origin/%s_Test_Data_PhaseA.xml' % d_name)
    print('数据集合并完成')
    corpus = train_corpus.corpus + test_corpus.corpus

    # Drop the loader objects early; only the merged corpus is needed.
    del train_corpus
    del test_corpus

    # All three feature passes run regardless of the flags, as before.
    bio_entity = entity.BIO_Entity(corpus, d_type)
    bio_entity.createPOSTags()
    bio_entity.createLemm()
    bio_entity.createWordPOS()

    if add_word:
        texts = bio_entity.texts
        if add_extra:
            texts = _append_extra_column(texts, extraFile, 'text', '')
        _train_model_pair(texts, vector_len,
                          'embeddingModels/%s.w2v' % d_name, 'Word')

    if add_Lemm:
        vectors = [' '.join(lemm) for lemm in bio_entity.lemm_tags]
        if add_extra:
            vectors = _append_extra_column(vectors, extraLemmFile, 'lemm',
                                           'Lemm')
        _train_model_pair(vectors, vector_len,
                          'embeddingModels/%s.l2v' % d_name, 'Lemm')

    if add_POS:
        vectors = [' '.join(wordpos) for wordpos in bio_entity.word_pos]
        if add_extra:
            vectors = _append_extra_column(vectors, extraWPFile, 'wordpos',
                                           'WP')
        _train_model_pair(vectors, vector_len,
                          'embeddingModels/%s.p2v' % d_name, 'WP')