Esempio n. 1
0
def TASC_2_topic(
        instance_addr='../data/out_domain/10000_review_no3.txt.gz',
        vecModel_addr='../data/word_vector_data/word2vec_glove.twitter.27B.100d.txt',
        para_w=[1, 1, 1],
        saveAddr='../data/result/TASC_2_topic.txt',
        topic_addr='../data/topic'):
    topicData = readTopicData(topic_addr)
    tasc = TASC(instance_addr=instance_addr,
                vecModel_addr=vecModel_addr,
                para_w=para_w)
    resDict = {}
    for k, v in topicData.iteritems():
        print 'test for topic ' + k
        selected_num = len(v) * 4 if len(v) > 100 else 400
        shortlist_num = selected_num * 4
        selected_instances = tasc.get_instance_TASC(k, v, selected_num,
                                                    shortlist_num)
        test_set, test_label = v['text'], v['label']
        train_set, train_label = selected_instances[
            'text'], selected_instances['label']
        res = classificationTest(train_set,
                                 train_label,
                                 test_set,
                                 test_label,
                                 lowFreqK=2,
                                 classifier=MultinomialNB())
        resDict[k] = res
    if saveAddr != '':
        print resDict
        saveResult(resDict, saveAddr)
    return resDict
def TASC_2_topic(saveAddr='', para_w=[1, 1, 1]):
    """Evaluate TASC instance selection on every topic data set.

    For each topic returned by ``readTopicData()`` a training set is picked
    with ``TASC.get_instance_TASC`` and evaluated against the topic's own
    ``'text'``/``'label'`` columns through ``classificationTest``.

    Args:
        saveAddr: destination handed to ``saveResult``; an empty string
            (the default) skips saving.
        para_w: forwarded to ``TASC`` as ``para_w``.

    Returns:
        dict mapping each topic key to what ``classificationTest`` returned.
    """
    tasc = TASC(para_w=para_w)
    resDict = {}
    topicData = readTopicData()
    for k, v in topicData.iteritems():
        # Select four times the topic size, from a shortlist four times
        # larger again.
        selected_num = len(v) * 4
        shortlist_num = selected_num * 4
        selected_instances = tasc.get_instance_TASC(k, v, selected_num,
                                                    shortlist_num)
        test_set, test_label = v['text'], v['label']
        train_set, train_label = selected_instances[
            'text'], selected_instances['label']
        res = classificationTest(train_set, train_label, test_set, test_label)
        resDict[k] = res
    if saveAddr != '':
        # Save the results of every topic. The loop variable ``res`` only
        # holds the last topic's result and is unbound when there are no
        # topics at all.
        saveResult(resDict, saveAddr)
    return resDict
Esempio n. 3
0
def mixdomain_2_topic(saveAddr=''):
    """Evaluate training on TASC-selected plus non-topic data, per topic.

    For each topic returned by ``readTopicData()`` the instances picked by
    ``TASC.get_instance_TASC`` are concatenated row-wise with the data from
    ``readNonTopicText()`` and used as the training set; the topic's own
    ``'text'``/``'label'`` columns are the test set.

    Args:
        saveAddr: destination handed to ``saveResult``; an empty string
            (the default) skips saving. The function body previously
            referenced ``saveAddr`` without defining it, so it is now an
            explicit keyword argument, matching ``TASC_2_topic``.

    Returns:
        dict mapping each topic key to what ``classificationTest`` returned.
    """
    nonTopicData = readNonTopicText()
    tasc = TASC()
    resDict = {}
    topicData = readTopicData()
    for k, v in topicData.iteritems():
        # Select four times the topic size, from a shortlist four times
        # larger again.
        selected_num = len(v) * 4
        shortlist_num = selected_num * 4
        selected_instances = tasc.get_instance_TASC(k, v, selected_num,
                                                    shortlist_num)
        # Mix the selected out-of-domain rows with the non-topic rows.
        selected_instances = pd.concat([selected_instances, nonTopicData],
                                       axis=0)
        test_set, test_label = v['text'], v['label']
        train_set, train_label = selected_instances[
            'text'], selected_instances['label']
        res = classificationTest(train_set, train_label, test_set, test_label)
        resDict[k] = res
    if saveAddr != '':
        saveResult(resDict, saveAddr)
    return resDict
Esempio n. 4
0
    vecModel_addr='../data/word_vector_data/word2vec_glove.twitter.27B.100d.txt'
)

# tasc.get_instance_TASC('apple',topicData['pure_dealed2016all'],10000,15000)
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# "Mix" experiment: for every topic, train on the non-topic data plus the
# TASC-selected instances and test on the topic's own data.
# NOTE(review): `topicData`, `nonTopicData` and `tasc` are created earlier in
# the script, outside this excerpt.
resDict_mix = {}
# Minimum topic size used when deciding how many instances to select.
size = 1500
for k, v in topicData.iteritems():
    print 'test in topic "{}"'.format(k)
    # Select five times the topic size, but never fewer than size * 5.
    selected_num = len(v) * 5 if len(v) > size else size * 5
    # The shortlist handed to TASC is twice the final selection size.
    shortlist_num = selected_num * 2
    selected_instances = tasc.get_instance_TASC(k, v, selected_num,
                                                shortlist_num)
    print 'len(selected_instances): ', len(selected_instances)
    test_set, test_label = v['text'], v['label']
    # Training data = non-topic rows followed by the selected rows.
    train_set, train_label = pd.concat([
        nonTopicData['text'], selected_instances['text']
    ]), pd.concat([nonTopicData['label'], selected_instances['label']])
    res = classificationTest(train_set,
                             train_label,
                             test_set,
                             test_label,
                             lowFreqK=10,
                             classifier=MultinomialNB())  #
    # Results are keyed by topic.
    resDict_mix[k] = res

resDict_transfer = {}
for k, v in topicData.iteritems():
Esempio n. 5
0
def main_expt(
    topic_addr='./data/topic/final',
    non_addr='./data/non_topic/nontopicTrain.txt',
    size=1500,
    instance_addr='./data/out_domain/10000_review_no3.txt.gz',
    vecModel_addr='./data/word_vector_data/word2vec_glove.twitter.27B.100d.txt'
):
    topicData = readTopicData(topic_addr)
    # ------------ non to topic
    nonTopicData = readNonTopicText(addr=non_addr)
    print 'nontopic test'
    resDict_non2topic = {}
    for k, v in topicData.iteritems():
        print 'test in topic "{}"'.format(k)
        test_set, test_label = v['text'], v['label']
        train_set, train_label = nonTopicData['text'], nonTopicData['label']
        res = classificationTest(train_set,
                                 train_label,
                                 test_set,
                                 test_label,
                                 classifier=MultinomialNB())
        resDict_non2topic[k] = res

    # 这里放在console执行时,要显示设置路径。console设置的默认当前路径是项目所在路径而不是py文件路径

    tasc = TASC(instance_addr=instance_addr, vecModel_addr=vecModel_addr)

    # tasc.get_instance_TASC('apple',topicData['pure_dealed2016all'],10000,15000)
    print 'mix test'
    resDict_mix = {}

    for k, v in topicData.iteritems():
        print 'test in topic "{}"'.format(k)
        selected_num = len(v) * 5 if len(v) > size else size * 5
        shortlist_num = selected_num * 2
        selected_instances = tasc.get_instance_TASC(k, v, selected_num,
                                                    shortlist_num)
        print 'len(selected_instances): ', len(selected_instances)
        test_set, test_label = v['text'], v['label']
        train_set, train_label = pd.concat([
            nonTopicData['text'], selected_instances['text']
        ]), pd.concat([nonTopicData['label'], selected_instances['label']])
        res = classificationTest(train_set,
                                 train_label,
                                 test_set,
                                 test_label,
                                 lowFreqK=10,
                                 classifier=MultinomialNB())  #
        resDict_mix[k] = res
    print 'transfer test'
    resDict_transfer = {}
    for k, v in topicData.iteritems():
        print 'test in topic "{}"'.format(k)
        selected_num = len(v) * 5 if len(v) > size else size * 5
        shortlist_num = selected_num * 2
        selected_instances = tasc.get_instance_TASC(k, v, selected_num,
                                                    shortlist_num)
        print 'len(selected_instances): ', len(selected_instances)
        test_set, test_label = v['text'], v['label']
        train_set, train_label = selected_instances[
            'text'], selected_instances['label']
        res = classificationTest(train_set,
                                 train_label,
                                 test_set,
                                 test_label,
                                 lowFreqK=10,
                                 classifier=MultinomialNB())
        resDict_transfer[k] = res

    print 'resDict_non2topic'
    print resDict_non2topic
    print 'resDict_transfer'
    print resDict_transfer
    print 'resDict_mix'
    print resDict_mix

    datestr = datetime.datetime.now().strftime('%y_%m_%d_%H_%M_%S')
    topicAccuRes = {
        '*topic': resDict_non2topic.keys(),
        'resDict_non2topic': [v[0] for v in resDict_non2topic.values()],
        'resDict_mix': [v[0] for v in resDict_mix.values()],
        'resDict_transfer': [v[0] for v in resDict_transfer.values()]
    }

    def getAVGRes(x):
        return [
            sum([v[i] for v in x.values()]) / len(topicData)
            for i in range(0, 7)
        ]

    avgRes = {
        'resDict_non2topic': getAVGRes(resDict_non2topic),
        'resDict_mix': getAVGRes(resDict_mix),
        'resDict_transfer': getAVGRes(resDict_transfer)
    }
    allres = {
        'resDict_non2topic': resDict_non2topic,
        'resDict_mix': resDict_mix,
        'resDict_transfer': resDict_transfer,
        'topicAccuRes': topicAccuRes,
        'avgRes': avgRes
    }
    print topicAccuRes.keys()
    for line in [[x[i] for x in topicAccuRes.values()]
                 for i in range(0, len(topicAccuRes.values()[0]))]:
        print line
    print avgRes
    if not os.path.isdir(topic_addr + '/result'):
        os.mkdir(topic_addr + '/result')
    saveResult(allres,
               save_addr=topic_addr + '/result/' + datestr + '_allres.txt')
Esempio n. 6
0
def run(instance_addr='./data/out_domain/all_review_no3.txt.gz',
        vecModel_addr='./data/word_vector_data/word2vec_glove.twitter.27B.100d.txt',
        para_w=[1, 1, 1],
        save_addr='./data/result/TASC_2_topic.txt',
        topic_addr='./data/topic/final'):

    topicData = readTopicData_final()
    # ------------ non to topic
    nonTopicData = readNonTopicText(addr='./data/non_topic/nontopicTrain.txt')
    resDict_non2topic = {}
    for k, v in topicData.iteritems():
        print 'test in topic "{}"'.format(k)
        test_set, test_label = v['text'], v['label']
        train_set, train_label = nonTopicData['text'], nonTopicData['label']
        res = classificationTest(train_set,
                                 train_label,
                                 test_set,
                                 test_label,
                                 lowFreqK=10,
                                 classifier=MultinomialNB())
        resDict_non2topic[k] = res
    saveResult(resDict_non2topic,
               save_addr=topic_addr + '/result/' + 'non2topic.txt')
    # ------------ transfer
    # 这里放在console执行时,要显示设置路径。console设置的默认当前路径是项目所在路径而不是py文件路径

    tasc = TASC(instance_addr=instance_addr, vecModel_addr=vecModel_addr)
    resDict_tasc_mix = {}
    size = 1200
    for k, v in topicData.iteritems():
        print 'test in topic "{}"'.format(k)
        selected_num = len(v) * 5 if len(v) > size else size * 5
        shortlist_num = selected_num * 2
        selected_instances = tasc.get_instance_TASC(k, v, selected_num,
                                                    shortlist_num)
        print 'len(selected_instances): ', len(selected_instances)

        test_set, test_label = v['text'], v['label']
        train_set, train_label = pd.concat([
            nonTopicData['text'], selected_instances['text']
        ]), pd.concat([nonTopicData['label'], selected_instances['label']])
        res = classificationTest(train_set,
                                 train_label,
                                 test_set,
                                 test_label,
                                 lowFreqK=10,
                                 classifier=MultinomialNB())  #
        resDict_tasc_mix[k] = res
    saveResult(resDict_tasc_mix,
               save_addr=topic_addr + '/result/all/' + str(size) +
               '_TASC_topic_transfer.txt')
    #-----------
    resDict_tasc_trans = {}
    for k, v in topicData.iteritems():
        print 'test in topic "{}"'.format(k)
        selected_num = len(v) * 5 if len(v) > size else size * 5
        shortlist_num = selected_num * 2
        selected_instances = tasc.get_instance_TASC(k, v, selected_num,
                                                    shortlist_num)
        print 'len(selected_instances): ', len(selected_instances)
        test_set, test_label = v['text'], v['label']
        train_set, train_label = selected_instances[
            'text'], selected_instances['label']
        res = classificationTest(train_set,
                                 train_label,
                                 test_set,
                                 test_label,
                                 lowFreqK=10,
                                 classifier=MultinomialNB())
        resDict_tasc_trans[k] = res

    saveResult(resDict_tasc_trans,
               save_addr=topic_addr + '/result/all/' + str(size) +
               '_TASC_topic_mix.txt')
    print resDict_non2topic
    print resDict_tasc_trans
    print resDict_tasc_mix