Example #1
# Imports assumed by this snippet (not shown on the source page); _test_LDA,
# jaccard and svmtopics are helpers defined elsewhere in the same project.
import os
import time

import numpy as np


def main(*x, **r):
    # x: candidate LDA parameters [k, alpha, beta]; r: run configuration
    # (file, term, data_samples, target, tune).
    start_time = time.time()
    base = '/share/aagrawa8/Data/SE/'
    #base = '/home/amrit/GITHUB/LDAClassification/results/SE/'
    path = os.path.join(base, 'jaccard_tune_grow_oracle', r['file'],
                        str(r['term']))
    #path = os.path.join(base, 'untuned_svm_topics_smote', r['file'], str(r['term']))
    if not os.path.exists(path):
        os.makedirs(path)
    l = np.asarray(x)
    b = int(l[0])
    path1 = os.path.join(path, "K_%d_a_%s_b_%s.txt" % (b, l[1], l[2]))
    # "w" mode already truncates, so this simply creates/empties the log file.
    with open(path1, "w") as f:
        f.truncate()

    topics, tops, word, corpus, tar, log = _test_LDA(
        l,
        path1,
        file=r['file'],
        data_samples=r['data_samples'],
        target=r['target'])

    top = []
    fscore = svmtopics.main(data=tops,
                            file=r['file'],
                            target=tar,
                            tune=r['tune'])
    # Drop non-ASCII characters from each topic string before scoring.
    for i in topics:
        top.append(str(i.encode('ascii', 'ignore')))
    a = jaccard(b, score_topics=top, term=r['term'])
    with open(path1, 'a+') as fo:
        fo.write("\nScore: " + str(a))
        fo.write("\nRuntime: --- %s seconds ---\n" % (time.time() - start_time))

    return a, fscore
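
A minimal usage sketch (hypothetical values; docs and labels stand in for a corpus loaded with the project's readfile1 helper). The positional arguments are the LDA parameters k, alpha, beta; the keywords carry the run configuration:

# Hypothetical call: k=10 topics, alpha=0.1, beta=0.01, stability term 7.
jaccard_score, f_score = main(10, 0.1, 0.01,
                              file='SE0', term=7,
                              data_samples=docs, target=labels,
                              tune='no')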
Example #2
# Assumes the module-level imports/helpers of the surrounding project file:
# time, random, pickle, numpy as np, from random import shuffle, lda,
# CountVectorizer, csr_matrix, readfile1, split_two, l2normalize, DE, svmtopics.
def _topics(res=''):
    #fileB = ['pitsA', 'pitsB', 'pitsC', 'pitsD', 'pitsE', 'pitsF', 'processed_citemap.txt']
    #fileB = ['SE0', 'SE6', 'SE1', 'SE8', 'SE3']
    filepath = '/share/aagrawa8/Data/SE/'
    start_time = time.time()
    #filepath='/Users/amrit/GITHUB/LDAClassification/dataset/SE/'

    random.seed(1)
    global bounds
    cross_tune = 'no'
    grow_oracle = 'yes'
    data_samples, labellist = readfile1(filepath + str(res) + '.txt')
    split = split_two(corpus=data_samples, label=labellist)
    pos = np.array(split['pos'])
    neg = np.array(split['neg'])
    cut_pos = int(len(pos) * 80 / 100)
    cut_neg = int(len(neg) * 80 / 100)
    ## lists of f2 scores
    tunedlis = []
    untunedlis = []
    # dictionary containing bestone, time for 1 run, f2
    cross = {}
    # dictionary containing cross, lis, full time
    file = {}
    for folds in range(5):
        start_time1 = time.time()
        # Materialize the index ranges so shuffle() can permute them in place
        # (a Python 3 range object cannot be shuffled).
        pos_shuffle = list(range(len(pos)))
        neg_shuffle = list(range(len(neg)))
        shuffle(pos_shuffle)
        shuffle(neg_shuffle)
        pos = pos[pos_shuffle]
        neg = neg[neg_shuffle]

        data_train = list(pos)[:cut_pos] + list(neg)[:cut_neg]
        train_label = ['pos'] * cut_pos + ['neg'] * cut_neg
        data_test = list(pos)[cut_pos:] + list(neg)[cut_neg:]
        test_label = ['pos'] * (len(pos) - cut_pos) + ['neg'] * (len(neg) - cut_neg)

        # stability score format dict, file,lab=score
        # parameter variations (k,a,b), format, list of lists, file,lab=[[k,a,b], Rn score, fscore]
        #final_para_dic={}
        # final paras and scores, file, lab=[[k,a,b],[r, f1]]
        de = DE(F=0.7, CR=0.3, x='rand')

        global max_fitness
        max_fitness = 0
        pop = [[random.randint(bounds[0][0], bounds[0][1]), random.uniform(bounds[1][0], bounds[1][1]),
                        random.uniform(bounds[2][0], bounds[2][1])]
                       for _ in range(10)]
        v, score, final_para_dic = de.solve(main, pop, iterations=3,
                                            file=res, term=7,
                                            data_samples=data_train,
                                            target=train_label, tune='yes')
        ## score is a list of [jaccard, fscore]
        bestone = [v, score]
        # runtime, format: dict, file = runtime in secs
        l = bestone
        tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
        tf = tf_vectorizer.fit_transform(data_train+data_test)
        lda1 = lda.LDA(n_topics=int(l[0][0]), alpha=l[0][1], eta=l[0][2], n_iter=200)
        lda1.fit_transform(tf)
        tops = lda1.doc_topic_
        tops = csr_matrix(tops)
        tops = l2normalize(tops).toarray()
        f2 = svmtopics.main(data=tops, file=res,
                            target=train_label + test_label, tune='no')
        tunedlis.append(f2)

        ## untuned experiment
        lda2 = lda.LDA(n_topics=20, alpha=0.1, eta=0.01, n_iter=200)
        lda2.fit_transform(tf)
        tops1 = lda2.doc_topic_
        tops1 = csr_matrix(tops1)
        tops1 = l2normalize(tops1).toarray()
        untuned_f2 = svmtopics.main(data=tops1, file=res, target=train_label + test_label, tune='no')
        untunedlis.append(untuned_f2)

        time2 = time.time() - start_time1
        cross[folds] = [bestone, time2, f2, untuned_f2]

        print("\nRuntime for 1 loop of DE termination: --- %s seconds ---\n" % (time2))
    time1 = time.time() - start_time
    print(tunedlis)
    print(untunedlis)
    file[res] = [cross, tunedlis, untunedlis, time1]
    print("\nTotal Runtime: --- %s seconds ---\n" % (time.time() - start_time))
    with open('dump/DE_jaccard_tune_grow_oracle' + res + '.pickle', 'wb') as handle:
        pickle.dump(file, handle)


    ## untuned experiment (the commented-out block that followed here is
    ## truncated in the source listing)
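
The l2normalize helper used above is not shown on this page; from its call sites (a scipy csr_matrix in, a sparse matrix out that is then densified) it appears to scale each document-topic row to unit Euclidean length. A minimal sketch under that assumption:

from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize

def l2normalize(mat):
    # Hypothetical stand-in for the project's helper: rescale every row of
    # the sparse document-topic matrix to unit L2 norm.
    return csr_matrix(normalize(mat, norm='l2', axis=1))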
Example #3
# Assumes the same module-level imports/helpers as Example #2 (readfile1,
# split_two, DE, svmtopics, lda, CountVectorizer, the bounds global, etc.).
def _topics(res=''):
    #fileB = ['pitsA', 'pitsB', 'pitsC', 'pitsD', 'pitsE', 'pitsF', 'processed_citemap.txt']
    #fileB = ['SE0', 'SE6', 'SE1', 'SE8', 'SE3']
    filepath = '/share/aagrawa8/Data/SE/'
    start_time = time.time()
    #filepath='/Users/amrit/GITHUB/LDAClassification/dataset/SE/'

    data_samples, labellist = readfile1(filepath + str(res) + '.txt')
    split = split_two(corpus=data_samples, label=labellist)
    pos = split['pos']
    neg = split['neg']
    cut_pos = int(len(pos) * 80 / 100)
    cut_neg = int(len(neg) * 80 / 100)
    data_train, train_label = pos[:cut_pos] + neg[:cut_neg], [
        'pos'
    ] * cut_pos + ['neg'] * cut_neg
    data_test, test_label = pos[cut_pos:] + neg[cut_neg:], ['pos'] * (
        len(pos) - cut_pos) + ['neg'] * (len(neg) - cut_neg)
    labels = [7]  #[1, 2, 3, 4, 5, 6, 7, 8, 9]
    random.seed(1)
    global bounds
    # stability score format dict, file,lab=score
    result = {}
    # parameter variations (k,a,b), format, list of lists, file,lab=[[k,a,b], Rn score, fscore]
    final_para_dic = {}
    # final paras and scores, file, lab=[[k,a,b],[r, f1]]
    bestone = {}
    de = DE(F=0.7, CR=0.3, x='rand')
    temp1 = {}
    temp2 = {}
    temp3 = {}
    for lab in labels:
        global max_fitness
        max_fitness = 0
        #print(res+'\t'+str(lab))
        pop = [[
            random.randint(bounds[0][0], bounds[0][1]),
            random.uniform(bounds[1][0], bounds[1][1]),
            random.uniform(bounds[2][0], bounds[2][1])
        ] for _ in range(10)]
        v, score, l = de.solve(main,
                               pop,
                               iterations=5,
                               file=res,
                               term=lab,
                               data_samples=data_train,
                               target=train_label,
                               tune='yes')
        temp1[lab] = l
        ##score is a list of [jaccard and fscore]
        print(v, '->', score)
        temp3[lab] = score
        temp2[lab] = [v, score]
    result[res] = temp3
    final_para_dic[res] = temp1
    bestone[res] = temp2

    print(result)
    print(bestone)
    print(final_para_dic)
    time1 = {}

    # runtime,format dict, file,=runtime in secs
    time1[res] = time.time() - start_time
    # Best [parameters, score] found by DE for term 7.
    l = bestone[res][7]
    print(l)
    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=2,
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(data_train + data_test)
    lda1 = lda.LDA(n_topics=int(l[0][0]),
                   alpha=l[0][1],
                   eta=l[0][2],
                   n_iter=200)
    lda1.fit_transform(tf)
    tops = lda1.doc_topic_
    fscore = {}
    fscore[res] = svmtopics.main(data=tops,
                                 file=res,
                                 target=train_label + test_label,
                                 tune='no')
    print(fscore)
    temp = [result, final_para_dic, bestone, time1, fscore]
    with open('dump/DE_class_topics_' + res + '.pickle', 'wb') as handle:
        pickle.dump(temp, handle)
    print("\nTotal Runtime: --- %s seconds ---\n" % (time1[res]))

    ## untuned experiment (the commented-out block that followed here is
    ## truncated in the source listing)
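
A hypothetical driver, using the dataset names from the commented-out fileB lists above, would run this tuning end to end per dataset:

# Hypothetical entry point; the real runner lives elsewhere in the project.
if __name__ == '__main__':
    for res in ['SE0', 'SE1', 'SE3', 'SE6', 'SE8']:
        _topics(res=res)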
Example #4
# Assumes the module-level helpers of the surrounding project file, including
# cut_position, divide_train_test and the bound/bounds globals (plausible
# reconstructions of the first two are sketched after this example).
def _topics(res=''):
    # fileB = ['pitsA', 'pitsB', 'pitsC', 'pitsD', 'pitsE', 'pitsF', 'processed_citemap.txt']
    # fileB = ['SE0', 'SE6', 'SE1', 'SE8', 'SE3']
    filepath = '/share/aagrawa8/Data/SE/'
    start_time = time.time()
    #filepath = '/Users/amrit/GITHUB/LDAClassification/dataset/SE/'

    random.seed(1)
    global bounds
    cross_tune = 'no'
    grow_oracle = 'yes'
    data_samples, labellist = readfile1(filepath + str(res) + '.txt')
    split = split_two(corpus=data_samples, label=labellist)
    pos = np.array(split['pos'])
    neg = np.array(split['neg'])

    cut_pos, cut_neg = cut_position(pos, neg, percentage=20)
    ##list of f2 scores
    untuned_lis = []
    tuned_lis = []
    # dictionary containing bestone, time for 1 run, f2
    cross = {}
    # dictionary containing cross, lis,full time
    file = {}
    for folds in range(5):
        start_time1 = time.time()
        # Materialize the index ranges so shuffle() can permute them in place
        # (a Python 3 range object cannot be shuffled).
        pos_shuffle = list(range(len(pos)))
        neg_shuffle = list(range(len(neg)))
        shuffle(pos_shuffle)
        shuffle(neg_shuffle)
        pos = pos[pos_shuffle]
        neg = neg[neg_shuffle]
        data_train, train_label, data_test, test_label = divide_train_test(
            pos, neg, cut_pos, cut_neg)

        de = DE(F=0.7, CR=0.3, x='rand')

        global max_fitness
        max_fitness = 0
        pop = [[
            random.randint(bounds[0][0], bounds[0][1]),
            random.uniform(bounds[1][0], bounds[1][1]),
            random.uniform(bounds[2][0], bounds[2][1])
        ] for _ in range(10)]
        v, score, final_para_dic = de.solve(main,
                                            pop,
                                            iterations=3,
                                            bounds=bounds,
                                            file=res,
                                            term=7,
                                            data_samples=data_train,
                                            target=train_label,
                                            tune='yes')
        ##score is a list of [jaccard and fscore]
        bestone = [v, score]
        # runtime,format dict, file,=runtime in secs
        l = bestone

        tf_vectorizer = CountVectorizer(max_df=0.95,
                                        min_df=2,
                                        stop_words='english')
        tf = tf_vectorizer.fit_transform(data_train + data_test)
        lda1 = lda.LDA(n_topics=int(l[0][0]),
                       alpha=l[0][1],
                       eta=l[0][2],
                       n_iter=200)
        lda1.fit_transform(tf)
        tops = lda1.doc_topic_

        split = split_two(corpus=tops,
                          label=np.array(train_label + test_label))
        pos1 = np.array(split['pos'])
        neg1 = np.array(split['neg'])
        data_train, train_label, data_test, test_label = divide_train_test(
            pos1, neg1, cut_pos, cut_neg)

        ## Run with default features
        perc = len(train_label) * 100 / len(train_label + test_label)

        weight_length = int(l[0][0])
        # bound is a module-level list holding one (low, high) range;
        # replicate it so DE gets one dimension per topic weight.
        new_bounds = bound * weight_length
        pop1 = [1.0 for _ in range(weight_length)]
        f21 = svmtopics.main(*pop1,
                             data=data_train + data_test,
                             target=train_label + test_label,
                             tune='no',
                             percentage=perc)
        untuned_lis.append(f21)
        time2 = time.time() - start_time1
        bestone.append(time2)

        split1 = split_two(corpus=data_test, label=np.array(test_label))
        pos1 = np.array(split1['pos'])
        neg1 = np.array(split1['neg'])
        cut_pos1, cut_neg1 = cut_position(pos1, neg1, percentage=50)
        data_grow, grow_label, data_test, test_label = divide_train_test(
            pos1, neg1, cut_pos1, cut_neg1)

        start_time2 = time.time()

        ## Another DE to find the magic weights
        max_fitness = 0
        pop = [[
            random.uniform(bound[0][0], bound[0][1])
            for _ in range(weight_length)
        ] for _ in range(10)]
        perc1 = (len(train_label) +
                 len(grow_label) / 2) * 100 / len(train_label + grow_label)
        v, score, final_para_dic = de.solve(svmtopics.main,
                                            pop,
                                            iterations=6,
                                            bounds=new_bounds,
                                            data=data_train + data_grow,
                                            target=train_label + grow_label,
                                            tune='no',
                                            percentage=perc1)
        bestone1 = [v, score]
        perc1 = (len(train_label) + len(grow_label)
                 ) * 100 / len(train_label + grow_label + test_label)

        #testing the modified features.
        f22 = svmtopics.main(*v,
                             data=data_train + data_grow + data_test,
                             target=train_label + grow_label + test_label,
                             tune='no',
                             percentage=perc1)
        time3 = time.time() - start_time2
        bestone1.append(time3)
        tuned_lis.append(f22)
        cross[folds] = [bestone, bestone1, f21, f22]

        print("\nRuntime for 1 loop of DE termination: --- %s seconds ---\n" %
              (time2 + time3))
    time1 = time.time() - start_time
    file[res] = [cross, untuned_lis, tuned_lis, time1]
    print(file[res])
    print("\nTotal Runtime: --- %s seconds ---\n" % (time.time() - start_time))
    with open('dump/DE_magic_weights_' + res + '.pickle', 'wb') as handle:
        pickle.dump(file, handle)
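
cut_position and divide_train_test are project helpers not shown on this page. From their call sites here, and from the inline equivalent in Example #2, they plausibly compute the per-class train cut points for a hold-out percentage and then slice the shuffled pos/neg arrays. A sketch under those assumptions:

def cut_position(pos, neg, percentage=20):
    # Hypothetical reconstruction: hold out `percentage` percent of each class.
    cut_pos = int(len(pos) * (100 - percentage) / 100)
    cut_neg = int(len(neg) * (100 - percentage) / 100)
    return cut_pos, cut_neg

def divide_train_test(pos, neg, cut_pos, cut_neg):
    # Hypothetical reconstruction mirroring the inline version in Example #2.
    data_train = list(pos)[:cut_pos] + list(neg)[:cut_neg]
    train_label = ['pos'] * cut_pos + ['neg'] * cut_neg
    data_test = list(pos)[cut_pos:] + list(neg)[cut_neg:]
    test_label = ['pos'] * (len(pos) - cut_pos) + ['neg'] * (len(neg) - cut_neg)
    return data_train, train_label, data_test, test_label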
Example #5
# Assumes the same module-level imports/helpers as the earlier examples
# (readfile1, DE, svmtopics, lda, CountVectorizer, the bounds global, etc.).
def _topics(res=''):
    #fileB = ['pitsA', 'pitsB', 'pitsC', 'pitsD', 'pitsE', 'pitsF', 'processed_citemap.txt']
    #fileB = ['SE0', 'SE6', 'SE1', 'SE8', 'SE3']
    filepath = '/share/aagrawa8/Data/SE/'
    start_time = time.time()
    #filepath='/home/amrit/GITHUB/LDAClassification/dataset/SE/'

    data_samples, labellist = readfile1(filepath + str(res) + '.txt')
    labels = [7]  #[1, 2, 3, 4, 5, 6, 7, 8, 9]
    random.seed(1)
    global bounds
    # stability score format dict, file,lab=score
    result = {}
    # parameter variations (k,a,b), format, dict, file,lab,each score=k,a,b
    final_para_dic = {}
    # final generation, format dict, file,lab=parameter, score
    final_current_dic = {}
    de = DE(F=0.7, CR=0.3, x='rand')
    temp1 = {}
    temp2 = {}
    temp3 = {}
    for lab in labels:
        global max_fitness
        max_fitness = 0
        #print(res+'\t'+str(lab))
        pop = [[
            random.randint(bounds[0][0], bounds[0][1]),
            random.uniform(bounds[1][0], bounds[1][1]),
            random.uniform(bounds[2][0], bounds[2][1])
        ] for _ in range(10)]
        v, score, para_dict, gen = de.solve(main,
                                            pop,
                                            iterations=3,
                                            file=res,
                                            term=lab,
                                            data_samples=data_samples,
                                            target=labellist)
        temp1[lab] = para_dict
        #temp2[lab]=gen
        #print(v, '->', score)

        temp3[lab] = score
    result[res] = temp3
    final_para_dic[res] = temp1
    #final_current_dic[res]=temp2
    print(result)
    #print(final_current_dic)
    print(final_para_dic)
    time1 = {}

    ## Run LDA again with the best-scoring parameters (the stability score
    ## is the key into final_para_dic).
    l = final_para_dic[res][7][result[res][7]]
    print(l)
    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=2,
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(data_samples)
    lda1 = lda.LDA(n_topics=int(l[0][0]),
                   alpha=l[0][1],
                   eta=l[0][2],
                   n_iter=100)
    lda1.fit_transform(tf)
    tops = lda1.doc_topic_
    fscore = {}
    fscore[res] = svmtopics.main(data=tops, file=res, target=labellist)

    # runtime,format dict, file,=runtime in secs
    time1[res] = time.time() - start_time
    temp = [result, final_para_dic, time1, fscore]
    with open('dump/DE_class_topics_' + res + '.pickle', 'wb') as handle:
        pickle.dump(temp, handle)
    print("\nTotal Runtime: --- %s seconds ---\n" % (time1[res]))

    ## untuned experiment (the commented-out block that followed here is
    ## truncated in the source listing)
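
The bounds global read by every _topics variant is also not shown on this page. From how each population member is built (one random integer plus two uniform draws), it is presumably three (low, high) pairs covering the topic count k, alpha, and beta. Illustrative values only:

# Hypothetical search ranges for [k, alpha, beta]; the real values are
# defined in the surrounding project file.
bounds = [(10, 100), (0.1, 1.0), (0.1, 1.0)]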