def Estimate(filename, old_files=None, n=10, learner='lr'):
    """Predict the number of positive items of the target project, n times.

    A per-project table of total row counts and 'yes' row counts is built
    from ``read.body``.  The row whose index equals ``filename`` without its
    extension is held out; a classifier is fitted on the remaining projects
    (feature: total count, target: positive count) and asked to predict the
    held-out project.  This is repeated with seeds ``0 .. n-1``.

    Args:
        filename: data file of the target project; its stem must be a value
            of the 'projectname' column.
        old_files: previously reviewed files forwarded to ``MAR.create``.
        n: number of repetitions (one seed per repetition).
        learner: one of 'lr', 'dt', 'svm_linear', 'nbm'.

    Returns:
        List with one prediction per repetition.

    Raises:
        ValueError: if ``learner`` is not one of the supported names.
    """
    # Avoid a mutable default argument shared between calls.
    if old_files is None:
        old_files = []

    read = MAR()
    read = read.create(filename, old_files)

    # The count table does not depend on the seed, so build it only once.
    labels = read.body[['projectname', 'label']]
    positives = labels.loc[labels['label'] == 'yes']
    total_df = labels.groupby(['projectname']).count()
    yes_df = positives.groupby(['projectname']).count()
    df = pd.DataFrame()
    df[['total']] = total_df[['label']]
    df[['pos']] = yes_df[['label']]
    # Projects without any 'yes' row are missing from yes_df and would show
    # up as NaN here; their positive count is zero.
    df['pos'] = df['pos'].fillna(0).astype(int)

    test_file = filename.rsplit('.', 1)[0]
    test_series = df.loc[test_file]
    train_df = df.drop([test_file])
    x_train = np.reshape(list(train_df.total.values), (-1, 1))
    y_train = list(train_df.pos.values)
    # Estimators expect a 2-D (n_samples, n_features) array, not a scalar.
    x_test = np.reshape([test_series['total']], (-1, 1))

    result = []
    for i in range(n):
        np.random.seed(i)
        if learner == 'lr':
            clf = LogisticRegression(random_state=i)
        elif learner == 'dt':
            clf = DecisionTreeClassifier(random_state=i)
        elif learner == 'svm_linear':
            clf = svm.SVC(kernel='linear', random_state=i)
        elif learner == 'nbm':
            clf = MultinomialNB(alpha=1)
        else:
            raise ValueError("unknown learner: %r" % (learner,))
        clf.fit(x_train, y_train)
        result.append(clf.predict(x_test)[0])

    print(test_file, result)
    return result
def START_AUTO(filename):
    """Run a simulated review of ``filename`` with an automatic stop rule.

    While no positive has been found, items returned by ``random()`` are
    coded with their true label; afterwards the third element returned by
    ``train()`` is coded instead.  Once at least 10 positives are known, the
    loop ends after 3 consecutive rounds in which the positive count did not
    change.

    Returns:
        The ``MAR`` object after the simulation.
    """
    max_life = 3
    reader = MAR().create(filename)
    remaining = max_life
    previous_pos = 0
    while True:
        pos, neg, _ = reader.get_numbers()
        print("%d/ %d" % (pos, pos + neg))
        if pos >= 10:
            if pos != previous_pos:
                remaining = max_life
            else:
                remaining -= 1
                if remaining == 0:
                    break
        if pos == 0:
            batch = reader.random()
        else:
            _, _, batch, _ = reader.train()
        for doc in batch:
            reader.code(doc, reader.body["label"][doc])
        previous_pos = pos
    return reader
def REUSE_RANDOM(filename, old):
    """Simulate a review seeded with ``old`` using ``train_reuse_random``.

    Items proposed by ``train_reuse_random()`` are coded with their true
    label until 90% of ``get_allpos()`` positives have been found.

    Returns:
        The ``MAR`` object after the simulation.
    """
    stop = 0.9
    reader = MAR().create(filename)
    reader.create_old(old)
    target = int(reader.get_allpos() * stop)
    while True:
        pos, _, _ = reader.get_numbers()
        if pos >= target:
            return reader
        _, _, batch, _ = reader.train_reuse_random()
        for doc in batch:
            reader.code(doc, reader.body["label"][doc])
def UPDATE(filename, old, pne=False, cl="RF"):
    """Simulate a review of ``filename`` that starts from the ``old`` data.

    Each round prints progress, then codes the items proposed by
    ``train(pne=pne, cl=cl)`` with their true label.  The loop ends when the
    number of positives found reaches ``get_allpos()``.

    Returns:
        The ``MAR`` object after the simulation.
    """
    stop = 1
    reader = MAR().create(filename)
    reader.create_old(old)
    target = int(reader.get_allpos() * stop)
    while True:
        pos, neg, _ = reader.get_numbers()
        print("%d/ %d" % (pos, pos + neg))
        if pos >= target:
            return reader
        _, _, batch, _ = reader.train(pne=pne, cl=cl)
        for doc in batch:
            reader.code(doc, reader.body["label"][doc])
def LINEAR(filename):
    """Code items returned by ``random()`` until fewer than 10 are uncoded.

    Returns:
        The ``MAR`` object after the simulation.
    """
    reader = MAR().create(filename)
    while True:
        pos, neg, total = reader.get_numbers()
        uncoded = total - (pos + neg)
        if uncoded < 10:
            return reader
        for doc in reader.random():
            reader.code(doc, reader.body["label"][doc])
def LOC(filename):
    """Code items in the order given by ``loc_sort()`` until all positives
    reported by ``get_allpos()`` have been found.

    Progress ("found, reviewed") is printed every round.

    Returns:
        The ``MAR`` object after the simulation.
    """
    stop = 1
    reader = MAR().create(filename)
    target = int(reader.get_allpos() * stop)
    while True:
        pos, neg, _ = reader.get_numbers()
        print("%d, %d" % (pos, pos + neg))
        if pos >= target:
            return reader
        for doc in reader.loc_sort():
            reader.code(doc, reader.body["label"][doc])
def UPDATE_AUTO(filename, old, pne=True):
    """Simulate a review seeded with ``old`` that stops automatically.

    Every round codes the items proposed by ``train(pne)`` with their true
    label.  The loop ends after 5 consecutive rounds in which the number of
    positives found did not change.

    Returns:
        The ``MAR`` object after the simulation.
    """
    max_life = 5
    reader = MAR().create(filename)
    reader.create_old(old)
    remaining = max_life
    previous_pos = -1
    while True:
        pos, neg, _ = reader.get_numbers()
        print("%d/ %d" % (pos, pos + neg))
        if pos != previous_pos:
            remaining = max_life
        else:
            remaining -= 1
            if remaining == 0:
                break
        _, _, batch, _ = reader.train(pne)
        for doc in batch:
            reader.code(doc, reader.body["label"][doc])
        previous_pos = pos
    return reader
from __future__ import print_function, division import os import sys root = os.getcwd().split("src")[0] + "src/src/util" sys.path.append(root) from flask import Flask, url_for, render_template, request, jsonify, Response, json from pdb import set_trace from mar import MAR app = Flask(__name__, static_url_path='/static') global target target = MAR() global clf clf = [] @app.route('/hello/') def hello(): return render_template('hello.html') @app.route('/load', methods=['POST']) def load(): global target file = request.form['file'] target = target.create(file) pos, neg, total = target.get_numbers()
def START(filename, cl="linear"):
    """Simulate a review that bootstraps with ``loc_sort`` and then trains.

    While nothing has been found, or the cost is below 40, items from
    ``loc_sort()`` are coded with their true label; afterwards the items
    proposed by ``train(cl=cl)`` are coded.  The loop ends when the number
    found reaches ``get_allbugs()``; ``plot()`` is then called.

    Args:
        filename: data file handed to ``MAR.create``.
        cl: classifier name forwarded to ``MAR.train``.

    Returns:
        The ``MAR`` object after the simulation.
    """
    stop = 1
    thres = 40
    read = MAR()
    read = read.create(filename)
    target = int(read.get_allbugs() * stop)
    while True:
        found, cost, _ = read.get_numbers()
        try:
            print("%d, %d" % (found, cost))
        except Exception:
            # Progress output is best effort; unlike a bare ``except`` this
            # still lets KeyboardInterrupt / SystemExit through.
            pass
        if found >= target:
            break
        if found == 0 or cost < thres:
            batch = read.loc_sort()
        else:
            batch, _ = read.train(cl=cl)
        for doc_id in batch:
            read.code(doc_id, read.body["label"][doc_id])
    read.plot()
    # The leftover pdb breakpoint that used to sit here was removed so that
    # the function can run unattended.
    return read
def test_semi(filename="Hall.csv"):
    """Debug harness: seed a review with sampled labels, train, then break.

    Samples 20 'yes' and 200 'no' rows without replacement, codes them via
    ``code_error``, enables estimation, trains once and drops into the
    debugger so the training outputs can be inspected.
    """
    num = 20
    read = MAR().create(filename)
    labels = np.array(read.body['label'])
    poses = np.where(labels == "yes")[0]
    negs = np.where(labels == "no")[0]
    pos_sel = np.random.choice(poses, num, replace=False)
    neg_sel = np.random.choice(negs, num * 10, replace=False)
    # Positives first, then negatives -- same order as coding them in turn.
    for doc in np.concatenate((pos_sel, neg_sel)):
        read.code_error(doc)
    read.enable_est = True
    read.get_numbers()
    a, b, c, d = read.train()
    set_trace()
def test(filename):
    """Time ``train_para`` against ``train`` for growing labelled sets.

    For i in 0..9 a fresh ``MAR`` is created with 5 partitions, 10*(i+1)
    'yes' rows and ten times as many 'no' rows are sampled and coded via
    ``code_error``, and the wall-clock duration of ``train_para()`` and of
    ``train()`` is printed.
    """
    p = 5
    # ``range`` instead of ``xrange`` so the function also runs on Python 3.
    for i in range(10):
        num = 10 * (i + 1)
        read = MAR()
        read = read.create(filename, partitions=p)
        labels = np.array(read.body['label'])
        poses = np.where(labels == "yes")[0]
        negs = np.where(labels == "no")[0]
        pos_sel = np.random.choice(poses, num, replace=False)
        neg_sel = np.random.choice(negs, num * 10, replace=False)
        for doc_id in pos_sel:
            read.code_error(doc_id)
        for doc_id in neg_sel:
            read.code_error(doc_id)

        read.get_numbers()
        start = time.time()
        read.train_para()
        duration = time.time() - start
        print(duration)

        read.get_numbers()
        start = time.time()
        read.train()
        duration2 = time.time() - start
        print(duration2)
def TEST_AL(filename, old_files=None, stop='est', stopat=1, error='none',
            interval=100000, starting=1, seed=0, step=10):
    """Simulate an active-learning review with optional estimation stop.

    Until ``starting`` positives are found, items from ``random()`` are
    coded via ``code_error``; afterwards the third element returned by
    ``train(weighting=True, pne=True)`` is coded.  The loop ends when every
    item is coded, or when ``pos >= target`` and
    ``est_num * stopat <= pos``.

    Args:
        filename: data file handed to ``MAR.create``.
        old_files: previously reviewed files forwarded to ``MAR.create``.
        stop: 'est' enables ``read.enable_est``; anything else disables it.
        stopat: target recall; converted with ``float``.
        error: error mode forwarded to ``code_error``.
        interval: stored on ``read.interval``.
        starting: number of positives required before training starts.
        seed: seed for ``np.random``.
        step: stored on ``read.step``.

    Returns:
        The ``MAR`` object after the simulation.
    """
    # Avoid a mutable default argument shared between calls.
    if old_files is None:
        old_files = []
    stopat = float(stopat)
    thres = 0
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename, old_files)
    read.step = step
    read.interval = interval
    target = int(read.get_allpos() * stopat)
    read.enable_est = (stop == 'est')

    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except Exception:
            # Presumably est_num is not available yet; fall back to the
            # short form without swallowing KeyboardInterrupt.
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            break

        if pos < starting or pos + neg < thres:
            for doc_id in read.random():
                read.code_error(doc_id, error=error)
        else:
            _, _, certain, _ = read.train(weighting=True, pne=True)
            if pos >= target and read.est_num * stopat <= pos:
                break
            for doc_id in certain:
                read.code_error(doc_id, error=error)
    return read
def Codes(filename, code):
    """Simulate a review whose strategy is selected by letters in ``code``.

    Letters read from ``code``:
        'P': require 5 positives (instead of 1) before training starts.
        'W' or 'M': pass ``weighting=True`` to ``train``.
        'U': while fewer than 30 positives are known, code the first
             element returned by ``train`` instead of the third.
        'S': after training, walk the fifth element returned by ``train``
             in chunks of ``read.step`` without retraining.
        neither 'A' nor 'M': set ``read.enough`` to 100000.

    The loop ends when 95% of ``get_allpos()`` positives are found.

    Returns:
        The ``MAR`` object after the simulation.
    """
    stop = 0.95
    thres = 0
    starting = 5 if "P" in code else 1
    weighting = "W" in code or "M" in code
    uncertain = "U" in code
    stopping = "S" in code

    # A first instance is created only to call restart(); the simulation
    # itself runs on a fresh instance.
    read = MAR()
    read = read.create(filename)
    read.restart()
    read = MAR()
    read = read.create(filename)
    if not ("A" in code or "M" in code):
        read.enough = 100000

    target = int(read.get_allpos() * stop)
    while True:
        pos, neg, total = read.get_numbers()
        if pos >= target:
            break
        if pos < starting or pos + neg < thres:
            for doc_id in read.random():
                read.code(doc_id, read.body["label"][doc_id])
            continue

        a, b, c, d, e = read.train(weighting=weighting)
        if pos < 30 and uncertain:
            for doc_id in a:
                read.code(doc_id, read.body["label"][doc_id])
        elif stopping:
            now = 0
            while pos < target:
                chunk = e[now:now + read.step]
                if len(chunk) == 0:
                    # Ranking exhausted before the target was reached:
                    # leave the inner loop (it would otherwise spin
                    # forever) and let the outer loop retrain.
                    break
                for doc_id in chunk:
                    read.code(doc_id, read.body["label"][doc_id])
                pos, neg, total = read.get_numbers()
                now = now + read.step
        else:
            for doc_id in c:
                read.code(doc_id, read.body["label"][doc_id])
    return read
def active_learning(filename, query='', stop='true', stopat=0.95,
                    error='none', interval=100000, seed=0):
    """Simulate an active-learning review with selectable stopping rules.

    Until one positive is found, items from ``BM25_get()`` are coded via
    ``code_error``.  Afterwards ``train(weighting=True, pne=True)`` is
    called each round; its first element is coded while fewer than 10
    positives are known, its third element afterwards.

    Stopping rules (``stop``):
        'est':  stop when ``stopat * est_num <= pos``.
        'soft': stop after 5 consecutive rounds with ``pos >= 10`` and an
                unchanged positive count.
        'knee': stop when ``pos >= 10`` and ``read.knee()`` is true.
        other:  stop when ``pos >= int(get_allpos() * stopat)``.
    The loop also ends once every item is coded.  With ``stop == 'knee'``
    and ``error == 'random'``, items coded "no" before the knee point and
    "yes" after it are passed to ``code_error`` once more before stopping.

    Args:
        filename: data file handed to ``MAR.create``.
        query: '_'-separated query terms handed to ``BM25``.
        stop: name of the stopping rule (see above).
        stopat: target recall; converted with ``float``.
        error: error mode forwarded to ``code_error``.
        interval: stored on ``read.interval``.
        seed: seed for ``np.random``.

    Returns:
        The ``MAR`` object after the simulation.
    """
    stopat = float(stopat)
    thres = 0
    starting = 1
    counter = 0
    pos_last = 0
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename)
    read.interval = interval
    read.BM25(query.strip().split('_'))

    num2 = read.get_allpos()
    target = int(num2 * stopat)
    print("number of target, true/close here:", target)
    read.enable_est = (stop == 'est')

    def recheck_around_knee():
        """Re-code "no" items before and "yes" items after the knee point."""
        codes = np.array(read.body['code'])
        coded = np.where(codes != "undetermined")[0]
        # Order the coded items by coding time; index through a plain
        # array in both call sites for consistent positional lookup.
        seq = coded[np.argsort(np.array(read.body['time'])[coded])]
        cut = read.kneepoint * read.step
        part1 = set(seq[:cut]) & set(np.where(codes == "no")[0])
        part2 = set(seq[cut:]) & set(np.where(codes == "yes")[0])
        for doc_id in part1 | part2:
            read.code_error(doc_id, error=error)

    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except Exception:
            # Presumably est_num is not available yet; fall back to the
            # short form without swallowing KeyboardInterrupt.
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            if stop == 'knee' and error == 'random':
                recheck_around_knee()
            break

        if pos < starting or pos + neg < thres:
            for doc_id in read.BM25_get():
                read.code_error(doc_id, error=error)
            continue

        a, b, c, d = read.train(weighting=True, pne=True)
        if stop == 'est':
            if stopat * read.est_num <= pos:
                break
        elif stop == 'soft':
            if pos >= 10 and pos_last == pos:
                counter = counter + 1
            else:
                counter = 0
            pos_last = pos
            if counter >= 5:
                break
        elif stop == 'knee':
            if pos >= 10 and read.knee():
                if error == 'random':
                    recheck_around_knee()
                break
        else:
            if pos >= target:
                break

        for doc_id in (a if pos < 10 else c):
            read.code_error(doc_id, error=error)

    # The leftover pdb breakpoint that used to sit here was removed so that
    # the function can run unattended.
    return read
def ERROR(filename):
    """Load ``filename`` into a ``MAR`` object, then call ``lda()`` followed
    by ``syn_error()`` on it.  Nothing is returned."""
    reader = MAR().create(filename)
    reader.lda()
    reader.syn_error()
def REUSE(filename, old, pne=True):
    """Simulate a review seeded with ``old`` that switches trainers.

    While fewer than 5 positives are known the items proposed by
    ``train(pne)`` are coded with their true label; afterwards those
    proposed by ``train_reuse(pne)``.  The loop ends when 90% of
    ``get_allpos()`` positives are found.

    Returns:
        The ``MAR`` object after the simulation.
    """
    stop = 0.9
    thres = 5
    reader = MAR().create(filename)
    reader.create_old(old)
    target = int(reader.get_allpos() * stop)
    while True:
        pos, neg, _ = reader.get_numbers()
        print("%d/ %d" % (pos, pos + neg))
        if pos >= target:
            return reader
        trainer = reader.train if pos < thres else reader.train_reuse
        _, _, batch, _ = trainer(pne)
        for doc in batch:
            reader.code(doc, reader.body["label"][doc])
def Supervised(filename, old_files=None, stop='est', stopat=1, error='none',
               interval=100000, starting=1, seed=0, step=10):
    """Simulate a review driven by a supervised model.

    ``train_supervised()`` is called once; afterwards each round codes the
    first ``read.step`` items of ``query_supervised()`` via ``code_error``.
    The loop ends when every item is coded, or when ``pos >= target`` and
    ``est_num * stopat <= pos``.

    Args:
        filename: data file handed to ``MAR.create``.
        old_files: previously reviewed files forwarded to ``MAR.create``.
        stop: 'est' enables ``read.enable_est``; anything else disables it.
        stopat: target recall; converted with ``float``.
        error: error mode forwarded to ``code_error``.
        interval: stored on ``read.interval``.
        starting: accepted for interface compatibility; not used here.
        seed: seed for ``np.random``.
        step: stored on ``read.step``.

    Returns:
        The ``MAR`` object after the simulation.
    """
    # Avoid a mutable default argument shared between calls.
    if old_files is None:
        old_files = []
    stopat = float(stopat)
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename, old_files)
    read.step = step
    read.interval = interval
    target = int(read.get_allpos() * stopat)
    read.enable_est = (stop == 'est')

    read.train_supervised()
    # Results of these two calls are not used here; they are kept because
    # the next line reads read.est_num and read.record.
    read.get_numbers()
    read.query_supervised()
    read.record['est'][0] = read.est_num

    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except Exception:
            # Best-effort fallback without swallowing KeyboardInterrupt.
            print("%d, %d" % (pos, pos + neg))
        if pos + neg >= total:
            break
        if pos >= target and read.est_num * stopat <= pos:
            break
        for doc_id in read.query_supervised()[:read.step]:
            read.code_error(doc_id, error=error)
    return read
def TIME_START(filename):
    """Simulate a review that trains with ``train_kept``.

    While no positive is known, items from ``random()`` are coded with
    their true label; afterwards the items proposed by ``train_kept()``.
    The loop ends when 90% of ``get_allpos()`` positives are found.

    Returns:
        The ``MAR`` object after the simulation.
    """
    stop = 0.9
    reader = MAR().create(filename)
    target = int(reader.get_allpos() * stop)
    while True:
        pos, _, _ = reader.get_numbers()
        if pos >= target:
            return reader
        if pos == 0:
            batch = reader.random()
        else:
            _, _, batch, _ = reader.train_kept()
        for doc in batch:
            reader.code(doc, reader.body["label"][doc])
def START_est(filename):
    """Simulate a review and call ``cache_est()`` once 60 positives exist.

    Items from ``random()`` are coded with their true label while no
    positive is known or fewer than 40 items are coded; afterwards the
    items proposed by ``train(pne=True)``.  The first time a training round
    runs with at least 60 positives, ``cache_est()`` is called (once only).
    The loop ends when 90% of ``get_allpos()`` positives are found.

    Returns:
        The ``MAR`` object after the simulation.
    """
    stop = 0.90
    thres = 40
    # A first instance is created only to call restart(); the simulation
    # itself runs on a fresh instance.
    MAR().create(filename).restart()
    reader = MAR().create(filename)
    target = int(reader.get_allpos() * stop)
    est_pending = True
    while True:
        pos, neg, _ = reader.get_numbers()
        if pos >= target:
            return reader
        if pos == 0 or pos + neg < thres:
            batch = reader.random()
        else:
            _, _, batch, _ = reader.train(pne=True)
            if pos >= 60 and est_pending:
                reader.cache_est()
                est_pending = False
        for doc in batch:
            reader.code(doc, reader.body["label"][doc])
def export(file):
    """Build a ``MAR`` object with ``create_lda(file)`` and call its
    ``export_feature()``.  Nothing is returned."""
    reader = MAR().create_lda(file)
    reader.export_feature()
def START_LOC(filename, cl="SVM-linear"):
    """Simulate a review that bootstraps with ``loc_sort`` and then trains.

    Items from ``loc_sort()`` are coded with their true label while no
    positive is known or fewer than 40 items are coded; afterwards the
    items proposed by ``train(cl=cl)``.  The loop ends when all
    ``get_allpos()`` positives are found.

    Returns:
        The ``MAR`` object after the simulation.
    """
    stop = 1
    reader = MAR().create(filename)
    target = int(reader.get_allpos() * stop)
    while True:
        pos, neg, _ = reader.get_numbers()
        print("%d, %d" % (pos, pos + neg))
        if pos >= target:
            return reader
        if pos == 0 or pos + neg < 40:
            batch = reader.loc_sort()
        else:
            _, _, batch, _ = reader.train(cl=cl)
        for doc in batch:
            reader.code(doc, reader.body["label"][doc])
def START_DOC2VEC(filename):
    """Simulate a review using ``train(weighting=True)``.

    Items from ``random()`` are coded with their true label while no
    positive is known or fewer than 40 items are coded; afterwards the
    third of the five values returned by ``train(weighting=True)``.  The
    loop ends when 95% of ``get_allpos()`` positives are found.

    Returns:
        The ``MAR`` object after the simulation.
    """
    stop = 0.95
    thres = 40
    # A first instance is created only to call restart(); the simulation
    # itself runs on a fresh instance.
    MAR().create(filename).restart()
    reader = MAR().create(filename)
    target = int(reader.get_allpos() * stop)
    while True:
        pos, neg, _ = reader.get_numbers()
        print("%d, %d" % (pos, pos + neg))
        if pos >= target:
            return reader
        if pos == 0 or pos + neg < thres:
            batch = reader.random()
        else:
            _, _, batch, _, _ = reader.train(weighting=True)
        for doc in batch:
            reader.code(doc, reader.body["label"][doc])
def UPDATE_REUSE(filename, old):
    """Simulate a review seeded with ``old`` that may switch to reuse.

    A life counter is reset whenever the positive count changes and
    decremented otherwise.  When at least 5 positives are known and the
    counter is below 1, ``train_reuse()`` is used and the reset value is
    set to 0; otherwise ``train()`` is used.  The loop ends when 90% of
    ``get_allpos()`` positives are found.

    Returns:
        The ``MAR`` object after the simulation.
    """
    stop = 0.9
    thres = 5
    reader = MAR().create(filename)
    reader.create_old(old)
    target = int(reader.get_allpos() * stop)
    max_life = 2
    remaining = max_life
    previous_pos = 0
    while True:
        pos, _, _ = reader.get_numbers()
        if pos - previous_pos:
            remaining = max_life
        else:
            remaining -= 1
        previous_pos = pos
        if pos >= target:
            return reader
        if pos >= thres and remaining < 1:
            # Once reuse has started, the counter is never refilled above 0.
            max_life = 0
            _, _, batch, _ = reader.train_reuse()
        else:
            _, _, batch, _ = reader.train()
        for doc in batch:
            reader.code(doc, reader.body["label"][doc])
def active_learning(filename, query='', stop='true', stopat=1.00,
                    error='none', interval=100000, seed=0):
    """Simulate an active-learning review with selectable stopping rules.

    Until one positive is found, items from ``BM25_get()`` are coded via
    ``code_error``.  Afterwards ``train(weighting=True, pne=True)`` is
    called each round; its first element is coded while fewer than 10
    positives are known, its third element afterwards.

    Stopping rules (``stop``):
        'est':  stop when ``stopat * est_num <= pos``.
        'soft': stop after 5 consecutive rounds with ``pos >= 10`` and an
                unchanged positive count.
        'knee': stop when ``pos >= 10`` and ``read.knee()`` is true.
        other:  stop when ``pos >= int(get_allpos() * stopat)``.
    The loop also ends once every item is coded.  With ``stop == 'knee'``
    and ``error == 'random'``, items coded "no" before the knee point and
    "yes" after it are passed to ``code_error`` once more before stopping.

    Args:
        filename: data file handed to ``MAR.create``.
        query: '_'-separated query terms handed to ``BM25``.
        stop: name of the stopping rule (see above).
        stopat: target recall; converted with ``float``.
        error: error mode forwarded to ``code_error``.
        interval: accepted for interface compatibility; not used here.
        seed: seed for ``np.random``.

    Returns:
        The ``MAR`` object after the simulation.
    """
    stopat = float(stopat)
    # With thres == 0 the ``pos + neg < thres`` test below can never be
    # true, so only ``pos < starting`` selects the BM25 branch.
    thres = 0
    starting = 1
    counter = 0
    pos_last = 0
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename)
    # Rank documents against the query terms.
    # NOTE(review): the original author remarked that the BM25 scores are
    # kept in an unsorted list / dict view -- confirm against MAR.BM25.
    read.BM25(query.strip().split('_'))

    num2 = read.get_allpos()
    # ``target`` is used only by the fallback stopping rule (``stop`` not
    # one of 'est', 'soft', 'knee'); it does not gate when training starts.
    target = int(num2 * stopat)
    read.enable_est = (stop == 'est')

    def recheck_around_knee():
        """Re-code "no" items before and "yes" items after the knee point."""
        codes = np.array(read.body['code'])
        coded = np.where(codes != "undetermined")[0]
        # Order the coded items by coding time; index through a plain
        # array in both call sites for consistent positional lookup.
        seq = coded[np.argsort(np.array(read.body['time'])[coded])]
        cut = read.kneepoint * read.step
        part1 = set(seq[:cut]) & set(np.where(codes == "no")[0])
        part2 = set(seq[cut:]) & set(np.where(codes == "yes")[0])
        for doc_id in part1 | part2:
            read.code_error(doc_id, error=error)

    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except Exception:
            # Presumably est_num is not available yet; fall back to the
            # short form without swallowing KeyboardInterrupt.
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            # Everything has been coded.
            if stop == 'knee' and error == 'random':
                recheck_around_knee()
            break

        if pos < starting or pos + neg < thres:
            # No positive yet: code the BM25 candidates; ``error`` selects
            # the simulated labelling-error mode of code_error.
            for doc_id in read.BM25_get():
                read.code_error(doc_id, error=error)
            continue

        a, b, c, d = read.train(weighting=True, pne=True)
        if stop == 'est':
            if stopat * read.est_num <= pos:
                break
        elif stop == 'soft':
            if pos >= 10 and pos_last == pos:
                counter = counter + 1
            else:
                counter = 0
            pos_last = pos
            if counter >= 5:
                break
        elif stop == 'knee':
            if pos >= 10 and read.knee():
                if error == 'random':
                    recheck_around_knee()
                break
        else:
            if pos >= target:
                break

        for doc_id in (a if pos < 10 else c):
            read.code_error(doc_id, error=error)
    return read
def Boosting(filename, old_files=None, stop='', stopat=1, error='none',
             interval=100000, starting=1, seed=0, step=10):
    """Simulate a review driven by ``util.vote`` and ``query_boost``.

    After ``util.vote(read)``, each round codes the first ``read.step``
    items of ``query_boost()`` via ``code_error``.  The loop ends when
    every item is coded, or -- only when ``stop == 'est'`` -- when
    ``est_num * stopat <= pos``.

    Args:
        filename: data file handed to ``MAR.create``.
        old_files: previously reviewed files forwarded to ``MAR.create``.
        stop: 'est' enables ``read.enable_est``; anything else disables it.
        stopat: target recall; converted with ``float``.
        error: error mode forwarded to ``code_error``.
        interval: stored on ``read.interval``.
        starting: accepted for interface compatibility; not used here.
        seed: seed for ``np.random``.
        step: stored on ``read.step``.

    Returns:
        The ``MAR`` object after the simulation.
    """
    # Avoid a mutable default argument shared between calls.
    if old_files is None:
        old_files = []
    print("FILENAME: ", filename, "OLDFILES: ", len(old_files))
    stopat = float(stopat)
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename, old_files)
    read.step = step
    read.interval = interval
    util.vote(read)

    # Computed as before; the stopping rule below does not consult it.
    target = int(read.get_allpos() * stopat)
    read.enable_est = (stop == 'est')

    # Results of these two calls are not used here; they are kept because
    # the next line reads read.est_num and read.record.
    read.get_numbers()
    read.query_boost()
    read.record['est'][0] = read.est_num

    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except Exception:
            # Best-effort fallback without swallowing KeyboardInterrupt.
            print("%d, %d" % (pos, pos + neg))
        if pos + neg >= total:
            break
        if read.enable_est and read.est_num * stopat <= pos:
            break
        for doc_id in read.query_boost()[:read.step]:
            read.code_error(doc_id, error=error)
    return read
def Supervised(filename, old_files=None, stop='', stopat=1, error='none',
               interval=100000, starting=1, seed=0, step=10,
               learner='svm_linear', boost=None):
    """Simulate a supervised review, or run ``util.vote`` when boosting.

    If ``boost`` is truthy, ``util.vote`` is called with it as ``clf_name``
    and the function returns ``None`` immediately.  Otherwise
    ``train_supervised(learner, seed)`` is called (only when ``boost`` is
    ``None``) and each round passes the first ``read.step`` items of
    ``query_supervised()`` to ``code_batch`` until every item is coded.
    There is no early stopping rule in this variant.

    Args:
        filename: data file handed to ``MAR.create``.
        old_files: previously reviewed files forwarded to ``MAR.create``.
        stop: 'est' enables ``read.enable_est``; anything else disables it.
        stopat: target recall; converted with ``float``.
        error: accepted for interface compatibility; not used here.
        interval: stored on ``read.interval``.
        starting: accepted for interface compatibility; not used here.
        seed: seed for ``np.random``; also stored on ``read.seed``.
        step: stored on ``read.step``.
        learner: learner name forwarded to ``train_supervised``.
        boost: classifier name for ``util.vote``; ``None`` disables it.

    Returns:
        The ``MAR`` object after the simulation, or ``None`` when ``boost``
        is truthy.
    """
    # Avoid a mutable default argument shared between calls.
    if old_files is None:
        old_files = []
    print("FILENAME: ", filename, "OLDFILES: ", len(old_files))
    stopat = float(stopat)
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename, old_files)
    read.step = step
    read.interval = interval
    read.seed = seed

    if boost:
        util.vote(read, clf_name=boost, seed=seed, all=False,
                  temp=str(seed) + filename)
        return

    # From here on ``boost`` is falsy, so the former ``if boost:`` branches
    # that called query_boost() could never run and were dropped.

    # Computed as before; no stopping rule below consults it.
    target = int(read.get_allpos() * stopat)
    read.enable_est = (stop == 'est')

    if boost is None:
        read.train_supervised(learner, seed)
    # Results of these two calls are not used here; they are kept because
    # the next line reads read.est_num and read.record.
    read.get_numbers()
    read.query_supervised()
    read.record['est'][0] = read.est_num

    while True:
        pos, neg, total = read.get_numbers()
        if pos + neg >= total:
            break
        ids = read.query_supervised()[:read.step]
        read.code_batch(ids)
    return read
def START_ERROR(filename):
    """Run the automatic-stop simulation with simulated labelling errors.

    While no positive is known, items from ``random()`` are coded;
    afterwards the items proposed by ``train()``.  Each label is perturbed:
    with ``human_error = 0.2`` a true "no" is coded "yes" with probability
    ``human_error**2`` and a true "yes" is coded "no" with probability
    ``2 * (human_error - human_error**2)``.  Once at least 10 positives are
    known, the loop ends after 3 consecutive rounds with an unchanged
    positive count; ``export()`` is then called.

    Returns:
        The ``MAR`` object after the simulation.
    """
    read = MAR()
    read = read.create(filename)
    pos_last = 0
    full_life = 3
    human_error = 0.2
    life = full_life

    def simulated_label(true_label):
        """Return the label a fallible reviewer would assign."""
        if true_label == "no":
            return "yes" if random.random() < human_error ** 2 else "no"
        if true_label == "yes":
            flip = random.random() < 2 * (human_error - human_error ** 2)
            return "no" if flip else "yes"
        # Any other label is passed through unchanged; previously this case
        # reused the label of the preceding item (or raised on the first).
        return true_label

    while True:
        pos, neg, total = read.get_numbers()
        print("%d/ %d" % (pos, pos + neg))
        if pos >= 10:
            if pos == pos_last:
                life = life - 1
                if life == 0:
                    break
            else:
                life = full_life
        if pos == 0:
            batch = read.random()
        else:
            _, _, batch, _ = read.train()
        for doc_id in batch:
            read.code(doc_id, simulated_label(read.body["label"][doc_id]))
        pos_last = pos
    read.export()
    return read
from __future__ import print_function, division import sys, os root = os.getcwd().split("src")[0] + "src/src/util" sys.path.append(root) from mar import MAR from pdb import set_trace if __name__ == "__main__": data_path = "Hall.csv" target_recall = 0.95 thres = 10 query = "defect prediction" read = MAR() read = read.create(data_path) read.enable_est = True if query: read.BM25(query.split()) while True: pos, neg, total = read.get_numbers() try: print("%d, %d, %d" % (pos, pos + neg, read.est_num)) except: print("%d, %d" % (pos, pos + neg)) if pos + neg >= total: break if pos < 1: if query: ids, scores = read.BM25_get()