def START_est(filename):
    """Simulate a review of ``filename`` until 90% of ``get_allpos()`` is found.

    A ``MAR`` is created and ``restart()`` is called on it, then a second
    ``MAR`` is created for the actual run.  While no positive has been found
    or fewer than 40 documents are coded, batches come from ``random()``;
    afterwards they come from the third item returned by ``train(pne=True)``.
    ``cache_est()`` is called at most once: the first time a training round
    starts with at least 60 positives.  Every selected document is coded with
    its entry in ``body["label"]``.

    Returns the ``MAR`` object used for the run.
    """
    recall_goal = 0.90
    min_coded = 40
    est_pending = True

    # Create + restart first, then build the object that is actually used.
    MAR().create(filename).restart()
    mar = MAR().create(filename)

    goal = int(mar.get_allpos() * recall_goal)
    while True:
        pos, neg, _ = mar.get_numbers()
        if pos >= goal:
            break

        if pos != 0 and pos + neg >= min_coded:
            _, _, batch, _ = mar.train(pne=True)
            if est_pending and pos >= 60:
                mar.cache_est()
                est_pending = False
        else:
            batch = mar.random()

        for doc_id in batch:
            mar.code(doc_id, mar.body["label"][doc_id])
    return mar
def Supervised(filename, old_files=None, stop='', stopat=1, error='none',
               interval=100000, starting=1, seed=0, step=10,
               learner='svm_linear', boost=None):
    """Run a supervised (or boosted) simulation on ``filename``.

    NOTE(review): a later ``def Supervised`` in this file rebinds the name, so
    this definition appears to be shadowed once the module has loaded.

    Args:
        filename: passed to ``MAR.create`` (and used in the ``temp`` name
            handed to ``util.vote``).
        old_files: passed to ``MAR.create``; ``None`` means an empty list.
        stop: ``'est'`` sets ``read.enable_est`` to True, anything else False.
        stopat: converted to ``float``; only used to compute ``target``.
        error: accepted for signature compatibility; not used here.
        interval: stored on ``read.interval``.
        starting: accepted for signature compatibility; not used here.
        seed: seeds ``np.random`` and is stored on ``read.seed``.
        step: stored on ``read.step``; size of each coded batch.
        learner: forwarded to ``read.train_supervised``.
        boost: when truthy, ``util.vote`` is called with it as ``clf_name``
            and the function returns immediately.

    Returns:
        ``None`` when ``boost`` is truthy (early return after ``util.vote``),
        otherwise the ``MAR`` object after every document has been coded.
    """
    # A fresh list per call: a shared ``[]`` default would leak state between
    # calls if anything downstream mutated it.
    if old_files is None:
        old_files = []

    print("FILENAME: ", filename, "OLDFILES: ", len(old_files))
    stopat = float(stopat)
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename, old_files)
    read.step = step
    read.interval = interval
    read.seed = seed

    if boost:
        util.vote(read, clf_name=boost, seed=seed, all=False,
                  temp=str(seed) + filename)
        # NOTE(review): bare return -> callers get None on the boost path, and
        # the ``if boost`` branches below cannot run while this return stays.
        return

    num2 = read.get_allpos()
    # NOTE(review): ``target`` is only referenced by a disabled stopping rule.
    target = int(num2 * stopat)
    read.enable_est = (stop == 'est')

    if boost is None:
        read.train_supervised(learner, seed)

    pos, neg, total = read.get_numbers()

    if boost:
        read.query_boost()
    else:
        read.query_supervised()
    read.record['est'][0] = read.est_num

    while True:
        pos, neg, total = read.get_numbers()
        if pos + neg >= total:
            break

        if boost:
            ids = read.query_boost()[:read.step]
        else:
            ids = read.query_supervised()[:read.step]
        read.code_batch(ids)
    return read
def TEST_AL(filename, old_files=None, stop='est', stopat=1, error='none',
            interval=100000, starting=1, seed=0, step=10):
    """Run an active-learning simulation on ``filename`` with coding errors.

    Args:
        filename: passed to ``MAR.create``.
        old_files: passed to ``MAR.create``; ``None`` means an empty list.
        stop: ``'est'`` sets ``read.enable_est`` to True, anything else False.
        stopat: converted to ``float``; scales both ``get_allpos()`` (to get
            the target) and ``read.est_num`` in the stopping rule.
        error: forwarded to ``read.code_error``.
        interval: stored on ``read.interval``.
        starting: while ``pos`` is below this value, documents are drawn
            with ``read.random()`` instead of training.
        seed: seeds ``np.random``.
        step: stored on ``read.step``.

    Returns:
        The ``MAR`` object once everything is coded or the stopping rule fires.
    """
    # A fresh list per call instead of a shared mutable default.
    if old_files is None:
        old_files = []

    stopat = float(stopat)
    thres = 0
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename, old_files)
    read.step = step
    read.interval = interval

    num2 = read.get_allpos()
    target = int(num2 * stopat)
    read.enable_est = (stop == 'est')

    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except Exception:
            # Best-effort progress line when est_num is missing or cannot be
            # formatted; KeyboardInterrupt/SystemExit still propagate.
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            break

        if pos < starting or pos + neg < thres:
            for doc_id in read.random():
                read.code_error(doc_id, error=error)
        else:
            a, b, c, d = read.train(weighting=True, pne=True)
            # Stop once the target is met and the scaled estimate is covered.
            if pos >= target and read.est_num * stopat <= pos:
                break
            for doc_id in c:
                read.code_error(doc_id, error=error)
    return read
def LOC(filename):
    """Code documents in ``loc_sort()`` order until all positives are found.

    Each round prints ``"<pos>, <pos + neg>"``, stops once ``pos`` reaches
    ``int(get_allpos() * 1)``, and otherwise codes every id returned by
    ``loc_sort()`` with its entry in ``body["label"]``.

    Returns the ``MAR`` object used for the run.
    """
    full_recall = 1
    mar = MAR().create(filename)
    goal = int(mar.get_allpos() * full_recall)

    def report_progress():
        # Refresh the counts, print them, and hand back the positive count.
        pos, neg, _ = mar.get_numbers()
        print("%d, %d" % (pos, pos + neg))
        return pos

    while report_progress() < goal:
        for doc_id in mar.loc_sort():
            mar.code(doc_id, mar.body["label"][doc_id])
    return mar
def REUSE_RANDOM(filename, old):
    """Simulate a review driven by ``train_reuse_random()`` up to 90% recall.

    ``old`` is handed to ``create_old``.  Each round takes the third item
    returned by ``train_reuse_random()`` and codes those documents with their
    entries in ``body["label"]``, until ``pos`` reaches
    ``int(get_allpos() * 0.9)``.

    Returns the ``MAR`` object used for the run.
    """
    recall_goal = 0.9

    mar = MAR().create(filename)
    mar.create_old(old)
    goal = int(mar.get_allpos() * recall_goal)

    while True:
        pos, _neg, _total = mar.get_numbers()
        if pos >= goal:
            return mar

        _, _, batch, _ = mar.train_reuse_random()
        for doc_id in batch:
            mar.code(doc_id, mar.body["label"][doc_id])
def UPDATE(filename, old, pne=False, cl="RF"):
    """Simulate a review with ``train(pne=pne, cl=cl)`` until all positives are found.

    ``old`` is handed to ``create_old``.  Each round prints
    ``"<pos>/ <pos + neg>"``, stops once ``pos`` reaches
    ``int(get_allpos() * 1)``, and otherwise codes the third item returned by
    ``train`` using the entries in ``body["label"]``.

    Returns the ``MAR`` object used for the run.
    """
    full_recall = 1

    mar = MAR().create(filename)
    mar.create_old(old)
    goal = int(mar.get_allpos() * full_recall)

    finished = False
    while not finished:
        pos, neg, _ = mar.get_numbers()
        print("%d/ %d" % (pos, pos + neg))
        finished = pos >= goal
        if finished:
            continue

        _, _, batch, _ = mar.train(pne=pne, cl=cl)
        for doc_id in batch:
            mar.code(doc_id, mar.body["label"][doc_id])
    return mar
def Supervised(filename, old_files=None, stop='est', stopat=1, error='none',
               interval=100000, starting=1, seed=0, step=10):
    """Run a supervised simulation on ``filename`` with coding errors.

    NOTE(review): an earlier ``def Supervised`` with a different signature
    exists in this file; this later definition rebinds the name.

    Args:
        filename: passed to ``MAR.create``.
        old_files: passed to ``MAR.create``; ``None`` means an empty list.
        stop: ``'est'`` sets ``read.enable_est`` to True, anything else False.
        stopat: converted to ``float``; scales both ``get_allpos()`` (to get
            the target) and ``read.est_num`` in the stopping rule.
        error: forwarded to ``read.code_error``.
        interval: stored on ``read.interval``.
        starting: accepted for signature compatibility; not used here.
        seed: seeds ``np.random``.
        step: stored on ``read.step``; number of queried documents per round.

    Returns:
        The ``MAR`` object once everything is coded or the stopping rule fires.
    """
    # A fresh list per call instead of a shared mutable default.
    if old_files is None:
        old_files = []

    stopat = float(stopat)
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename, old_files)
    read.step = step
    read.interval = interval

    num2 = read.get_allpos()
    target = int(num2 * stopat)
    read.enable_est = (stop == 'est')

    read.train_supervised()
    pos, neg, total = read.get_numbers()

    read.query_supervised()
    read.record['est'][0] = read.est_num

    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except Exception:
            # Best-effort progress line when est_num is missing or cannot be
            # formatted; KeyboardInterrupt/SystemExit still propagate.
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            break
        # Stop once the target is met and the scaled estimate is covered.
        if pos >= target and read.est_num * stopat <= pos:
            break

        for doc_id in read.query_supervised()[:read.step]:
            read.code_error(doc_id, error=error)
    return read
def Codes(filename, code):
    """Simulate a review of ``filename`` configured by the letters in ``code``.

    Letters read from ``code``:
        "P"        -- use random sampling until 5 positives (otherwise 1).
        "W" or "M" -- pass ``weighting=True`` to ``train``.
        "U"        -- while ``pos < 30`` code the first item returned by ``train``.
        "S"        -- walk the fifth item returned by ``train`` in slices of
                      ``read.step`` until the target is reached.
        neither "A" nor "M" -- set ``enough`` to 100000 on the ``MAR`` object.

    The run stops once ``pos`` reaches ``int(get_allpos() * 0.95)``; documents
    are coded with their entries in ``body["label"]``.

    Returns the ``MAR`` object used for the run.
    """
    recall_goal = 0.95
    min_coded = 0
    min_pos = 5 if "P" in code else 1

    use_weighting = any(flag in code for flag in ("W", "M"))
    use_uncertain = "U" in code
    use_stopping = "S" in code

    # Create + restart first, then build the object that is actually used.
    MAR().create(filename).restart()
    mar = MAR().create(filename)
    if "A" not in code and "M" not in code:
        mar.enough = 100000

    def label(doc_id):
        # Code one document with its stored label.
        mar.code(doc_id, mar.body["label"][doc_id])

    goal = int(mar.get_allpos() * recall_goal)
    while True:
        pos, neg, _ = mar.get_numbers()
        if pos >= goal:
            break

        if pos < min_pos or pos + neg < min_coded:
            for doc_id in mar.random():
                label(doc_id)
            continue

        first, _, third, _, fifth = mar.train(weighting=use_weighting)
        if use_uncertain and pos < 30:
            for doc_id in first:
                label(doc_id)
        elif use_stopping:
            offset = 0
            while pos < goal:
                for doc_id in fifth[offset:offset + mar.step]:
                    label(doc_id)
                pos, neg, _ = mar.get_numbers()
                offset += mar.step
        else:
            for doc_id in third:
                label(doc_id)
    return mar
def START_LOC(filename, cl="SVM-linear"):
    """Start from ``loc_sort()`` order, then switch to ``train(cl=cl)``.

    Each round prints ``"<pos>, <pos + neg>"`` and stops once ``pos`` reaches
    ``int(get_allpos() * 1)``.  While no positive has been found or fewer
    than 40 documents are coded, ids come from ``loc_sort()``; afterwards
    they are the third item returned by ``train(cl=cl)``.  Documents are
    coded with their entries in ``body["label"]``.

    Returns the ``MAR`` object used for the run.
    """
    full_recall = 1
    warmup_size = 40

    mar = MAR().create(filename)
    goal = int(mar.get_allpos() * full_recall)

    while True:
        pos, neg, _ = mar.get_numbers()
        print("%d, %d" % (pos, pos + neg))
        if pos >= goal:
            return mar

        warming_up = pos == 0 or pos + neg < warmup_size
        if warming_up:
            batch = mar.loc_sort()
        else:
            _, _, batch, _ = mar.train(cl=cl)

        for doc_id in batch:
            mar.code(doc_id, mar.body["label"][doc_id])
def Boosting(filename, old_files=None, stop='', stopat=1, error='none',
             interval=100000, starting=1, seed=0, step=10):
    """Run a boosting-based simulation on ``filename`` with coding errors.

    Args:
        filename: passed to ``MAR.create``.
        old_files: passed to ``MAR.create``; ``None`` means an empty list.
        stop: ``'est'`` sets ``read.enable_est`` to True, anything else False.
        stopat: converted to ``float``; scales ``read.est_num`` in the
            stopping rule.
        error: forwarded to ``read.code_error``.
        interval: stored on ``read.interval``.
        starting: accepted for signature compatibility; not used here.
        seed: seeds ``np.random``.
        step: stored on ``read.step``; number of queried documents per round.

    Returns:
        The ``MAR`` object once everything is coded or the stopping rule fires.
    """
    # A fresh list per call instead of a shared mutable default.
    if old_files is None:
        old_files = []

    print("FILENAME: ", filename, "OLDFILES: ", len(old_files))
    stopat = float(stopat)
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename, old_files)
    read.step = step
    read.interval = interval

    util.vote(read)

    num2 = read.get_allpos()
    # NOTE(review): ``target`` is computed but the stopping rule below does
    # not consult it.
    target = int(num2 * stopat)
    read.enable_est = (stop == 'est')

    pos, neg, total = read.get_numbers()

    read.query_boost()
    read.record['est'][0] = read.est_num

    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except Exception:
            # Best-effort progress line when est_num is missing or cannot be
            # formatted; KeyboardInterrupt/SystemExit still propagate.
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            break
        if read.enable_est and read.est_num * stopat <= pos:
            break

        for doc_id in read.query_boost()[:read.step]:
            read.code_error(doc_id, error=error)
    return read
def TIME_START(filename):
    """Simulate a review using ``train_kept()`` up to 90% recall.

    Until the first positive is found, ids come from ``random()``; after
    that they are the third item returned by ``train_kept()``.  Documents are
    coded with their entries in ``body["label"]`` until ``pos`` reaches
    ``int(get_allpos() * 0.9)``.

    Returns the ``MAR`` object used for the run.
    """
    recall_goal = 0.9

    mar = MAR().create(filename)
    goal = int(mar.get_allpos() * recall_goal)

    while True:
        pos, _neg, _total = mar.get_numbers()
        if pos >= goal:
            break

        if pos != 0:
            _, _, batch, _ = mar.train_kept()
        else:
            batch = mar.random()

        for doc_id in batch:
            mar.code(doc_id, mar.body["label"][doc_id])
    return mar
def START_DOC2VEC(filename):
    """Simulate a review with ``train(weighting=True)`` up to 95% recall.

    A ``MAR`` is created and ``restart()`` is called on it, then a second
    ``MAR`` is created for the actual run.  Each round prints
    ``"<pos>, <pos + neg>"``.  While no positive has been found or fewer than
    40 documents are coded, ids come from ``random()``; afterwards they are
    the third of the five items returned by ``train(weighting=True)``.
    Documents are coded with their entries in ``body["label"]``.

    Returns the ``MAR`` object used for the run.
    """
    recall_goal = 0.95
    warmup_size = 40

    # Create + restart first, then build the object that is actually used.
    MAR().create(filename).restart()
    mar = MAR().create(filename)
    goal = int(mar.get_allpos() * recall_goal)

    while True:
        pos, neg, _ = mar.get_numbers()
        print("%d, %d" % (pos, pos + neg))
        if pos >= goal:
            break

        if pos != 0 and pos + neg >= warmup_size:
            _, _, batch, _, _ = mar.train(weighting=True)
        else:
            batch = mar.random()

        for doc_id in batch:
            mar.code(doc_id, mar.body["label"][doc_id])
    return mar
def REUSE(filename, old, pne=True):
    """Simulate a review that switches from ``train`` to ``train_reuse``.

    ``old`` is handed to ``create_old``.  Each round prints
    ``"<pos>/ <pos + neg>"`` and stops once ``pos`` reaches
    ``int(get_allpos() * 0.9)``.  With fewer than 5 positives the batch comes
    from ``train(pne)``, otherwise from ``train_reuse(pne)`` (third returned
    item in both cases).  Documents are coded with their entries in
    ``body["label"]``.

    Returns the ``MAR`` object used for the run.
    """
    recall_goal = 0.9
    reuse_from = 5

    mar = MAR().create(filename)
    mar.create_old(old)
    goal = int(mar.get_allpos() * recall_goal)

    while True:
        pos, neg, _ = mar.get_numbers()
        print("%d/ %d" % (pos, pos + neg))
        if pos >= goal:
            break

        # Same call shape either way; only the trainer differs.
        trainer = mar.train if pos < reuse_from else mar.train_reuse
        _, _, batch, _ = trainer(pne)
        for doc_id in batch:
            mar.code(doc_id, mar.body["label"][doc_id])
    return mar
def UPDATE_REUSE(filename, old):
    """Simulate a review that falls back to ``train_reuse`` when progress stalls.

    ``old`` is handed to ``create_old``.  A counter is reset to ``lifes``
    whenever ``pos`` changed since the previous round and decremented
    otherwise.  Once ``pos >= 5`` and the counter is below 1, the batch comes
    from ``train_reuse()`` and ``lifes`` is set to 0; in every other round it
    comes from ``train()`` (third returned item in both cases).  The run
    stops when ``pos`` reaches ``int(get_allpos() * 0.9)``; documents are
    coded with their entries in ``body["label"]``.

    Returns the ``MAR`` object used for the run.
    """
    recall_goal = 0.9
    lifes = 2
    remaining = lifes
    previous_pos = 0
    reuse_from = 5

    mar = MAR().create(filename)
    mar.create_old(old)
    goal = int(mar.get_allpos() * recall_goal)

    while True:
        pos, _neg, _total = mar.get_numbers()

        remaining = lifes if pos != previous_pos else remaining - 1
        previous_pos = pos

        if pos >= goal:
            break

        if pos >= reuse_from and remaining < 1:
            lifes = 0
            outcome = mar.train_reuse()
        else:
            outcome = mar.train()

        _, _, batch, _ = outcome
        for doc_id in batch:
            mar.code(doc_id, mar.body["label"][doc_id])
    return mar
def _recode_knee_errors(read, error):
    """Re-code documents on either side of the knee via ``read.code_error``.

    Coded documents (code != "undetermined") are ordered by
    ``read.body['time']``.  Among the first ``read.kneepoint * read.step`` of
    them, those coded "no" are re-coded; among the rest, those coded "yes".
    """
    codes = np.array(read.body['code'])
    coded = np.where(codes != "undetermined")[0]
    # Wrap in np.array so index-array selection works even for a plain list.
    seq = coded[np.argsort(np.array(read.body['time'])[coded])]
    cut = read.kneepoint * read.step
    part1 = set(seq[:cut]) & set(np.where(codes == "no")[0])
    part2 = set(seq[cut:]) & set(np.where(codes == "yes")[0])
    for doc_id in part1 | part2:
        read.code_error(doc_id, error=error)


def active_learning(filename, query='', stop='true', stopat=1.00,
                    error='none', interval=100000, seed=0):
    """Run an active-learning simulation on ``filename``.

    NOTE(review): a second ``def active_learning`` further down this file
    rebinds the name.

    Args:
        filename: passed to ``MAR.create``.
        query: stripped, split on ``'_'`` and passed to ``read.BM25``.
        stop: stopping rule -- ``'est'`` (stop when
            ``stopat * read.est_num <= pos``), ``'soft'`` (stop after 5
            consecutive rounds with ``pos >= 10`` and no new positive),
            ``'knee'`` (stop when ``pos >= 10`` and ``read.knee()`` is
            truthy); any other value stops once ``pos`` reaches
            ``int(get_allpos() * stopat)``.
        stopat: converted to ``float``; see ``stop``.
        error: forwarded to ``read.code_error``; with ``stop='knee'`` the
            value ``'random'`` additionally triggers the knee re-coding pass
            before stopping.
        interval: accepted for signature compatibility; not used here.
        seed: seeds ``np.random``.

    Returns:
        The ``MAR`` object once everything is coded or a stopping rule fires.
    """
    stopat = float(stopat)
    thres = 0
    starting = 1
    counter = 0
    pos_last = 0
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename)
    read.BM25(query.strip().split('_'))

    num2 = read.get_allpos()
    target = int(num2 * stopat)
    read.enable_est = (stop == 'est')

    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except Exception:
            # Best-effort progress line when est_num is missing or cannot be
            # formatted; KeyboardInterrupt/SystemExit still propagate.
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            # Everything is coded; apply the knee re-coding pass if requested.
            if stop == 'knee' and error == 'random':
                _recode_knee_errors(read, error)
            break

        if pos < starting or pos + neg < thres:
            for doc_id in read.BM25_get():
                read.code_error(doc_id, error=error)
        else:
            a, b, c, d = read.train(weighting=True, pne=True)
            if stop == 'est':
                if stopat * read.est_num <= pos:
                    break
            elif stop == 'soft':
                if pos >= 10 and pos_last == pos:
                    counter = counter + 1
                else:
                    counter = 0
                pos_last = pos
                if counter >= 5:
                    break
            elif stop == 'knee':
                if pos >= 10 and read.knee():
                    if error == 'random':
                        _recode_knee_errors(read, error)
                    break
            else:
                if pos >= target:
                    break

            # Below 10 positives code the first returned batch, otherwise
            # the third.
            batch = a if pos < 10 else c
            for doc_id in batch:
                read.code_error(doc_id, error=error)
    return read
def _recode_knee_errors(read, error):
    """Re-code documents on either side of the knee via ``read.code_error``.

    Coded documents (code != "undetermined") are ordered by
    ``read.body['time']``.  Among the first ``read.kneepoint * read.step`` of
    them, those coded "no" are re-coded; among the rest, those coded "yes".
    """
    codes = np.array(read.body['code'])
    coded = np.where(codes != "undetermined")[0]
    # Wrap in np.array so index-array selection works even for a plain list.
    seq = coded[np.argsort(np.array(read.body['time'])[coded])]
    cut = read.kneepoint * read.step
    part1 = set(seq[:cut]) & set(np.where(codes == "no")[0])
    part2 = set(seq[cut:]) & set(np.where(codes == "yes")[0])
    for doc_id in part1 | part2:
        read.code_error(doc_id, error=error)


def active_learning(filename, query='', stop='true', stopat=0.95,
                    error='none', interval=100000, seed=0):
    """Run an active-learning simulation on ``filename``.

    NOTE(review): an earlier ``def active_learning`` exists in this file;
    this later definition rebinds the name.

    Args:
        filename: passed to ``MAR.create``.
        query: stripped, split on ``'_'`` and passed to ``read.BM25``.
        stop: stopping rule -- ``'est'`` (stop when
            ``stopat * read.est_num <= pos``), ``'soft'`` (stop after 5
            consecutive rounds with ``pos >= 10`` and no new positive),
            ``'knee'`` (stop when ``pos >= 10`` and ``read.knee()`` is
            truthy); any other value stops once ``pos`` reaches
            ``int(get_allpos() * stopat)``.
        stopat: converted to ``float``; see ``stop``.
        error: forwarded to ``read.code_error``; with ``stop='knee'`` the
            value ``'random'`` additionally triggers the knee re-coding pass
            before stopping.
        interval: stored on ``read.interval``.
        seed: seeds ``np.random``.

    Returns:
        The ``MAR`` object once everything is coded or a stopping rule fires.
    """
    stopat = float(stopat)
    thres = 0
    starting = 1
    counter = 0
    pos_last = 0
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename)
    read.interval = interval
    read.BM25(query.strip().split('_'))

    num2 = read.get_allpos()
    target = int(num2 * stopat)
    print("number of target, true/close here:", target)
    read.enable_est = (stop == 'est')

    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except Exception:
            # Best-effort progress line when est_num is missing or cannot be
            # formatted; KeyboardInterrupt/SystemExit still propagate.
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            # Everything is coded; apply the knee re-coding pass if requested.
            if stop == 'knee' and error == 'random':
                _recode_knee_errors(read, error)
            break

        if pos < starting or pos + neg < thres:
            for doc_id in read.BM25_get():
                read.code_error(doc_id, error=error)
        else:
            a, b, c, d = read.train(weighting=True, pne=True)
            if stop == 'est':
                if stopat * read.est_num <= pos:
                    break
            elif stop == 'soft':
                if pos >= 10 and pos_last == pos:
                    counter = counter + 1
                else:
                    counter = 0
                pos_last = pos
                if counter >= 5:
                    break
            elif stop == 'knee':
                if pos >= 10 and read.knee():
                    if error == 'random':
                        _recode_knee_errors(read, error)
                    break
            else:
                if pos >= target:
                    break

            # Below 10 positives code the first returned batch, otherwise
            # the third.
            batch = a if pos < 10 else c
            for doc_id in batch:
                read.code_error(doc_id, error=error)

    # The leftover set_trace() debugger breakpoint that used to sit here was
    # removed so the simulation returns without dropping into a debugger.
    return read