Ejemplo n.º 1
0
def _recode_knee_errors(read, error):
    """Re-submit the labels that disagree with the knee split to ``read.code_error``.

    The coded documents (code != "undetermined") are ordered by their
    ``time`` value. The first ``read.kneepoint * read.step`` of them that are
    labelled "no", and the remaining ones that are labelled "yes", are passed
    to ``read.code_error`` again.
    """
    codes = np.array(read.body['code'])
    coded = np.where(codes != "undetermined")[0]
    # Convert the time column to an array before fancy-indexing it: a plain
    # list cannot be indexed with a NumPy index array.
    times = np.array(read.body['time'])
    seq = coded[np.argsort(times[coded])]
    split = read.kneepoint * read.step
    no_before_knee = set(seq[:split]) & set(np.where(codes == "no")[0])
    yes_after_knee = set(seq[split:]) & set(np.where(codes == "yes")[0])
    for doc_id in no_before_knee | yes_after_knee:
        read.code_error(doc_id, error=error)


def active_learning(filename,
                    query='',
                    stop='true',
                    stopat=0.95,
                    error='none',
                    interval=100000,
                    seed=0):
    """Run a simulated active-learning review loop over ``filename``.

    Args:
        filename: Passed to ``MAR.create`` to load the data set.
        query: Search terms joined by ``'_'``; stripped, split on ``'_'`` and
            handed to ``read.BM25``.
        stop: Stopping rule. ``'est'`` stops once ``pos`` reaches
            ``stopat * read.est_num``; ``'soft'`` stops after five
            consecutive rounds without a new positive (counted only once
            ``pos >= 10``); ``'knee'`` stops when ``read.knee()`` is truthy
            (checked only once ``pos >= 10``); any other value stops once
            ``pos`` reaches ``int(read.get_allpos() * stopat)``.
        stopat: Fraction used by the ``'est'`` and default stopping rules;
            converted with ``float``.
        error: Forwarded to ``read.code_error``. With ``stop='knee'`` the
            value ``'random'`` additionally triggers a re-check of the labels
            that disagree with the knee split.
        interval: Assigned to ``read.interval``.
        seed: Seed for ``np.random.seed``.

    Returns:
        The ``MAR`` object after the loop has finished.
    """
    stopat = float(stopat)
    thres = 0  # minimum number of coded documents before training
    starting = 1  # minimum number of positives before training
    counter = 0  # consecutive rounds without a new positive ('soft' rule)
    pos_last = 0
    np.random.seed(seed)
    read = MAR()
    read = read.create(filename)
    read.interval = interval

    read.BM25(query.strip().split('_'))

    num2 = read.get_allpos()
    target = int(num2 * stopat)
    print("number of target, true/close here:", target)

    read.enable_est = stop == 'est'

    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except (AttributeError, TypeError, ValueError):
            # No usable estimate to print; fall back to the two counts.
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            # Every document has been coded; nothing left to query.
            if stop == 'knee' and error == 'random':
                _recode_knee_errors(read, error)
            break

        if pos < starting or pos + neg < thres:
            # Not enough labels to train yet: code the BM25 candidates.
            for doc_id in read.BM25_get():
                read.code_error(doc_id, error=error)
            continue

        # Only the first and third values returned by train() are used.
        # NOTE(review): presumably uncertainty- vs. certainty-sampled ids --
        # confirm against MAR.train.
        early_batch, _, late_batch, _ = read.train(weighting=True, pne=True)

        if stop == 'est':
            if stopat * read.est_num <= pos:
                break
        elif stop == 'soft':
            if pos >= 10 and pos_last == pos:
                counter += 1
            else:
                counter = 0
            pos_last = pos
            if counter >= 5:
                break
        elif stop == 'knee':
            if pos >= 10 and read.knee():
                if error == 'random':
                    _recode_knee_errors(read, error)
                break
        elif pos >= target:
            break

        batch = early_batch if pos < 10 else late_batch
        for doc_id in batch:
            read.code_error(doc_id, error=error)
    return read
Ejemplo n.º 2
0
def _recode_knee_errors(read, error):
    """Re-submit the labels that disagree with the knee split to ``read.code_error``.

    The coded documents (code != "undetermined") are ordered by their
    ``time`` value. The first ``read.kneepoint * read.step`` of them that are
    labelled "no", and the remaining ones that are labelled "yes", are passed
    to ``read.code_error`` again.
    """
    codes = np.array(read.body['code'])
    coded = np.where(codes != "undetermined")[0]
    # Convert the time column to an array before fancy-indexing it: a plain
    # list cannot be indexed with a NumPy index array.
    times = np.array(read.body['time'])
    seq = coded[np.argsort(times[coded])]
    split = read.kneepoint * read.step
    no_before_knee = set(seq[:split]) & set(np.where(codes == "no")[0])
    yes_after_knee = set(seq[split:]) & set(np.where(codes == "yes")[0])
    for doc_id in no_before_knee | yes_after_knee:
        read.code_error(doc_id, error=error)


def active_learning(filename,
                    query='',
                    stop='true',
                    stopat=1.00,
                    error='none',
                    interval=100000,
                    seed=0):
    """Run a simulated active-learning review loop over ``filename``.

    Args:
        filename: Passed to ``MAR.create`` to load the data set.
        query: Search terms joined by ``'_'``; stripped, split on ``'_'`` and
            handed to ``read.BM25``.
        stop: Stopping rule. ``'est'`` stops once ``pos`` reaches
            ``stopat * read.est_num``; ``'soft'`` stops after five
            consecutive rounds without a new positive (counted only once
            ``pos >= 10``); ``'knee'`` stops when ``read.knee()`` is truthy
            (checked only once ``pos >= 10``); any other value stops once
            ``pos`` reaches ``int(read.get_allpos() * stopat)``.
        stopat: Fraction used by the ``'est'`` and default stopping rules;
            converted with ``float``.
        error: Forwarded to ``read.code_error``. With ``stop='knee'`` the
            value ``'random'`` additionally triggers a re-check of the labels
            that disagree with the knee split.
        interval: Accepted for interface compatibility; not used in this
            function.
        seed: Seed for ``np.random.seed``.

    Returns:
        The ``MAR`` object after the loop has finished.
    """
    stopat = float(stopat)
    thres = 0  # minimum number of coded documents before training
    starting = 1  # minimum number of positives before training
    counter = 0  # consecutive rounds without a new positive ('soft' rule)
    pos_last = 0
    np.random.seed(seed)

    read = MAR()
    read = read.create(filename)
    # Prime the BM25 ranking with the query terms; BM25_get() below draws the
    # initial candidates from it.
    read.BM25(query.strip().split('_'))

    # `target` is the stopping threshold of the default rule: the loop ends
    # once `pos` reaches this share of read.get_allpos().
    num2 = read.get_allpos()
    target = int(num2 * stopat)

    # Estimation is only switched on for the 'est' stopping rule.
    read.enable_est = stop == 'est'

    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except (AttributeError, TypeError, ValueError):
            # No usable estimate to print; fall back to the two counts.
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            # Every document has been coded; nothing left to query.
            if stop == 'knee' and error == 'random':
                _recode_knee_errors(read, error)
            break

        # With thres == 0 the second condition can never be true, so only the
        # positive count decides whether training starts.
        if pos < starting or pos + neg < thres:
            # Not enough labels to train yet: code the BM25 candidates.
            # `error` selects the simulated labelling error ('none' by default).
            for doc_id in read.BM25_get():
                read.code_error(doc_id, error=error)
            continue

        # Only the first and third values returned by train() are used.
        # NOTE(review): presumably uncertainty- vs. certainty-sampled ids --
        # confirm against MAR.train.
        early_batch, _, late_batch, _ = read.train(weighting=True, pne=True)

        if stop == 'est':
            if stopat * read.est_num <= pos:
                break
        elif stop == 'soft':
            if pos >= 10 and pos_last == pos:
                counter += 1
            else:
                counter = 0
            pos_last = pos
            if counter >= 5:
                break
        elif stop == 'knee':
            if pos >= 10 and read.knee():
                if error == 'random':
                    _recode_knee_errors(read, error)
                break
        elif pos >= target:
            break

        batch = early_batch if pos < 10 else late_batch
        for doc_id in batch:
            read.code_error(doc_id, error=error)
    return read
Ejemplo n.º 3
0
# Make the project's util directory importable: keep the part of the current
# working directory that precedes the first "src" and append "src/src/util".
# NOTE(review): relies on `os` and `sys` being imported earlier in the file
# (not visible in this excerpt) and on the process being started from inside
# the source tree -- confirm.
root = os.getcwd().split("src")[0] + "src/src/util"
sys.path.append(root)
# These imports must stay below the sys.path tweak so that `mar` resolves.
from mar import MAR
from pdb import set_trace

if __name__ == "__main__":
    data_path = "Hall.csv"
    target_recall = 0.95
    thres = 10
    query = "defect prediction"
    read = MAR()
    read = read.create(data_path)
    read.enable_est = True
    if query:
        read.BM25(query.split())
    while True:
        pos, neg, total = read.get_numbers()
        try:
            print("%d, %d, %d" % (pos, pos + neg, read.est_num))
        except:
            print("%d, %d" % (pos, pos + neg))

        if pos + neg >= total:
            break

        if pos < 1:
            if query:
                ids, scores = read.BM25_get()
                for id in ids:
                    read.code(id, read.body["label"][id])