Beispiel #1
0
def analyze(pfile_pk, filename, agent_pk, model, db):
    """Label copyright statements in one pfile and record them in the DB.

    Parameters:
        pfile_pk -- primary key of the pfile to analyze (string or int).
        filename -- repository filename used to locate the file contents.
        agent_pk -- primary key of this agent, stored with each result row.
        model    -- trained model passed to library.label_file().
        db       -- database handle exposing access()/status()/errmsg().

    Returns 0 on success, -1 on a fatal error (non-numeric pk, missing
    file, or a failed "no copyright found" insert). Individual statement
    inserts that fail are reported but non-fatal.
    """
    pfile = -1
    try:
        pfile = int(pfile_pk)
    except ValueError:
        # BUG FIX: the original referenced an undefined name 'line' here,
        # which raised a NameError instead of printing the message.
        print >> sys.stdout, 'ERROR: Provided pfile_pk is not a number: %r' % pfile_pk
        return -1

    path = libfosspython.repMkPath('files', filename)
    if (not os.path.exists(path)):
        print >> sys.stdout, 'ERROR: File not found. path=%s' % (path)
        return -1
    text = open(path).read(READMAX)
    offsets = library.label_file(text, model)
    if len(offsets) == 0:
        # No statements found: insert an all-NULL row so the file is still
        # marked as scanned by this agent.
        sql = """INSERT INTO copyright (agent_fk, pfile_fk, copy_startbyte, copy_endbyte, content, hash, type)
                 VALUES (%d, %d, NULL, NULL, NULL, NULL, 'statement')""" % (agent_pk, pfile)
        result = db.access(sql)
        if result != 0:
            print >> sys.stdout, "ERROR: DB Access error, returned %d.\nERROR: DB STATUS: %s\nERROR: DB ERRMSG: %s\nERROR: sql=%sERROR: filename=%s" % (result, db.status(), db.errmsg(), sql, filename)
            return -1
    else:
        # Each offset is a (start_byte, end_byte, type) triple.
        for start, end, kind in offsets:
            # Renamed from 'str' to avoid shadowing the builtin.
            content = text[start:end]
            content = content.decode('ascii', 'ignore')     # make sure that it is using ascii encoding
            pd = library.parsetext(content)
            content = re.escape(' '.join([token[1] for token in pd]))
            sql = """INSERT INTO copyright (agent_fk, pfile_fk, copy_startbyte, copy_endbyte, content, hash, type)
                     VALUES (%d, %d, %d, %d, E'%s', E'%s', '%s')""" % (agent_pk, pfile, start, end,
                        content, hex(abs(hash(content))),
                        kind)
            result = db.access(sql)
            if result != 0:
                # Non-fatal: keep trying to insert the remaining statements.
                print >> sys.stdout, "ERROR: (nonfatal) DB Access error, returned %d.\nERROR: DB STATUS: %s\nERROR: DB ERRMSG: %s\nERROR: sql=%s\nERROR: filename=%s" % (result, db.status(), db.errmsg(), sql, filename)

    return 0
Beispiel #2
0
def crossvalidation(data_file, folds=10):
    """
    Performs cross validation on set of data and returns the results.

    This is a randomized algorithms so you should NOT use if to 
    regression testing.

    data_file should be a file with each line containing a triple quoted 
    canonical string representation, i.e. call repr on the string.
    The text to be hi-lighted should be wrapped in <s>...</s> tags.

    Outputs a dictionary that holds specific statistics about the performance
    of the classifier.
    """
    
    if folds <= 1:
        raise Exception("Number of folds is too small. A value greater than 1 is required.")

    training_data = [eval(line) for line in open(data_file).readlines()]
    N = len(training_data)
    if N < folds:
        raise Exception("Number of folds is greater than number of data points.")

    # The approximate number of data items in each fold
    n = int(round(N/float(folds)))

    # shuffle the training data so we dont have any funky correlation issues
    # with its ordering.
    random.shuffle(training_data)

    parsed_data = [library.parsetext(text) for text in training_data]
    tokens = [[parsed_data[i][j][0] for j in xrange(len(parsed_data[i]))] for i in xrange(N)]
    bio_data = [library.tokens_to_BIO(tokens[i]) for i in xrange(N)]

    fold_index = []
    for i in range(folds):
        fold_index.append(range(i*n,min([n+n*i,N])))

    accuracy = 0.0
    classes = ['B', 'I', 'O']
    matrix = {'B':{'B':0.0, 'I':0.0, 'O':0.0}, 'I':{'B':0.0, 'I':0.0, 'O':0.0}, 'O':{'B':0.0, 'I':0.0, 'O':0.0}}

    for i in range(folds):
        print "Fold %d." % i
        testing = fold_index[i]
        training = list(set(testing).symmetric_difference(set(range(N))))

        testing_data = [bio_data[d] for d in testing]
        training_data = [bio_data[d] for d in training]

        PFC = library.tuned_model(training_data)

        passed = 0
        for test in testing_data:
            tokens = test[0]
            labels = test[1]

            out = library.label_nb(PFC, tokens)

            if out == labels:
                passed += 1

            for l in range(len(labels)):
                matrix[labels[l]][out[l]] += 1.0

        accuracy += passed/float(len(testing))

    raw_accuracy = accuracy/float(folds)

    total = sum([sum([matrix[c1][c2] for c2 in classes]) for c1 in classes])
    for c1 in classes:
        s = sum([matrix[c1][c2] for c2 in classes])
        for c2 in matrix[c1]:
            matrix[c1][c2] = matrix[c1][c2]/s

    print "Raw Accuracy: %1.2f" % raw_accuracy
    print "------------------------"
    print "   |   B  |   I  |   O  "
    print "------------------------"
    print " B | %1.2f | %1.2f | %1.2f " % (matrix['B']['B'], matrix['B']['I'], matrix['B']['O'])
    print "------------------------"
    print " I | %1.2f | %1.2f | %1.2f " % (matrix['I']['B'], matrix['I']['I'], matrix['I']['O'])
    print "------------------------"
    print " O | %1.2f | %1.2f | %1.2f " % (matrix['O']['B'], matrix['O']['I'], matrix['O']['O'])
    print "------------------------"

    return {'raw accuracy':raw_accuracy, 'matrix':matrix}
Beispiel #3
0
def crossvalidation(data_file, folds=10):
    """
    Performs cross validation on set of data and returns the results.

    This is a randomized algorithms so you should NOT use if to 
    regression testing.

    data_file should be a file with each line containing a triple quoted 
    canonical string representation, i.e. call repr on the string.
    The text to be hi-lighted should be wrapped in <s>...</s> tags.

    Outputs a dictionary that holds specific statistics about the performance
    of the classifier.
    """

    if folds <= 1:
        raise Exception(
            "Number of folds is too small. A value greater than 1 is required."
        )

    training_data = [eval(line) for line in open(data_file).readlines()]
    N = len(training_data)
    if N < folds:
        raise Exception(
            "Number of folds is greater than number of data points.")

    # The approximate number of data items in each fold
    n = int(round(N / float(folds)))

    # shuffle the training data so we dont have any funky correlation issues
    # with its ordering.
    random.shuffle(training_data)

    parsed_data = [library.parsetext(text) for text in training_data]
    tokens = [[parsed_data[i][j][0] for j in xrange(len(parsed_data[i]))]
              for i in xrange(N)]
    bio_data = [library.tokens_to_BIO(tokens[i]) for i in xrange(N)]

    fold_index = []
    for i in range(folds):
        fold_index.append(range(i * n, min([n + n * i, N])))

    accuracy = 0.0
    classes = ['B', 'I', 'O']
    matrix = {
        'B': {
            'B': 0.0,
            'I': 0.0,
            'O': 0.0
        },
        'I': {
            'B': 0.0,
            'I': 0.0,
            'O': 0.0
        },
        'O': {
            'B': 0.0,
            'I': 0.0,
            'O': 0.0
        }
    }

    for i in range(folds):
        print "Fold %d." % i
        testing = fold_index[i]
        training = list(set(testing).symmetric_difference(set(range(N))))

        testing_data = [bio_data[d] for d in testing]
        training_data = [bio_data[d] for d in training]

        PFC = library.tuned_model(training_data)

        passed = 0
        for test in testing_data:
            tokens = test[0]
            labels = test[1]

            out = library.label_nb(PFC, tokens)

            if out == labels:
                passed += 1

            for l in range(len(labels)):
                matrix[labels[l]][out[l]] += 1.0

        accuracy += passed / float(len(testing))

    raw_accuracy = accuracy / float(folds)

    total = sum([sum([matrix[c1][c2] for c2 in classes]) for c1 in classes])
    for c1 in classes:
        s = sum([matrix[c1][c2] for c2 in classes])
        for c2 in matrix[c1]:
            matrix[c1][c2] = matrix[c1][c2] / s

    print "Raw Accuracy: %1.2f" % raw_accuracy
    print "------------------------"
    print "   |   B  |   I  |   O  "
    print "------------------------"
    print " B | %1.2f | %1.2f | %1.2f " % (matrix['B']['B'], matrix['B']['I'],
                                           matrix['B']['O'])
    print "------------------------"
    print " I | %1.2f | %1.2f | %1.2f " % (matrix['I']['B'], matrix['I']['I'],
                                           matrix['I']['O'])
    print "------------------------"
    print " O | %1.2f | %1.2f | %1.2f " % (matrix['O']['B'], matrix['O']['I'],
                                           matrix['O']['O'])
    print "------------------------"

    return {'raw accuracy': raw_accuracy, 'matrix': matrix}