def analyze(pfile_pk, filename, agent_pk, model, db):
    """Run the copyright classifier over one repository file and store hits.

    pfile_pk -- primary key of the pfile row (must be parseable as int)
    filename -- repository file name, resolved via libfosspython.repMkPath
    agent_pk -- primary key of this agent, recorded with every inserted row
    model    -- classifier model passed through to library.label_file
    db       -- database handle exposing access()/status()/errmsg()

    Returns 0 on success, -1 on a fatal error (non-numeric key, missing
    file, or a failed INSERT of the "no matches" marker row).  Per-match
    INSERT failures are reported but non-fatal.
    """
    try:
        pfile = int(pfile_pk)
    except ValueError:
        # BUG FIX: the original message interpolated an undefined name
        # `line`, so this branch raised NameError instead of reporting.
        sys.stdout.write('ERROR: Provided pfile_pk is not a number: %r\n' % (pfile_pk,))
        return -1

    path = libfosspython.repMkPath('files', filename)
    if not os.path.exists(path):
        sys.stdout.write('ERROR: File not found. path=%s\n' % (path,))
        return -1

    # Read at most READMAX bytes and close the handle promptly (the
    # original leaked the file object).
    f = open(path)
    try:
        text = f.read(READMAX)
    finally:
        f.close()

    offsets = library.label_file(text, model)
    if not offsets:
        # No copyright statements found: insert a NULL marker row so this
        # pfile is still recorded as scanned.
        sql = """INSERT INTO copyright (agent_fk, pfile_fk, copy_startbyte, copy_endbyte, content, hash, type) VALUES (%d, %d, NULL, NULL, NULL, NULL, 'statement')""" % (agent_pk, pfile)
        result = db.access(sql)
        if result != 0:
            # BUG FIX: added the missing newline before "ERROR: filename="
            # (the non-fatal message below already had it).
            sys.stdout.write("ERROR: DB Access error, returned %d.\nERROR: DB STATUS: %s\nERROR: DB ERRMSG: %s\nERROR: sql=%s\nERROR: filename=%s\n" % (result, db.status(), db.errmsg(), sql, filename))
            return -1
    else:
        for off in offsets:
            # Each hit is (start byte, end byte, type).
            start, end, hit_type = off[0], off[1], off[2]
            content = text[start:end]
            # Drop non-ascii bytes before tokenizing.
            content = content.decode('ascii', 'ignore')
            parsed = library.parsetext(content)
            # re.escape backslash-escapes quotes as well, which is what keeps
            # this value from breaking out of the E'...' literal below.
            # NOTE(review): parameterized queries would still be safer than
            # string interpolation if the db layer supports them.
            content = re.escape(' '.join([token[1] for token in parsed]))
            # NOTE(review): hash() is not guaranteed stable across
            # interpreter runs/versions, so this column is not a
            # reproducible content hash.
            sql = """INSERT INTO copyright (agent_fk, pfile_fk, copy_startbyte, copy_endbyte, content, hash, type) VALUES (%d, %d, %d, %d, E'%s', E'%s', '%s')""" % (agent_pk, pfile, start, end, content, hex(abs(hash(content))), hit_type)
            result = db.access(sql)
            if result != 0:
                # Non-fatal: keep going with the remaining matches.
                sys.stdout.write("ERROR: (nonfatal) DB Access error, returned %d.\nERROR: DB STATUS: %s\nERROR: DB ERRMSG: %s\nERROR: sql=%s\nERROR: filename=%s\n" % (result, db.status(), db.errmsg(), sql, filename))
    return 0
def crossvalidation(data_file, folds=10):
    """Estimate classifier quality with randomized k-fold cross validation.

    data_file -- path to a file whose lines are repr()'d strings; the text
                 to be hi-lighted is wrapped in <s>...</s> tags.
    folds     -- number of folds; must be > 1 and <= number of data points.

    Returns {'raw accuracy': float, 'matrix': confusion matrix over the
    BIO classes, row-normalized per actual class}.  The data is shuffled
    first, so results are NOT reproducible across runs; do not use this
    for regression testing.
    """
    import ast  # local import: safe replacement for eval() below

    if folds <= 1:
        raise Exception("Number of folds is too small. A value greater than 1 is required.")
    # SECURITY FIX: ast.literal_eval only parses literals, whereas eval()
    # would execute arbitrary code planted in the data file.  Also close
    # the file handle instead of leaking it.
    with open(data_file) as handle:
        training_data = [ast.literal_eval(line) for line in handle.readlines()]
    N = len(training_data)
    if N < folds:
        raise Exception("Number of folds is greater than number of data points.")
    # The approximate number of data items in each fold.
    n = int(round(N / float(folds)))
    # Shuffle so fold membership is independent of the file ordering.
    random.shuffle(training_data)
    parsed_data = [library.parsetext(text) for text in training_data]
    tokens = [[parsed[j][0] for j in range(len(parsed))] for parsed in parsed_data]
    bio_data = [library.tokens_to_BIO(toks) for toks in tokens]
    # Partition indices into contiguous folds.  BUG FIX: the last fold now
    # extends to N, so trailing items are no longer silently dropped from
    # every fold when n rounds down.
    fold_index = []
    for i in range(folds):
        end = N if i == folds - 1 else min(n + n * i, N)
        fold_index.append(list(range(i * n, end)))
    accuracy = 0.0
    evaluated_folds = 0
    classes = ['B', 'I', 'O']
    # matrix[actual][predicted] counts, normalized per actual class below.
    matrix = {'B': {'B': 0.0, 'I': 0.0, 'O': 0.0},
              'I': {'B': 0.0, 'I': 0.0, 'O': 0.0},
              'O': {'B': 0.0, 'I': 0.0, 'O': 0.0}}
    for i in range(folds):
        print("Fold %d." % i)
        testing = fold_index[i]
        if not testing:
            # Rounding can make n*folds exceed N, leaving a late fold
            # empty; skip it instead of dividing by zero below.
            continue
        evaluated_folds += 1
        # Train on everything outside the current test fold.
        training = list(set(testing).symmetric_difference(set(range(N))))
        testing_data = [bio_data[d] for d in testing]
        fold_training = [bio_data[d] for d in training]
        PFC = library.tuned_model(fold_training)
        passed = 0
        for sample in testing_data:
            sample_tokens = sample[0]
            labels = sample[1]
            out = library.label_nb(PFC, sample_tokens)
            if out == labels:
                passed += 1
            for l in range(len(labels)):
                matrix[labels[l]][out[l]] += 1.0
        # Fraction of test items labeled perfectly in this fold.
        accuracy += passed / float(len(testing))
    raw_accuracy = accuracy / float(evaluated_folds)
    # Normalize each confusion-matrix row, guarding against a class that
    # never occurs in the gold labels (row sum of zero crashed before).
    for c1 in classes:
        s = sum(matrix[c1][c2] for c2 in classes)
        if s:
            for c2 in matrix[c1]:
                matrix[c1][c2] = matrix[c1][c2] / s
    print("Raw Accuracy: %1.2f" % raw_accuracy)
    print("------------------------")
    print(" | B | I | O ")
    print("------------------------")
    print(" B | %1.2f | %1.2f | %1.2f " % (matrix['B']['B'], matrix['B']['I'], matrix['B']['O']))
    print("------------------------")
    print(" I | %1.2f | %1.2f | %1.2f " % (matrix['I']['B'], matrix['I']['I'], matrix['I']['O']))
    print("------------------------")
    print(" O | %1.2f | %1.2f | %1.2f " % (matrix['O']['B'], matrix['O']['I'], matrix['O']['O']))
    print("------------------------")
    return {'raw accuracy': raw_accuracy, 'matrix': matrix}
# NOTE(review): this function is a duplicate (reformatted) of the
# crossvalidation() defined earlier in this file.  Python binds the last
# definition, so THIS copy is the one callers actually get; one of the
# two should be removed.
def crossvalidation(data_file, folds=10):
    """Run randomized k-fold cross validation and report statistics.

    This is a randomized algorithm, so you should NOT use it for
    regression testing.  data_file should be a file with each line
    containing a triple quoted canonical string representation, i.e. call
    repr on the string.  The text to be hi-lighted should be wrapped in
    <s>...</s> tags.  Outputs a dictionary that holds specific statistics
    about the performance of the classifier.
    """
    if folds <= 1:
        raise Exception(
            "Number of folds is too small. A value greater than 1 is required."
        )
    # SECURITY NOTE(review): eval() executes arbitrary expressions read
    # from the data file; ast.literal_eval would be the safe equivalent
    # for repr()'d strings.  The file handle is also never closed.
    training_data = [eval(line) for line in open(data_file).readlines()]
    N = len(training_data)
    if N < folds:
        raise Exception(
            "Number of folds is greater than number of data points.")
    # The approximate number of data items in each fold
    n = int(round(N / float(folds)))
    # shuffle the training data so we dont have any funky correlation issues
    # with its ordering.
    random.shuffle(training_data)
    # Parse each text; keep only the token strings (element 0 of each
    # parsed tuple), then convert to (tokens, BIO-labels) pairs.
    parsed_data = [library.parsetext(text) for text in training_data]
    tokens = [[parsed_data[i][j][0] for j in xrange(len(parsed_data[i]))]
              for i in xrange(N)]
    bio_data = [library.tokens_to_BIO(tokens[i]) for i in xrange(N)]
    # Contiguous index ranges, one per fold.  NOTE(review): when n rounds
    # down, items with index >= folds*n are never placed in any fold.
    fold_index = []
    for i in range(folds):
        fold_index.append(range(i * n, min([n + n * i, N])))
    accuracy = 0.0
    classes = ['B', 'I', 'O']
    # Confusion counts: matrix[actual label][predicted label].
    matrix = {
        'B': {
            'B': 0.0,
            'I': 0.0,
            'O': 0.0
        },
        'I': {
            'B': 0.0,
            'I': 0.0,
            'O': 0.0
        },
        'O': {
            'B': 0.0,
            'I': 0.0,
            'O': 0.0
        }
    }
    for i in range(folds):
        print "Fold %d." % i
        testing = fold_index[i]
        # Training set = all indices outside the current test fold.
        training = list(set(testing).symmetric_difference(set(range(N))))
        testing_data = [bio_data[d] for d in testing]
        # NOTE(review): rebinds (and shadows) the outer training_data list.
        training_data = [bio_data[d] for d in training]
        PFC = library.tuned_model(training_data)
        passed = 0
        for test in testing_data:
            tokens = test[0]
            labels = test[1]
            out = library.label_nb(PFC, tokens)
            if out == labels:
                passed += 1
            for l in range(len(labels)):
                matrix[labels[l]][out[l]] += 1.0
        # Fraction of test items labeled perfectly in this fold.
        # NOTE(review): raises ZeroDivisionError if a fold is empty.
        accuracy += passed / float(len(testing))
    raw_accuracy = accuracy / float(folds)
    # NOTE(review): `total` is computed but never used.
    total = sum([sum([matrix[c1][c2] for c2 in classes]) for c1 in classes])
    # Normalize each confusion-matrix row.  NOTE(review): raises
    # ZeroDivisionError if a class never occurs in the gold labels (s == 0).
    for c1 in classes:
        s = sum([matrix[c1][c2] for c2 in classes])
        for c2 in matrix[c1]:
            matrix[c1][c2] = matrix[c1][c2] / s
    print "Raw Accuracy: %1.2f" % raw_accuracy
    print "------------------------"
    print " | B | I | O "
    print "------------------------"
    print " B | %1.2f | %1.2f | %1.2f " % (matrix['B']['B'], matrix['B']['I'],
                                           matrix['B']['O'])
    print "------------------------"
    print " I | %1.2f | %1.2f | %1.2f " % (matrix['I']['B'], matrix['I']['I'],
                                           matrix['I']['O'])
    print "------------------------"
    print " O | %1.2f | %1.2f | %1.2f " % (matrix['O']['B'], matrix['O']['I'],
                                           matrix['O']['O'])
    print "------------------------"
    return {'raw accuracy': raw_accuracy, 'matrix': matrix}