def analyze(pfile_pk, filename, agent_pk, model, db):
    """Label the copyright statements in one file and store them via ``db``.

    The file is located with ``libfosspython.repMkPath('files', filename)``,
    at most ``READMAX`` units of it are read, and the text is passed to
    ``library.label_file`` together with ``model``.  One row is inserted into
    the ``copyright`` table per labelled span; when nothing is labelled a
    single placeholder row with NULL offsets/content and type ``'statement'``
    is inserted instead.

    Parameters:
        pfile_pk: key of the pfile row; must be convertible with ``int()``.
        filename: name handed to ``libfosspython.repMkPath`` to find the file.
        agent_pk: key of the agent row; interpolated with ``%d``.
        model:    labelling model, passed through to ``library.label_file``.
        db:       object providing ``access(sql)`` (0 is treated as success),
                  ``status()`` and ``errmsg()``.

    Returns:
        -1 if ``pfile_pk`` is not a number, the file does not exist, or the
        placeholder insert fails; 0 otherwise.  A failed insert of an
        individual labelled span is reported but is not fatal.

    All diagnostics are written to standard output.
    """
    try:
        pfile = int(pfile_pk)
    except (TypeError, ValueError):
        # Report the rejected value itself; TypeError covers None and other
        # non-string, non-numeric inputs that int() refuses.
        sys.stdout.write(
            'ERROR: Provided pfile_pk is not a number: %r\n' % (pfile_pk,))
        return -1

    path = libfosspython.repMkPath('files', filename)
    if not os.path.exists(path):
        sys.stdout.write('ERROR: File not found. path=%s\n' % (path,))
        return -1

    # try/finally (rather than relying on garbage collection) guarantees the
    # handle is released even if the read fails.
    source = open(path)
    try:
        text = source.read(READMAX)
    finally:
        source.close()

    offsets = library.label_file(text, model)

    if len(offsets) == 0:
        # Nothing was labelled: record a placeholder row for this file.
        sql = (
            "INSERT INTO copyright (agent_fk, pfile_fk, copy_startbyte, "
            "copy_endbyte, content, hash, type) "
            "VALUES (%d, %d, NULL, NULL, NULL, NULL, 'statement')"
        ) % (agent_pk, pfile)
        result = db.access(sql)
        if result != 0:
            sys.stdout.write(
                "ERROR: DB Access error, returned %d.\n"
                "ERROR: DB STATUS: %s\n"
                "ERROR: DB ERRMSG: %s\n"
                "ERROR: sql=%s\n"
                "ERROR: filename=%s\n"
                % (result, db.status(), db.errmsg(), sql, filename))
            return -1
        return 0

    for offset in offsets:
        start, end, kind = offset[0], offset[1], offset[2]

        # Make sure the statement is plain ASCII before it is tokenised.
        statement = text[start:end].decode('ascii', 'ignore')
        tokens = library.parsetext(statement)

        # NOTE(review): the SQL is assembled by string interpolation and
        # re.escape() is the only escaping applied to the content (inside a
        # PostgreSQL E'...' literal); `kind` is interpolated unescaped.  If
        # the db wrapper offers parameterised queries, prefer those.
        content = re.escape(' '.join([token[1] for token in tokens]))
        sql = (
            "INSERT INTO copyright (agent_fk, pfile_fk, copy_startbyte, "
            "copy_endbyte, content, hash, type) "
            "VALUES (%d, %d, %d, %d, E'%s', E'%s', '%s')"
        ) % (agent_pk, pfile, start, end, content,
             hex(abs(hash(content))), kind)

        result = db.access(sql)
        if result != 0:
            # A single failed row is reported; the remaining spans are still
            # processed and the function still returns 0.
            sys.stdout.write(
                "ERROR: (nonfatal) DB Access error, returned %d.\n"
                "ERROR: DB STATUS: %s\n"
                "ERROR: DB ERRMSG: %s\n"
                "ERROR: sql=%s\n"
                "ERROR: filename=%s\n"
                % (result, db.status(), db.errmsg(), sql, filename))

    return 0
try: model = pickle.load(open(options.model)) except: print >> sys.stdout, 'You must specify a training file to create a model.\n\n' optparser.print_usage() sys.exit(1) if options.version: print "Source hash: %s" % hex(abs(hash(open(sys.argv[0]).read()))) print 'Model hash: %s' % (model['id']) if options.analyze_from_file: files = [line.rstrip() for line in open(options.analyze_from_file).readlines()] for file in files: text = open(file).read(READMAX) results = library.label_file(text,model) print "%s :: " % (file) if len(results) == 0: print "No copyrights" for i in range(len(results)): print "\t[%d:%d:%s] %r" % (results[i][0], results[i][1], results[i][2], text[results[i][0]:results[i][1]]) if options.analyze_from_command_line: files = args for file in files: text = open(file).read(READMAX) results = library.label_file(text,model) print "%s :: " % (file) if len(results) == 0: print "No copyrights" for i in range(len(results)):