def test_levenshtein(self):
    """Pin the expected results of the Levenshtein helpers used by the evaluation.

    Each case is ``(function, arguments, expected result)``; the cases are
    evaluated and checked one after another, in the order listed.
    """
    cases = [
        # distance: number of additions, deletions, updates
        (distance, ('a', 'ab'), 1),
        # ratio: similarity of two strings, in [0, 1]
        (ratio, ('a', 'b'), 0),
        (ratio, ('a', 'a'), 1),
        # setratio: in [0, 1], compares two sets by best fit, order doesn't matter
        (setratio, (['a', 'b'], ['b', 'a']), 1.0),
        (setratio, (['c', 'd'], ['b', 'a']), 0),
        # seqratio: in [0, 1], compares two sequences (order matters)
        (seqratio, (['a', 'b'], ['b', 'a']), 0.5),
        (seqratio, (['a', 'b'], ['a', 'b']), 1.0),
        (seqratio, (['a'], ['a', 'b']), 2 / 3),
    ]
    for func, args, expected in cases:
        eq(func(*args), expected)
def test_evaluate(self):
    """Score the parser output against the annotated mails and plot the result.

    For every mail in ``self.mails_denotated`` the raw email is predicted and
    parsed with ``self.quagga``, and the resulting blocks are compared field by
    field with the blocks converted from the mail's denotations.  Per-field
    scores are averaged per mail, plotted (one line per mail), then averaged
    over all scored mails and printed.

    Mails whose parsed and annotated block counts differ are skipped *before*
    any comparison is attempted, so misaligned blocks are never scored.  Mails
    without any block are skipped as well, because the per-mail average would
    otherwise divide by zero.
    """
    fields = ('from', 'to', 'cc', 'sent', 'subject', 'raw_header', 'type',
              'text')
    overall_accuracy = {}
    mail_count = 0

    for mail in self.mails_denotated:
        raw = mail.original_email
        email_input = EmailMessage(mail.path, mail.filename,
                                   ep.Parser().parsestr(raw))
        predicted = self.quagga._predict(email_input.clean_body)
        parsed = self.quagga._parse(predicted, email_input)
        denotation_blocks = DenotationBlockConverter.convert(mail.denotations)

        parsed_blocks = parsed['blocks']
        annotated_blocks = denotation_blocks['blocks']

        # zip() below would silently truncate to the shorter list, so reject
        # mismatched mails up front instead of scoring misaligned pairs.
        if len(parsed_blocks) != len(annotated_blocks):
            print("blocks have different length, skipping")
            continue
        block_count = len(annotated_blocks)
        if block_count == 0:
            print("mail has no blocks, skipping")
            continue

        for i, (parsed_block, annotated_block) in enumerate(
                zip(parsed_blocks, annotated_blocks)):
            # since some annotations are not consistent (some have day before
            # date) we normalize the parsed and the annotated stuff, only the
            # outcome matters anyway
            annotated_block['sent'] = Normalizer.normalize_sent(
                annotated_block['sent'])
            self.clean_parsed_block(parsed_block)
            self.clean_block(annotated_block)

            # every field starts as a perfect score; a leading root block is
            # only judged on its text
            block_accuracy = dict.fromkeys(fields, 1)
            is_root = (i == 0 and parsed_block['type'] == 'root'
                       and annotated_block['type'] == 'root')
            if not is_root:
                block_accuracy['from'] = ratio(parsed_block['raw_from'],
                                               annotated_block['from'])
                block_accuracy['to'] = setratio(parsed_block['raw_to'],
                                                annotated_block['to'])
                block_accuracy['cc'] = setratio(parsed_block['raw_cc'],
                                                annotated_block['cc'])
                block_accuracy['sent'] = ratio(parsed_block['sent'],
                                               annotated_block['sent'])
                block_accuracy['subject'] = ratio(parsed_block['subject'],
                                                  annotated_block['subject'])
                block_accuracy['raw_header'] = seqratio(
                    parsed_block['raw_header'], annotated_block['raw_header'])
                block_accuracy['type'] = ratio(parsed_block['type'],
                                               annotated_block['type'])
            block_accuracy['text'] = seqratio(parsed_block['text'],
                                              annotated_block['text'])
            annotated_block['error'] = block_accuracy

        # Sum the scores: text counts for every block, the header fields only
        # for the blocks after the first one.
        mail_accuracy = dict.fromkeys(fields, 0)
        for i, annotated_block in enumerate(annotated_blocks):
            block_error = annotated_block['error']
            if i == 0:
                mail_accuracy['text'] += block_error['text']
            else:
                for key, value in block_error.items():
                    mail_accuracy[key] += value

        for key in mail_accuracy:
            if key == 'text':
                mail_accuracy[key] /= block_count
            elif block_count == 1:
                # no block after the first one, so nothing could be wrong
                mail_accuracy[key] = 1
            else:
                mail_accuracy[key] /= block_count - 1

        # lists instead of dict views keep this working across matplotlib
        # versions
        plt.plot(list(mail_accuracy.keys()), list(mail_accuracy.values()),
                 label=mail.filename)

        for key, value in mail_accuracy.items():
            overall_accuracy[key] = overall_accuracy.get(key, 0) + value
        mail_count += 1

    # overall_accuracy is only non-empty when at least one mail was scored,
    # so mail_count cannot be zero inside this loop
    for key in overall_accuracy:
        overall_accuracy[key] /= mail_count

    print(overall_accuracy)
    # plt.legend()
    plt.show()
def image_match(self, id, image_path, default_match_rate=.79, VERBOSE='SEMI'):
    """Match the image files found under ``image_path`` to ``image_match`` rows.

    The base name (no directory, no extension) of every file matching
    ``image_path + '*.*'`` is normalised with ``self.normalize`` and compared,
    via ``setratio``, with the ``search_terms`` of every ``image_match`` row of
    system ``id`` that shares at least one search term.  The best candidate
    scoring above ``default_match_rate`` whose digits equal the file's digits
    wins.

    id                 -- system id used to filter ``image_match`` and stored
                          with unmatched names
    image_path         -- path prefix handed to ``glob`` (``'*.*'`` is appended)
    default_match_rate -- minimum score a candidate has to exceed
    VERBOSE            -- any truthy value prints progress; 'SEMI' and 'FULL'
                          ask for confirmation of matches scoring below .94;
                          'FULL' also reports every result

    Matches are written to ``image_match.image_file``, unmatched names are
    inserted into ``missing_entries``, and ``self.GAMES`` is committed.

    Note: the prints are single-argument parenthesised prints, which behave
    the same as the print statement on Python 2.
    """
    rom_list = []
    missing_list = []

    roms = [os.path.split(os.path.splitext(item)[0])[-1]
            for item in glob.glob(image_path + '*.*')]
    rom_count = len(roms)

    print('')
    print('%s %s' % (id, image_path))
    print('')

    for index, rom in enumerate(roms):
        #set minimum match ratio
        hi_score = default_match_rate
        best_match_game = None

        current_file_search_terms = unicode(self.normalize(rom))
        search_words = current_file_search_terms.split()
        rom_digits = [x for x in current_file_search_terms if x.isdigit()]

        #build search query
        #we are grabbing any entry that has at least 1 matching search term
        #the terms come from file names, so they are bound as parameters
        #instead of being pasted into the statement (a '"' in a name used to
        #break the query); without any term the pattern is '%%', as before
        like_words = search_words or [u'']
        where = ' OR '.join(['search_terms LIKE ?'] * len(like_words))
        #the system id stays interpolated on purpose so that it is still
        #compared as the same kind of literal as before
        query = ('SELECT id, search_terms, title FROM image_match WHERE system='
                 + str(id) + ' AND (' + where + ')')
        like_patterns = ['%' + word + '%' for word in like_words]

        for entry in self.GC.execute(query, like_patterns).fetchall():
            Lratio = setratio(search_words, entry[1].split())
            #the digit check makes sure sequels don't get matched to originals
            if (Lratio > hi_score
                    and rom_digits == [y for y in entry[1] if y.isdigit()]):
                hi_score = Lratio
                best_match_game = entry

        #Let user know current progress
        if VERBOSE:
            status = r"%10d/%d roms [%3.2f%%]" % (
                index + 1, rom_count, (index + 1) * 100. / rom_count)
            status = status + chr(8) * (len(status) + 1)
            sys.stdout.write('%s \r' % (status))
            sys.stdout.flush()

        #in verbose mode: ask if game matches
        if VERBOSE in ("SEMI", "FULL") and hi_score < .94 and best_match_game:
            question = 'Does %s match %s - %s' % (
                pcolor('cyan', "[" + rom + "]"),
                pcolor('cyan', "[" + best_match_game[2] + "]"),
                pcolor('yellow', "[" + "{0:.0f}%".format(float(hi_score) * 100) + "]"))
            if not self.raw_input_with_timeout(question, timeout=10.0):
                best_match_game = None

        if VERBOSE == 'FULL':
            #reporting is best effort: a failure to format or print the names
            #must not abort the matching run
            try:
                if best_match_game:
                    print('Closest match for %s is %s - %s' % (
                        pcolor('green', "[" + rom + "]"),
                        pcolor('green', "[" + best_match_game[2] + "]"),
                        pcolor('yellow', "[" + "{0:.0f}%".format(float(hi_score) * 100) + "]")))
                else:
                    print('No match found for %s' % (pcolor('red', "[" + rom + "]")))
            except Exception:
                pass

        #If a suitable match was found, remember it; otherwise record the miss
        if best_match_game:
            rom_list.append((rom, best_match_game[0]))
        else:
            missing_list.append((id, rom))

    self.GC.executemany('UPDATE image_match SET image_file=? WHERE id=?', rom_list)
    self.GC.executemany('INSERT INTO missing_entries (system, title) VALUES (?, ?)', missing_list)
    self.GAMES.commit()
    print('')
def console_match(self, platform, get_rom_name_with_crc, default_match_rate, VERBOSE, RUN_WHOLE_SYSTEM_FOLDER, dont_match):
    """Match the roms of one platform to ``console`` rows and store them in ``local_roms``.

    platform                -- mapping read for 'scraper_id', 'id', 'label',
                               'rom_path', 'command', 'include_extension' and
                               'include_full_path'
    get_rom_name_with_crc   -- passed through to ``self.get_name``
    default_match_rate      -- minimum score a candidate has to exceed
    VERBOSE                 -- any truthy value prints progress; 'SEMI' and
                               'FULL' ask for confirmation of matches scoring
                               below .94; 'FULL' also reports every result
    RUN_WHOLE_SYSTEM_FOLDER -- when truthy, all ``local_roms`` rows of the
                               platform are deleted and every rom is processed;
                               otherwise only roms without a row are processed
    dont_match              -- matching, stale-row cleanup and image lookup
                               only happen when this compares equal to False

    Every processed rom gets one ``local_roms`` row.  Unmatched roms are
    flagged 'no_match,'.  Returns None.

    Note: the prints are single-argument parenthesised prints, which behave
    the same as the print statement on Python 2.
    """
    column_names = [item[1] for item in self.GC.execute('PRAGMA table_info(console)').fetchall()]
    #kept as an equality test on purpose: None does not enable matching
    matching_enabled = (dont_match == False)  # noqa: E712
    images_dir = os.path.join(platform['rom_path'], 'images/')

    def find_images(pattern):
        #if name contains brackets [] with a minus '-' inside, glob will error out
        try:
            return glob.glob(pattern)
        except Exception:
            return []

    def find_title_images(match, title):
        #images named after the image_match entry or the matched title
        #best effort: any lookup failure simply means "no image found"
        try:
            row = self.GC.execute('SELECT image_file FROM image_match WHERE system=? and id=?',
                                  (match[3], match[0])).fetchone()
            if not row or not row[0]:
                return []
            patterns = [images_dir + row[0] + '.*',
                        os.path.join(images_dir, self.strip_accents(title) + '.*')]
        except Exception:
            return []
        found = []
        for pattern in patterns:
            found.extend(find_images(pattern))
        return found

    #prepare to get_name for non-arcade
    find_name = self.get_name(platform['scraper_id'], get_rom_name_with_crc)
    find_name.send(None)

    #load rom filenames
    print('Fetching %s rom list...' % pcolor('cyan', platform['label']))
    roms = self.get_stored_roms(platform['rom_path'])

    #Create Temp table with only the entries of the current system(s)
    print('Connecting to PiPlay Database...')
    self.GC.execute('DROP TABLE IF EXISTS temp_system')
    self.GC.execute('CREATE TEMP TABLE temp_system AS SELECT * FROM console WHERE 0')
    #NOTE(review): scraper_id is pasted into an IN (...) list, presumably
    #because it may name several systems; it must come from trusted config
    query = 'INSERT INTO temp_system SELECT * FROM console WHERE system in (%s)' % platform['scraper_id']
    self.GC.execute(query)
    self.GAMES.commit()

    if RUN_WHOLE_SYSTEM_FOLDER:
        #delete all entries for system
        self.LC.execute('DELETE FROM local_roms WHERE system = ?', (platform['id'],))
    else:
        if matching_enabled:
            #delete all entries that no longer have roms + previously unmatched entries
            #the rom names go through a temp table instead of being pasted
            #into the statement as a Python tuple repr, which broke on names
            #containing quotes or non-ASCII characters
            self.LC.execute('DROP TABLE IF EXISTS temp_current_roms')
            self.LC.execute('CREATE TEMP TABLE temp_current_roms (rom_file TEXT)')
            self.LC.executemany('INSERT INTO temp_current_roms (rom_file) VALUES (?)',
                                [(rom,) for rom in roms])
            self.LC.execute('DELETE FROM local_roms WHERE system = ? AND '
                            '(rom_file NOT IN (SELECT rom_file FROM temp_current_roms) '
                            'OR flags LIKE ?)',
                            (platform['id'], '%no_match%'))
            self.LC.execute('DROP TABLE temp_current_roms')

        #remove any remaining entries from list of roms
        known_roms = set(item[0] for item in self.LC.execute(
            'SELECT rom_file FROM local_roms WHERE system = ?', (platform['id'],)).fetchall())
        roms = list(set(roms) - known_roms)

    self.LOCAL.commit()

    if not roms:
        return

    rom_list = []
    rom_count = len(roms)
    matched_fields = ('title', 'search_terms', 'release_date', 'overview', 'esrb', 'genres',
                      'players', 'coop', 'publisher', 'developer', 'rating')

    for index, rom in enumerate(roms):
        #get rom name
        current_file_search_terms = find_name.send(os.path.join(platform['rom_path'], rom))
        find_name.send('get_ready')

        #create run command
        if platform['include_extension']:
            build_command = rom
        else:
            build_command = os.path.splitext(rom)[0]
        if platform['include_full_path']:
            build_command = os.path.join(platform['rom_path'], build_command)
        game_command = platform['command'] + ' "' + build_command + '"'

        #update what is already known about current entry
        game_info = Game(title = rom,
                         system = platform['id'],
                         search_terms = current_file_search_terms,
                         command = game_command,
                         rom_path = platform['rom_path'],
                         rom_file = rom)

        #set minimum match ratio
        hi_score = default_match_rate
        best_match_game = None
        temp_game_info = None

        if matching_enabled:
            search_words = unicode(current_file_search_terms).split()
            rom_digits = [x for x in current_file_search_terms if x.isdigit()]

            #build search query
            #we are grabbing any entry that has at least 1 matching search term
            #terms are bound as parameters; without any term the pattern is
            #'%%', as before.  The rows are fetched once for both passes.
            like_words = search_words or [u'']
            where = ' OR '.join(['search_terms LIKE ?'] * len(like_words))
            candidates = self.GC.execute(
                'SELECT id, search_terms, title, system FROM temp_system WHERE (' + where + ')',
                ['%' + word + '%' for word in like_words]).fetchall()

            #first pass: compare word by word
            for entry in candidates:
                Lratio = setratio(search_words, entry[1].split())
                #the digit check makes sure sequels don't get matched to originals
                if (Lratio > hi_score
                        and rom_digits == [y for y in entry[1] if y.isdigit()]):
                    hi_score = Lratio
                    best_match_game = entry

            #if no satisfactory match found, do second pass comparing each letter separately
            if not best_match_game:
                search_chars = [unicode(char) for char in current_file_search_terms]
                for entry in candidates:
                    Lratio = setratio(search_chars, [unicode(char) for char in entry[1]])
                    if (Lratio > hi_score
                            and rom_digits == [y for y in entry[1] if y.isdigit()]):
                        hi_score = Lratio
                        best_match_game = entry

        #in verbose mode: ask if game matches
        if VERBOSE in ("SEMI", "FULL") and hi_score < .94 and best_match_game:
            question = 'Does %s match %s - %s' % (
                pcolor('cyan', "[" + rom + "]"),
                pcolor('cyan', "[" + best_match_game[2] + "]"),
                pcolor('yellow', "[" + "{0:.0f}%".format(float(hi_score) * 100) + "]"))
            if not self.raw_input_with_timeout(question, timeout = 10.0):
                best_match_game = None

        if VERBOSE == "FULL":
            if best_match_game:
                print('Closest match for %s is %s - %s' % (
                    pcolor('green', "[" + rom + "]"),
                    pcolor('green', "[" + best_match_game[2] + "]"),
                    pcolor('yellow', "[" + "{0:.0f}%".format(float(hi_score) * 100) + "]")))
            else:
                print('No match found for %s' % (pcolor('red', "[" + rom + "]")))

        #Let user know current progress
        if VERBOSE:
            status = r"%10d/%d roms [%3.2f%%]" % (index + 1, rom_count, (index + 1) * 100. / rom_count)
            status = status + chr(8) * (len(status) + 1)
            sys.stdout.write('%s \r' % (status))
            sys.stdout.flush()

        #If a suitable match was found, pull info
        if best_match_game:
            temp_game_info = dict(zip(column_names, self.GC.execute(
                'SELECT * from temp_system where id=?', (best_match_game[0],)).fetchone()))
            for field in matched_fields:
                setattr(game_info, field, temp_game_info[field])
        else:
            game_info.flags = 'no_match,'

        #default to rom name with no extension. Boxart thread in romlistscene
        #will try both .jpg and .png extensions
        default_image = os.path.join(images_dir, os.path.splitext(rom)[0])
        images = []
        if matching_enabled:
            #prefer (user added) image, named same as rom + any extension
            images = find_images(default_image + '.*')
            #if no rom named image, then find title named image
            #game_info was created without an image, so there is nothing else
            #to preserve at this point
            if not images and best_match_game:
                images = find_title_images(best_match_game, temp_game_info['title'])
        game_info.image_file = images[0] if images else default_image

        rom_list.append((game_info.id,
                         game_info.system,
                         game_info.title,
                         game_info.search_terms,
                         None, None, #parent, cloneof -> for arcade
                         game_info.release_date,
                         game_info.overview,
                         game_info.esrb,
                         game_info.genres,
                         game_info.players,
                         game_info.coop,
                         game_info.publisher,
                         game_info.developer,
                         game_info.rating,
                         game_info.command,
                         game_info.rom_file,
                         game_info.rom_path,
                         game_info.image_file,
                         0,
                         game_info.flags))

    self.LC.executemany('INSERT INTO local_roms ' +
                        '(id, system, title, search_terms, parent, cloneof, release_date, overview, esrb, genres, ' +
                        'players, coop, publisher, developer, rating, command, rom_file, rom_path, image_file, number_of_runs, flags) ' +
                        'VALUES (' + ('?,' * 21)[:-1] + ')', rom_list)
    self.LOCAL.commit()
    print('')
def matching_distance(self, tokens_a, tokens_b, string_type):
    """Return a similarity score for two token lists.

    <tokens_a>: List of Tokens A
    <tokens_b>: List of Tokens B
    <string_type>: Type of distance algorithm
        0 = Levenshtein edit-distance
        1 = FREE SLOT
        2 = FREE SLOT (reserved for a Jaccard distance)
        3 = FREE SLOT (reserved for MASI, Measuring Agreement on Set-Valued Items)
        4 = Similarity ratio [Not final]

    Type 0: every token of A is scored with its smallest edit distance to any
    token of B, normalised by the length of the A token (0.0 when the token
    occurs in B, 1.0 when B is empty).  The mean of these scores plus 0.1 per
    token of length difference between A and B is the distance d; the result
    is ``1 - d``, or 0.0 when d exceeds 1.  Two empty lists score 1.0 and an
    empty list against a non-empty one scores 0.0.

    Type 4: the result of ``setratio(tokens_a, tokens_b)``, which looks for the
    best match between the strings of both lists, so the order doesn't matter.

    Any other type returns 0.0.
    """
    # 4: Similarity ratio
    if string_type == 4:
        return setratio(tokens_a, tokens_b)

    # 1-3: FREE SLOTS, and anything unknown
    if string_type != 0:
        return 0.0

    # 0: Levenshtein edit-distance
    if tokens_a:
        token_scores = []
        for token_a in tokens_a:
            if token_a in tokens_b:
                token_scores.append(0.0)
            elif not tokens_b:
                token_scores.append(1.0)
            else:
                # an empty token has length 0; normalise by at least 1 so it
                # cannot divide by zero
                norm = float(max(len(token_a), 1))
                # smallest score = most similar A and B tokens
                token_scores.append(
                    min(distance(token_a, token_b) / norm
                        for token_b in tokens_b))
        mean_distance = sum(token_scores) / float(len(token_scores))
    else:
        # nothing to average: identical if B is empty too, otherwise nothing
        # in B is matched by A
        mean_distance = 1.0 if tokens_b else 0.0

    # each token of length difference costs a tenth
    total_distance = mean_distance + abs(len(tokens_a) - len(tokens_b)) / 10.0

    if total_distance > 1:
        return 0.0
    return 1 - total_distance