def search( self ):
    """Identify the issue in self.comic_archive by cover-matching on Comic Vine.

    Builds candidate series from the archive's metadata keys, scores each
    candidate issue's cover against hashes of the archive's cover (and, on
    weak scores, extra interior pages plus remote alternate covers), then
    prunes to the best matches.

    Returns:
        self.match_list -- list of match dicts (possibly empty).
        Also sets self.search_result to one of the Result* codes.
    """
    ca = self.comic_archive
    self.match_list = []
    self.cancel = False
    self.search_result = self.ResultNoMatches

    # PIL is required for the image hashing below; bail out early without it.
    if not pil_available:
        self.log_msg( "Python Imaging Library (PIL) is not available and is needed for issue identification." )
        return self.match_list

    if not ca.seemsToBeAComicArchive():
        self.log_msg( "Sorry, but "+ opts.filename + " is not a comic archive!")
        return self.match_list

    cover_image_data = ca.getPage( self.cover_page_index )
    cover_hash = self.calculateHash( cover_image_data )

    # check the aspect ratio:
    # if it's wider than it is high, it's probably a two page spread;
    # if so, crop it and calculate a second hash from the right-hand half
    narrow_cover_hash = None
    aspect_ratio = self.getAspectRatio( cover_image_data )
    if aspect_ratio < 1.0:
        right_side_image_data = self.cropCover( cover_image_data )
        if right_side_image_data is not None:
            narrow_cover_hash = self.calculateHash( right_side_image_data )

    keys = self.getSearchKeys()
    # normalize the issue number
    keys['issue_number'] = IssueString(keys['issue_number']).asString()

    # we need, at minimum, a series and issue number
    if keys['series'] is None or keys['issue_number'] is None:
        self.log_msg("Not enough info for a search!")
        return []

    self.log_msg( "Going to search for:" )
    self.log_msg( "\tSeries: " + keys['series'] )
    self.log_msg( "\tIssue : " + keys['issue_number'] )
    if keys['issue_count'] is not None:
        self.log_msg( "\tCount : " + str(keys['issue_count']) )
    if keys['year'] is not None:
        self.log_msg( "\tYear : " + str(keys['year']) )
    if keys['month'] is not None:
        self.log_msg( "\tMonth : " + str(keys['month']) )

    comicVine = ComicVineTalker( )
    comicVine.setLogFunc( self.output_function )

    self.log_msg( u"Searching for {0} #{1} ...".format( keys['series'], keys['issue_number']) )
    try:
        cv_search_results = comicVine.searchForSeries( keys['series'] )
    except ComicVineTalkerException:
        self.log_msg( "Network issue while searching for series. Aborting...")
        return []

    # FIX: idiomatic truthiness test instead of "== True"
    if self.cancel:
        return []

    # FIX: searchForSeries() returns None when Comic Vine reports an API
    # error -- iterating None below would raise TypeError.
    if cv_search_results is None:
        return []

    # First-pass filter: drop series whose names are too long, whose
    # publisher is blacklisted, or which started after the issue's year.
    series_second_round_list = []
    for item in cv_search_results:
        length_approved = False
        publisher_approved = True
        date_approved = True

        # remove any series that starts after the issue year
        if keys['year'] is not None and str(keys['year']).isdigit() and item['start_year'] is not None and str(item['start_year']).isdigit():
            if int(keys['year']) < int(item['start_year']):
                date_approved = False

        # assume that our search name is close to the actual name, say
        # within, e.g., 5 chars (self.length_delta_thresh)
        shortened_key = utils.removearticles(keys['series'])
        shortened_item_name = utils.removearticles(item['name'])
        if len( shortened_item_name ) < ( len( shortened_key ) + self.length_delta_thresh):
            length_approved = True

        # remove any series from publishers on the blacklist
        if item['publisher'] is not None:
            publisher = item['publisher']['name']
            if publisher is not None and publisher.lower() in self.publisher_blacklist:
                publisher_approved = False

        if length_approved and publisher_approved and date_approved:
            series_second_round_list.append(item)

    self.log_msg( "Searching in " + str(len(series_second_round_list)) +" series" )

    if self.callback is not None:
        self.callback( 0, len(series_second_round_list))

    # now sort the list by name length
    series_second_round_list.sort(key=lambda x: len(x['name']), reverse=False)

    # build a list of volume IDs for the batched issue query
    volume_id_list = list()
    for series in series_second_round_list:
        volume_id_list.append( series['id'])

    try:
        issue_list = comicVine.fetchIssuesByVolumeIssueNumAndYear( volume_id_list, keys['issue_number'], keys['year'])
    except ComicVineTalkerException:
        self.log_msg( "Network issue while searching for series details. Aborting...")
        return []

    # FIX: guard against a None result (API error), same as above.
    if issue_list is None:
        return []

    # now re-associate the issues and volumes
    shortlist = list()
    for issue in issue_list:
        for series in series_second_round_list:
            if series['id'] == issue['volume']['id']:
                shortlist.append( (series, issue) )
                break

    if keys['year'] is None:
        self.log_msg( u"Found {0} series that have an issue #{1}".format(len(shortlist), keys['issue_number']) )
    else:
        self.log_msg( u"Found {0} series that have an issue #{1} from {2}".format(len(shortlist), keys['issue_number'], keys['year'] ))

    # now we have a shortlist of volumes with the desired issue number
    # Do first round of cover matching
    counter = len(shortlist)
    for series, issue in shortlist:
        if self.callback is not None:
            self.callback( counter, len(shortlist)*3)
            counter += 1
        self.log_msg( u"Examining covers for ID: {0} {1} ({2}) ...".format(
                        series['id'],
                        series['name'],
                        series['start_year']), newline=False )

        # parse out the cover date
        day, month, year = comicVine.parseDateStr( issue['cover_date'] )

        # Now check the cover match against the primary image
        hash_list = [ cover_hash ]
        if narrow_cover_hash is not None:
            hash_list.append(narrow_cover_hash)

        try:
            image_url = issue['image']['super_url']
            thumb_url = issue['image']['thumb_url']
            page_url = issue['site_detail_url']

            score_item = self.getIssueCoverMatchScore( comicVine, issue['id'], image_url, thumb_url, page_url, hash_list, useRemoteAlternates = False )
        except:
            # NOTE(review): intentionally broad -- getIssueCoverMatchScore
            # presumably signals cancellation/network trouble by raising;
            # TODO confirm the exception types and narrow this clause.
            self.match_list = []
            return self.match_list

        match = dict()
        match['series'] = u"{0} ({1})".format(series['name'], series['start_year'])
        match['distance'] = score_item['score']
        match['issue_number'] = keys['issue_number']
        match['cv_issue_count'] = series['count_of_issues']
        match['url_image_hash'] = score_item['hash']
        match['issue_title'] = issue['name']
        match['issue_id'] = issue['id']
        match['volume_id'] = series['id']
        match['month'] = month
        match['year'] = year
        match['publisher'] = None
        if series['publisher'] is not None:
            match['publisher'] = series['publisher']['name']
        match['image_url'] = image_url
        match['thumb_url'] = thumb_url
        match['page_url'] = page_url
        match['description'] = issue['description']

        self.match_list.append(match)

        self.log_msg( " --> {0}".format(match['distance']), newline=False )

        self.log_msg( "" )

    if len(self.match_list) == 0:
        self.log_msg( ":-( no matches!" )
        self.search_result = self.ResultNoMatches
        return self.match_list

    # sort list by image match scores (lower distance == better match)
    self.match_list.sort(key=lambda k: k['distance'])

    l = []
    for i in self.match_list:
        l.append( i['distance'] )
    self.log_msg( "Compared to covers in {0} issue(s):".format(len(self.match_list)), newline=False)
    self.log_msg( str(l))

    def print_match(item):
        # local helper: one-line summary of a match dict
        self.log_msg( u"-----> {0} #{1} {2} ({3}/{4}) -- score: {5}".format(
                            item['series'],
                            item['issue_number'],
                            item['issue_title'],
                            item['month'],
                            item['year'],
                            item['distance']) )

    best_score = self.match_list[0]['distance']

    if best_score >= self.min_score_thresh:
        # we have 1 or more low-confidence matches (all bad cover scores);
        # look at a few more pages in the archive, and also alternate covers online
        self.log_msg( "Very weak scores for the cover. Analyzing alternate pages and covers..." )
        hash_list = [ cover_hash ]
        if narrow_cover_hash is not None:
            hash_list.append(narrow_cover_hash)
        for i in range( 1, min(3, ca.getNumberOfPages())):
            image_data = ca.getPage(i)
            page_hash = self.calculateHash( image_data )
            hash_list.append( page_hash )

        second_match_list = []
        counter = 2*len(self.match_list)
        for m in self.match_list:
            if self.callback is not None:
                self.callback( counter, len(self.match_list)*3)
                counter += 1
            self.log_msg( u"Examining alternate covers for ID: {0} {1} ...".format(
                            m['volume_id'],
                            m['series']), newline=False )
            try:
                score_item = self.getIssueCoverMatchScore( comicVine, m['issue_id'], m['image_url'], m['thumb_url'], m['page_url'], hash_list, useRemoteAlternates = True )
            except:
                # NOTE(review): same broad clause as the first-round loop above.
                self.match_list = []
                return self.match_list
            self.log_msg("--->{0}".format(score_item['score']))
            self.log_msg( "" )

            if score_item['score'] < self.min_alternate_score_thresh:
                second_match_list.append(m)
                m['distance'] = score_item['score']

        if len( second_match_list ) == 0:
            if len( self.match_list) == 1:
                self.log_msg( "No matching pages in the issue." )
                self.log_msg( u"--------------------------------------------------")
                print_match(self.match_list[0])
                self.log_msg( u"--------------------------------------------------")
                self.search_result = self.ResultFoundMatchButBadCoverScore
            else:
                self.log_msg( u"--------------------------------------------------")
                self.log_msg( u"Multiple bad cover matches! Need to use other info..." )
                self.log_msg( u"--------------------------------------------------")
                self.search_result = self.ResultMultipleMatchesWithBadImageScores
            return self.match_list
        else:
            # We did good, found something!
            self.log_msg( "Success in secondary/alternate cover matching!" )

            self.match_list = second_match_list
            # sort new list by image match scores
            self.match_list.sort(key=lambda k: k['distance'])
            best_score = self.match_list[0]['distance']
            self.log_msg("[Second round cover matching: best score = {0}]".format(best_score))
            # now drop down into the rest of the processing

    if self.callback is not None:
        self.callback( 99, 100)

    # now pare down list, remove any item more than specified distant from the top scores
    for item in reversed(self.match_list):
        if item['distance'] > best_score + self.min_score_distance:
            self.match_list.remove(item)

    # One more test for the case choosing limited series first issue vs a trade with the same cover:
    # if we have a given issue count > 1 and the volume from CV has count==1, remove it from match list
    if len(self.match_list) >= 2 and keys['issue_count'] is not None and keys['issue_count'] != 1:
        new_list = list()
        for match in self.match_list:
            if match['cv_issue_count'] != 1:
                new_list.append(match)
            else:
                self.log_msg("Removing volume {0} [{1}] from consideration (only 1 issue)".format(match['series'], match['volume_id']))

        if len(new_list) > 0:
            self.match_list = new_list

    if len(self.match_list) == 1:
        self.log_msg( u"--------------------------------------------------")
        print_match(self.match_list[0])
        self.log_msg( u"--------------------------------------------------")
        self.search_result = self.ResultOneGoodMatch

    elif len(self.match_list) == 0:
        self.log_msg( u"--------------------------------------------------")
        self.log_msg( "No matches found :(" )
        self.log_msg( u"--------------------------------------------------")
        self.search_result = self.ResultNoMatches
    else:
        # we've got multiple good matches:
        # FIX: corrected user-facing typo ("likley candiate")
        self.log_msg( "More than one likely candidate." )
        self.search_result = self.ResultMultipleGoodMatches
        self.log_msg( u"--------------------------------------------------")
        for item in self.match_list:
            print_match(item)
        self.log_msg( u"--------------------------------------------------")

    return self.match_list
def searchForSeries(self, series_name, callback=None, refresh_cache=False):
    """Search Comic Vine for volumes matching *series_name*.

    The cleaned-up name is first looked up in the local ComicVineCacher
    cache (skipped when refresh_cache is True).  Otherwise the name is
    turned into an AND-ed query and all result pages are fetched.  When
    callback is None, progress is written to the log; otherwise
    callback(current, total) is invoked as pages arrive.

    Returns a list of volume result dicts; fresh results are written back
    to the cache under the cleaned-up (pre-quoting) name.
    """
    # remove cruft from the search string
    series_name = utils.removearticles(series_name).lower().strip()

    # before we search online, look in our cache, since we might have
    # done this same search recently
    cvc = ComicVineCacher()
    if not refresh_cache:
        cached_search_results = cvc.get_search_results(series_name)

        if len(cached_search_results) > 0:
            return cached_search_results

    # remember the cleaned name: the cache is keyed on it, not on the
    # URL-quoted query string built below
    original_series_name = series_name

    # We need to make the series name into an "AND"ed query list
    query_word_list = series_name.split()
    and_list = ['AND'] * (len(query_word_list) - 1)
    and_list.append('')
    # zipper up the two lists
    query_list = zip(query_word_list, and_list)
    # flatten the list
    query_list = [item for sublist in query_list for item in sublist]
    # convert back to a string
    query_string = " ".join(query_list).strip()

    query_string = urllib.quote_plus(query_string.encode("utf-8"))

    search_url = self.api_base_url + "/search/?api_key=" + self.api_key + "&format=json&resources=volume&query=" + \
        query_string + \
        "&field_list=name,id,start_year,publisher,image,description,count_of_issues"
    cv_response = self.getCVContent(search_url + "&page=1")

    search_results = list()

    # pagination bookkeeping;
    # see http://api.comicvine.com/documentation/#handling_responses
    # (FIX: dropped the unused 'limit' local that was read here)
    current_result_count = cv_response['number_of_page_results']
    total_result_count = cv_response['number_of_total_results']
    if callback is None:
        self.writeLog(
            "Found {0} of {1} results\n".format(
                cv_response['number_of_page_results'],
                cv_response['number_of_total_results']))
    search_results.extend(cv_response['results'])
    page = 1

    if callback is not None:
        callback(current_result_count, total_result_count)

    # see if we need to keep asking for more pages...
    while (current_result_count < total_result_count):
        if callback is None:
            self.writeLog(
                "getting another page of results {0} of {1}...\n".format(
                    current_result_count,
                    total_result_count))
        page += 1

        cv_response = self.getCVContent(search_url + "&page=" + str(page))

        search_results.extend(cv_response['results'])
        current_result_count += cv_response['number_of_page_results']
        if callback is not None:
            callback(current_result_count, total_result_count)

    # cache these search results
    cvc.add_search_results(original_series_name, search_results)

    return search_results
def search(self):
    """Identify the issue in self.comic_archive by cover-matching on Comic Vine.

    Builds candidate series from the archive's metadata keys, scores each
    candidate issue's cover against hashes of the archive's cover (and, on
    weak scores, extra interior pages plus remote alternate covers), then
    prunes to the best matches.

    Returns:
        self.match_list -- list of match dicts (possibly empty).
        Also sets self.search_result to one of the Result* codes.
    """
    ca = self.comic_archive
    self.match_list = []
    self.cancel = False
    self.search_result = self.ResultNoMatches

    # PIL is required for the image hashing below; bail out early without it.
    if not pil_available:
        self.log_msg(
            "Python Imaging Library (PIL) is not available and is needed for issue identification.")
        return self.match_list

    if not ca.seemsToBeAComicArchive():
        self.log_msg(
            "Sorry, but " + opts.filename + " is not a comic archive!")
        return self.match_list

    cover_image_data = ca.getPage(self.cover_page_index)
    cover_hash = self.calculateHash(cover_image_data)

    # check the aspect ratio
    # if it's wider than it is high, it's probably a two page spread
    # if so, crop it and calculate a second hash
    narrow_cover_hash = None
    aspect_ratio = self.getAspectRatio(cover_image_data)
    if aspect_ratio < 1.0:
        right_side_image_data = self.cropCover(cover_image_data)
        if right_side_image_data is not None:
            narrow_cover_hash = self.calculateHash(right_side_image_data)

    #self.log_msg("Cover hash = {0:016x}".format(cover_hash))

    keys = self.getSearchKeys()
    # normalize the issue number
    keys['issue_number'] = IssueString(keys['issue_number']).asString()

    # we need, at minimum, a series and issue number
    if keys['series'] is None or keys['issue_number'] is None:
        self.log_msg("Not enough info for a search!")
        return []

    self.log_msg("Going to search for:")
    self.log_msg("\tSeries: " + keys['series'])
    self.log_msg("\tIssue: " + keys['issue_number'])
    if keys['issue_count'] is not None:
        self.log_msg("\tCount: " + str(keys['issue_count']))
    if keys['year'] is not None:
        self.log_msg("\tYear: " + str(keys['year']))
    if keys['month'] is not None:
        self.log_msg("\tMonth: " + str(keys['month']))

    #self.log_msg("Publisher Blacklist: " + str(self.publisher_blacklist))
    comicVine = ComicVineTalker()
    # honor the instance's rate-limit policy on the talker
    comicVine.wait_for_rate_limit = self.waitAndRetryOnRateLimit

    comicVine.setLogFunc(self.output_function)

    # self.log_msg(("Searching for " + keys['series'] + "...")
    self.log_msg(u"Searching for {0} #{1} ...".format(
        keys['series'], keys['issue_number']))
    try:
        cv_search_results = comicVine.searchForSeries(keys['series'])
    except ComicVineTalkerException:
        self.log_msg(
            "Network issue while searching for series. Aborting...")
        return []

    #self.log_msg("Found " + str(len(cv_search_results)) + " initial results")
    if self.cancel:
        return []

    # searchForSeries() can return None on a Comic Vine API error
    if cv_search_results is None:
        return []

    series_second_round_list = []

    # First-pass filter: drop series with too-long names, blacklisted
    # publishers, or start dates after the issue's year.
    #self.log_msg("Removing results with too long names, banned publishers, or future start dates")
    for item in cv_search_results:
        length_approved = False
        publisher_approved = True
        date_approved = True

        # remove any series that starts after the issue year
        if keys['year'] is not None and str(
                keys['year']).isdigit() and item['start_year'] is not None and str(
                item['start_year']).isdigit():
            if int(keys['year']) < int(item['start_year']):
                date_approved = False

        # assume that our search name is close to the actual name, say
        # within ,e.g. 5 chars
        shortened_key = utils.removearticles(keys['series'])
        shortened_item_name = utils.removearticles(item['name'])
        if len(shortened_item_name) < (
                len(shortened_key) + self.length_delta_thresh):
            length_approved = True

        # remove any series from publishers on the blacklist
        if item['publisher'] is not None:
            publisher = item['publisher']['name']
            if publisher is not None and publisher.lower(
            ) in self.publisher_blacklist:
                publisher_approved = False

        if length_approved and publisher_approved and date_approved:
            series_second_round_list.append(item)

    self.log_msg(
        "Searching in " + str(len(series_second_round_list)) + " series")

    if self.callback is not None:
        self.callback(0, len(series_second_round_list))

    # now sort the list by name length
    series_second_round_list.sort(
        key=lambda x: len(x['name']), reverse=False)

    # build a list of volume IDs
    volume_id_list = list()
    for series in series_second_round_list:
        volume_id_list.append(series['id'])

    try:
        issue_list = comicVine.fetchIssuesByVolumeIssueNumAndYear(
            volume_id_list, keys['issue_number'], keys['year'])
    except ComicVineTalkerException:
        self.log_msg(
            "Network issue while searching for series details. Aborting...")
        return []

    # again, None signals an API-level failure
    if issue_list is None:
        return []

    shortlist = list()
    # now re-associate the issues and volumes
    for issue in issue_list:
        for series in series_second_round_list:
            if series['id'] == issue['volume']['id']:
                shortlist.append((series, issue))
                break

    if keys['year'] is None:
        self.log_msg(u"Found {0} series that have an issue #{1}".format(
            len(shortlist), keys['issue_number']))
    else:
        self.log_msg(
            u"Found {0} series that have an issue #{1} from {2}".format(
                len(shortlist), keys['issue_number'], keys['year']))

    # now we have a shortlist of volumes with the desired issue number
    # Do first round of cover matching
    counter = len(shortlist)
    for series, issue in shortlist:
        if self.callback is not None:
            self.callback(counter, len(shortlist) * 3)
            counter += 1
        self.log_msg(u"Examining covers for ID: {0} {1} ({2}) ...".format(
            series['id'],
            series['name'],
            series['start_year']), newline=False)

        # parse out the cover date
        day, month, year = comicVine.parseDateStr(issue['cover_date'])

        # Now check the cover match against the primary image
        hash_list = [cover_hash]
        if narrow_cover_hash is not None:
            hash_list.append(narrow_cover_hash)

        try:
            image_url = issue['image']['super_url']
            thumb_url = issue['image']['thumb_url']
            page_url = issue['site_detail_url']

            score_item = self.getIssueCoverMatchScore(
                comicVine,
                issue['id'],
                image_url,
                thumb_url,
                page_url,
                hash_list,
                useRemoteAlternates=False)
        except:
            # NOTE(review): intentionally broad -- presumably catches the
            # identifier's cancel/network exceptions; TODO confirm and narrow
            self.match_list = []
            return self.match_list

        match = dict()
        match['series'] = u"{0} ({1})".format(
            series['name'], series['start_year'])
        match['distance'] = score_item['score']
        match['issue_number'] = keys['issue_number']
        match['cv_issue_count'] = series['count_of_issues']
        match['url_image_hash'] = score_item['hash']
        match['issue_title'] = issue['name']
        match['issue_id'] = issue['id']
        match['volume_id'] = series['id']
        match['month'] = month
        match['year'] = year
        match['publisher'] = None
        if series['publisher'] is not None:
            match['publisher'] = series['publisher']['name']
        match['image_url'] = image_url
        match['thumb_url'] = thumb_url
        match['page_url'] = page_url
        match['description'] = issue['description']

        self.match_list.append(match)

        self.log_msg(" --> {0}".format(match['distance']), newline=False)

        self.log_msg("")

    if len(self.match_list) == 0:
        self.log_msg(":-(no matches!")
        self.search_result = self.ResultNoMatches
        return self.match_list

    # sort list by image match scores (lower distance == better match)
    self.match_list.sort(key=lambda k: k['distance'])

    l = []
    for i in self.match_list:
        l.append(i['distance'])
    self.log_msg("Compared to covers in {0} issue(s):".format(
        len(self.match_list)), newline=False)
    self.log_msg(str(l))

    def print_match(item):
        # local helper: one-line summary of a match dict
        self.log_msg(u"-----> {0} #{1} {2} ({3}/{4}) -- score: {5}".format(
            item['series'],
            item['issue_number'],
            item['issue_title'],
            item['month'],
            item['year'],
            item['distance']))

    best_score = self.match_list[0]['distance']

    if best_score >= self.min_score_thresh:
        # we have 1 or more low-confidence matches (all bad cover scores)
        # look at a few more pages in the archive, and also alternate
        # covers online
        self.log_msg(
            "Very weak scores for the cover. Analyzing alternate pages and covers...")
        hash_list = [cover_hash]
        if narrow_cover_hash is not None:
            hash_list.append(narrow_cover_hash)
        for i in range(1, min(3, ca.getNumberOfPages())):
            image_data = ca.getPage(i)
            page_hash = self.calculateHash(image_data)
            hash_list.append(page_hash)

        second_match_list = []
        counter = 2 * len(self.match_list)
        for m in self.match_list:
            if self.callback is not None:
                self.callback(counter, len(self.match_list) * 3)
                counter += 1
            self.log_msg(
                u"Examining alternate covers for ID: {0} {1} ...".format(
                    m['volume_id'],
                    m['series']), newline=False)
            try:
                score_item = self.getIssueCoverMatchScore(
                    comicVine,
                    m['issue_id'],
                    m['image_url'],
                    m['thumb_url'],
                    m['page_url'],
                    hash_list,
                    useRemoteAlternates=True)
            except:
                # NOTE(review): same broad clause as the first-round loop
                self.match_list = []
                return self.match_list
            self.log_msg("--->{0}".format(score_item['score']))
            self.log_msg("")

            if score_item['score'] < self.min_alternate_score_thresh:
                second_match_list.append(m)
                m['distance'] = score_item['score']

        if len(second_match_list) == 0:
            if len(self.match_list) == 1:
                self.log_msg("No matching pages in the issue.")
                self.log_msg(
                    u"--------------------------------------------------------------------------")
                print_match(self.match_list[0])
                self.log_msg(
                    u"--------------------------------------------------------------------------")
                self.search_result = self.ResultFoundMatchButBadCoverScore
            else:
                self.log_msg(
                    u"--------------------------------------------------------------------------")
                self.log_msg(
                    u"Multiple bad cover matches! Need to use other info...")
                self.log_msg(
                    u"--------------------------------------------------------------------------")
                self.search_result = self.ResultMultipleMatchesWithBadImageScores
            return self.match_list
        else:
            # We did good, found something!
            self.log_msg("Success in secondary/alternate cover matching!")

            self.match_list = second_match_list
            # sort new list by image match scores
            self.match_list.sort(key=lambda k: k['distance'])
            best_score = self.match_list[0]['distance']
            self.log_msg(
                "[Second round cover matching: best score = {0}]".format(best_score))

            # now drop down into the rest of the processing

    if self.callback is not None:
        self.callback(99, 100)

    # now pare down list, remove any item more than specified distant from
    # the top scores
    for item in reversed(self.match_list):
        if item['distance'] > best_score + self.min_score_distance:
            self.match_list.remove(item)

    # One more test for the case choosing limited series first issue vs a trade with the same cover:
    # if we have a given issue count > 1 and the volume from CV has
    # count==1, remove it from match list
    if len(self.match_list) >= 2 and keys[
            'issue_count'] is not None and keys['issue_count'] != 1:
        new_list = list()
        for match in self.match_list:
            if match['cv_issue_count'] != 1:
                new_list.append(match)
            else:
                self.log_msg(
                    "Removing volume {0} [{1}] from consideration (only 1 issue)".format(
                        match['series'], match['volume_id']))

        if len(new_list) > 0:
            self.match_list = new_list

    if len(self.match_list) == 1:
        self.log_msg(
            u"--------------------------------------------------------------------------")
        print_match(self.match_list[0])
        self.log_msg(
            u"--------------------------------------------------------------------------")
        self.search_result = self.ResultOneGoodMatch

    elif len(self.match_list) == 0:
        self.log_msg(
            u"--------------------------------------------------------------------------")
        self.log_msg("No matches found :(")
        self.log_msg(
            u"--------------------------------------------------------------------------")
        self.search_result = self.ResultNoMatches
    else:
        # we've got multiple good matches:
        self.log_msg("More than one likely candidate.")
        self.search_result = self.ResultMultipleGoodMatches
        self.log_msg(
            u"--------------------------------------------------------------------------")
        for item in self.match_list:
            print_match(item)
        self.log_msg(
            u"--------------------------------------------------------------------------")

    return self.match_list
def searchForSeries( self, series_name , callback=None, refresh_cache=False ):
    """Search Comic Vine for volumes matching *series_name*.

    The cleaned-up name is first looked up in the local ComicVineCacher
    cache (skipped when refresh_cache is True); otherwise every result
    page is fetched from the API.  When callback is None, progress is
    written to the log; otherwise callback(current, total) is invoked as
    pages arrive.

    Returns a list of volume result dicts, or None when Comic Vine
    reports a query error; fresh results are cached under the cleaned-up
    (pre-quoting) name.
    """
    def fetch_response(url):
        # Fetch and parse one page of results.  A status_code other than 1
        # means Comic Vine rejected the query: log it and return None.
        # (FIX: this logic was duplicated verbatim for page 1 and for the
        # pagination loop; extracted here.)
        response = json.loads(self.getUrlContent(url))
        if response['status_code'] != 1:
            self.writeLog( "Comic Vine query failed with error: [{0}]. \n".format( response['error'] ))
            return None
        return response

    # remove cruft from the search string
    series_name = utils.removearticles( series_name ).lower().strip()

    # before we search online, look in our cache, since we might have
    # done this same search recently
    cvc = ComicVineCacher( )
    if not refresh_cache:
        cached_search_results = cvc.get_search_results( series_name )

        if len(cached_search_results) > 0:
            return cached_search_results

    # remember the cleaned name: the cache is keyed on it, not on the
    # URL-quoted form built next
    original_series_name = series_name

    series_name = urllib.quote_plus(series_name.encode("utf-8"))
    search_url = self.api_base_url + "/search/?api_key=" + self.api_key + "&format=json&resources=volume&query=" + series_name + "&field_list=name,id,start_year,publisher,image,description,count_of_issues"

    cv_response = fetch_response(search_url + "&page=1")
    if cv_response is None:
        return None

    search_results = list()

    # pagination bookkeeping;
    # see http://api.comicvine.com/documentation/#handling_responses
    # (FIX: dropped the unused 'limit' local that was read here)
    current_result_count = cv_response['number_of_page_results']
    total_result_count = cv_response['number_of_total_results']
    if callback is None:
        self.writeLog( "Found {0} of {1} results\n".format( cv_response['number_of_page_results'], cv_response['number_of_total_results']))
    search_results.extend( cv_response['results'])
    page = 1

    if callback is not None:
        callback( current_result_count, total_result_count )

    # see if we need to keep asking for more pages...
    while ( current_result_count < total_result_count ):
        if callback is None:
            self.writeLog("getting another page of results {0} of {1}...\n".format( current_result_count, total_result_count))
        page += 1

        cv_response = fetch_response(search_url + "&page=" + str(page))
        if cv_response is None:
            return None

        search_results.extend( cv_response['results'])
        current_result_count += cv_response['number_of_page_results']
        if callback is not None:
            callback( current_result_count, total_result_count )

    # cache these search results
    cvc.add_search_results( original_series_name, search_results )

    return search_results