def search(self, is_info=None):
    self.cookie_receipt()
    results = {}
    try:
        reversed_order = True
        total_pages = 1
        pagenumber = 1
        if is_info is not None:
            if is_info['chktpb'] == 0:
                logger.debug('removing query from loop that accounts for no issue number')
            else:
                self.search_format.insert(0, self.query['comicname'])
                logger.debug('setting no issue number query to be first due to no issue number')
        if mylar.CONFIG.PACK_PRIORITY:
            #t_sf = self.search_format.pop(len(self.search_format)-1)  #pop the last search query ('%s %s')
            #add it in 1st so that packs will get searched for (hopefully first)
            self.search_format.insert(0, '%s %s' % (self.query['comicname'], self.query['year']))
        for sf in self.search_format:
            resultset = []
            verified_matches = []
            sf_issue = self.query['issue']
            if is_info is not None and is_info['chktpb'] == 1 and self.query['comicname'] == sf:
                # no-issue-number query: strip punctuation and joining words from the series name
                comicname = re.sub(r'[\&\:\?\,\/\-]', '', self.query['comicname'])
                comicname = re.sub(r"\band\b", '', comicname, flags=re.I)
                comicname = re.sub(r"\bthe\b", '', comicname, flags=re.I)
                queryline = re.sub(r'\s+', ' ', comicname)
            else:
                if any([self.query['issue'] == 'None', self.query['issue'] is None]):
                    sf_issue = None
                if sf.count('%s') == 3:
                    if sf == self.search_format[1]:
                        #don't modify the specific query that is around quotation marks.
                        if any([r'/' in self.query['comicname'], r':' in self.query['comicname']]):
                            self.query['comicname'] = re.sub(r'[/|:]', ' ', self.query['comicname'])
                            self.query['comicname'] = re.sub(r'\s+', ' ', self.query['comicname'])
                    if sf_issue is None:
                        splits = sf.split(' ')
                        splits.pop(1)
                        queryline = ' '.join(splits) % (self.query['comicname'], self.query['year'])
                    else:
                        queryline = sf % (self.query['comicname'], sf_issue, self.query['year'])
                else:
                    #logger.fdebug('[%s] self.search_format: %s' % (len(self.search_format), sf))
                    if len(self.search_format) == 5 and sf == self.search_format[4]:
                        splits = sf.split(' ')
                        splits.pop(1)
                        queryline = ' '.join(splits) % (self.query['comicname'])
                    else:
                        sf_count = len([m.start() for m in re.finditer('(?=%s)', sf)])
                        if sf_count == 0:
                            # this is the injected search format above that's already replaced values
                            queryline = sf
                        elif sf_count == 2:
                            queryline = sf % (self.query['comicname'], sf_issue)
                        elif sf_count == 3:
                            queryline = sf % (self.query['comicname'], sf_issue, self.query['year'])
                        else:
                            queryline = sf % (self.query['comicname'])
            logger.fdebug('[DDL-QUERY] Query set to: %s' % queryline)
            pause_the_search = mylar.CONFIG.DDL_QUERY_DELAY  #mylar.search.check_the_search_delay()
            diff = mylar.search.check_time(self.provider_stat['lastrun'])
            # only limit the search queries - the other calls should be direct and not as intensive
            if diff < pause_the_search:
                logger.warn('[PROVIDER-SEARCH-DELAY][DDL] Waiting %s seconds before we search again...'
                            % (pause_the_search - int(diff)))
                time.sleep(pause_the_search - int(diff))
            else:
                logger.fdebug('[PROVIDER-SEARCH-DELAY][DDL] Last search took place %s seconds ago. We\'re clear...'
                              % (int(diff)))
            if queryline:
                gc_url = self.url
                if pagenumber != 1 and pagenumber != total_pages:
                    gc_url = '%s/page/%s' % (self.url, pagenumber)
                    logger.fdebug('parsing for page %s' % pagenumber)
                #logger.fdebug('session cookies: %s' % (self.session.cookies,))
                t = self.session.get(
                    gc_url + '/',
                    params={'s': queryline},
                    verify=True,
                    headers=self.headers,
                    stream=True,
                    timeout=30,
                )
                write_time = time.time()
                mylar.search.last_run_check(
                    write={'DDL(GetComics)': {'id': 200,
                                              'active': True,
                                              'lastrun': write_time,
                                              'type': 'DDL',
                                              'hits': self.provider_stat['hits'] + 1}}
                )
                self.provider_stat['lastrun'] = write_time
                with open(self.local_filename, 'wb') as f:
                    for chunk in t.iter_content(chunk_size=1024):
                        if chunk:  # filter out keep-alive new chunks
                            f.write(chunk)
                            f.flush()
                for x in self.search_results(pagenumber, total_pages)['entries']:
                    if total_pages != 1:
                        total_pages = x['total_pages']
                    if pagenumber != 1:
                        pagenumber = x['page']
                    bb = next((item for item in resultset if item['link'] == x['link']), None)
                    try:
                        if 'Weekly' not in self.query['comicname'] and 'Weekly' in x['title']:
                            continue
                        elif bb is None:
                            resultset.append(x)
                    except Exception as e:
                        resultset.append(x)
                    else:
                        continue
                logger.info('resultset: %s' % (resultset,))
                if len(resultset) >= 1:
                    results['entries'] = resultset
                    sfs = search_filer.search_check()
                    verified_matches = sfs.checker(results, is_info)
                    if verified_matches:
                        logger.fdebug('verified_matches: %s' % (verified_matches,))
                        break
            logger.fdebug('sleep...%s%s' % (mylar.CONFIG.DDL_QUERY_DELAY, 's'))
            time.sleep(mylar.CONFIG.DDL_QUERY_DELAY)
    except requests.exceptions.Timeout as e:
        logger.warn('Timeout occurred fetching data from DDL: %s' % e)
        return 'no results'
    except requests.exceptions.ConnectionError as e:
        logger.warn(
            '[WARNING] Connection refused to DDL site, stopped by a small tank.'
            ' Error returned as : %s' % e
        )
        if any(
            [
                errno.ETIMEDOUT,
                errno.ECONNREFUSED,
                errno.EHOSTDOWN,
                errno.EHOSTUNREACH,
            ]
        ):
            helpers.disable_provider('DDL', 'Connection Refused.')
        return 'no results'
    except Exception as err:
        logger.warn(
            '[WARNING] Unable to scrape remote site, stopped by a small tank.'
            ' Error returned as : %s' % err
        )
        if 'Unable to identify Cloudflare IUAM' in str(err):
            helpers.disable_provider(
                'DDL', 'Unable to identify Cloudflare IUAM Javascript on website'
            )
        # since we're capturing exceptions here, searches from the search module
        # won't get captured. So we need to do this so they get tracked.
        exc_type, exc_value, exc_tb = sys.exc_info()
        filename, line_num, func_name, err_text = traceback.extract_tb(exc_tb)[-1]
        tracebackline = traceback.format_exc()
        except_line = {
            'exc_type': exc_type,
            'exc_value': exc_value,
            'exc_tb': exc_tb,
            'filename': filename,
            'line_num': line_num,
            'func_name': func_name,
            'err': str(err),
            'err_text': err_text,
            'traceback': tracebackline,
            'comicname': None,
            'issuenumber': None,
            'seriesyear': None,
            'issueid': self.issueid,
            'comicid': self.comicid,
            'mode': None,
            'booktype': None,
        }
        helpers.log_that_exception(except_line)
        return 'no results'
    else:
        if mylar.CONFIG.PACK_PRIORITY is True:
            #logger.fdebug('[PACK_PRIORITY:True] %s' % (sorted(verified_matches, key=itemgetter('pack'), reverse=True)))
            return sorted(verified_matches, key=itemgetter('pack'), reverse=True)
        else:
            #logger.fdebug('[PACK_PRIORITY:False] %s' % (sorted(verified_matches, key=itemgetter('pack'), reverse=False)))
            return sorted(verified_matches, key=itemgetter('pack'), reverse=False)
def search(self):
    try:
        with cfscrape.create_scraper() as s:
            cf_cookievalue, cf_user_agent = s.get_tokens(self.url, headers=self.headers)
            t = s.get(
                self.url + '/',
                params={'s': self.query},
                verify=True,
                cookies=cf_cookievalue,
                headers=self.headers,
                stream=True,
                timeout=30,
            )
            with open(self.local_filename, 'wb') as f:
                for chunk in t.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
                        f.flush()
    except requests.exceptions.Timeout as e:
        logger.warn('Timeout occurred fetching data from DDL: %s' % e)
        return 'no results'
    except requests.exceptions.ConnectionError as e:
        logger.warn(
            '[WARNING] Connection refused to DDL site, stopped by a small tank.'
            ' Error returned as : %s' % e)
        if any([
                errno.ETIMEDOUT,
                errno.ECONNREFUSED,
                errno.EHOSTDOWN,
                errno.EHOSTUNREACH,
        ]):
            helpers.disable_provider('DDL', 'Connection Refused.')
        return 'no results'
    except Exception as err:
        logger.warn(
            '[WARNING] Unable to scrape remote site, stopped by a small tank.'
            ' Error returned as : %s' % err)
        if 'Unable to identify Cloudflare IUAM' in str(err):
            helpers.disable_provider(
                'DDL',
                'Unable to identify Cloudflare IUAM Javascript on website')
        # since we're capturing exceptions here, searches from the search module
        # won't get captured. So we need to do this so they get tracked.
        exc_type, exc_value, exc_tb = sys.exc_info()
        filename, line_num, func_name, err_text = traceback.extract_tb(exc_tb)[-1]
        tracebackline = traceback.format_exc()
        except_line = {
            'exc_type': exc_type,
            'exc_value': exc_value,
            'exc_tb': exc_tb,
            'filename': filename,
            'line_num': line_num,
            'func_name': func_name,
            'err': str(err),
            'err_text': err_text,
            'traceback': tracebackline,
            'comicname': None,
            'issuenumber': None,
            'seriesyear': None,
            'issueid': self.issueid,
            'comicid': self.comicid,
            'mode': None,
            'booktype': None,
        }
        helpers.log_that_exception(except_line)
        return 'no results'
    else:
        return self.search_results()
def search(self):
    results = {}
    resultset = []
    try:
        with cfscrape.create_scraper() as s:
            cf_cookievalue, cf_user_agent = s.get_tokens(self.url, headers=self.headers)
            for sf in self.search_format:
                sf_issue = self.query['issue']
                if any([self.query['issue'] == 'None', self.query['issue'] is None]):
                    sf_issue = None
                if sf.count('%s') == 3:
                    if sf == self.search_format[1]:
                        #don't modify the specific query that is around quotation marks.
                        if any([r'/' in self.query['comicname'], r':' in self.query['comicname']]):
                            self.query['comicname'] = re.sub(r'[/|:]', ' ', self.query['comicname'])
                            self.query['comicname'] = re.sub(r'\s+', ' ', self.query['comicname'])
                    if sf_issue is None:
                        splits = sf.split(' ')
                        splits.pop(1)
                        queryline = ' '.join(splits) % (self.query['comicname'], self.query['year'])
                    else:
                        queryline = sf % (self.query['comicname'], sf_issue, self.query['year'])
                else:
                    if sf_issue is None:
                        splits = sf.split(' ')
                        splits.pop(1)
                        queryline = ' '.join(splits) % (self.query['comicname'])
                    else:
                        queryline = sf % (self.query['comicname'], sf_issue)
                logger.fdebug('[DDL-QUERY] Query set to: %s' % queryline)
                t = s.get(
                    self.url + '/',
                    params={'s': queryline},
                    verify=True,
                    cookies=cf_cookievalue,
                    headers=self.headers,
                    stream=True,
                    timeout=30,
                )
                with open(self.local_filename, 'wb') as f:
                    for chunk in t.iter_content(chunk_size=1024):
                        if chunk:  # filter out keep-alive new chunks
                            f.write(chunk)
                            f.flush()
                for x in self.search_results()['entries']:
                    bb = next((item for item in resultset if item['link'] == x['link']), None)
                    try:
                        if 'Weekly' not in self.query['comicname'] and 'Weekly' in x['title']:
                            continue
                        elif bb is None:
                            resultset.append(x)
                    except Exception:
                        resultset.append(x)
                    else:
                        continue
                if len(resultset) > 1:
                    break
                time.sleep(2)
    except requests.exceptions.Timeout as e:
        logger.warn('Timeout occurred fetching data from DDL: %s' % e)
        return 'no results'
    except requests.exceptions.ConnectionError as e:
        logger.warn(
            '[WARNING] Connection refused to DDL site, stopped by a small tank.'
            ' Error returned as : %s' % e)
        if any([
                errno.ETIMEDOUT,
                errno.ECONNREFUSED,
                errno.EHOSTDOWN,
                errno.EHOSTUNREACH,
        ]):
            helpers.disable_provider('DDL', 'Connection Refused.')
        return 'no results'
    except Exception as err:
        logger.warn(
            '[WARNING] Unable to scrape remote site, stopped by a small tank.'
            ' Error returned as : %s' % err)
        if 'Unable to identify Cloudflare IUAM' in str(err):
            helpers.disable_provider(
                'DDL',
                'Unable to identify Cloudflare IUAM Javascript on website')
        # since we're capturing exceptions here, searches from the search module
        # won't get captured. So we need to do this so they get tracked.
        exc_type, exc_value, exc_tb = sys.exc_info()
        filename, line_num, func_name, err_text = traceback.extract_tb(exc_tb)[-1]
        tracebackline = traceback.format_exc()
        except_line = {
            'exc_type': exc_type,
            'exc_value': exc_value,
            'exc_tb': exc_tb,
            'filename': filename,
            'line_num': line_num,
            'func_name': func_name,
            'err': str(err),
            'err_text': err_text,
            'traceback': tracebackline,
            'comicname': None,
            'issuenumber': None,
            'seriesyear': None,
            'issueid': self.issueid,
            'comicid': self.comicid,
            'mode': None,
            'booktype': None,
        }
        helpers.log_that_exception(except_line)
        return 'no results'
    else:
        results['entries'] = resultset
        return results
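
# Illustrative caller sketch only. The class name (`GetComicsProvider`) and its constructor
# are assumptions for demonstration and are not defined here; what *is* grounded in the
# methods above is the shape of `self.query` for the format-driven variants (a dict with
# 'comicname', 'issue', and 'year' keys) and the return contract: the string 'no results'
# on any failure path (timeout, connection refused, Cloudflare), otherwise a dict of
# entries or a pack-sorted list of verified matches.
#
#   provider = GetComicsProvider(                                   # hypothetical constructor
#       query={'comicname': 'Example Series', 'issue': '12', 'year': '2020'},
#   )
#   matches = provider.search()
#   if matches == 'no results':
#       pass  # failure path: nothing usable was returned
#   else:
#       pass  # matches['entries'] (or the sorted verified_matches list) feeds the caller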