Example #1
    def search(self, is_info=None):

        self.cookie_receipt()

        results = {}
        verified_matches = []
        try:
            reversed_order = True
            total_pages = 1
            pagenumber = 1
            if is_info is not None:
                if is_info['chktpb'] == 0:
                    logger.debug('removing query from loop that accounts for no issue number')
                else:
                    self.search_format.insert(0, self.query['comicname'])
                    logger.debug('setting no issue number query to be first due to no issue number')

            if mylar.CONFIG.PACK_PRIORITY:
                # insert the name+year query ('%s %s') first so that packs
                # will (hopefully) get searched for before single issues
                self.search_format.insert(0, '%s %s' % (self.query['comicname'], self.query['year']))

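            # each entry in search_format is a printf-style template; the number
            # of '%s' placeholders determines how the queryline is built below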
            for sf in self.search_format:
                resultset = []
                verified_matches = []
                sf_issue = self.query['issue']
                if is_info is not None and is_info['chktpb'] == 1 and self.query['comicname'] == sf:
                    # bare series-name (tpb) query: strip punctuation and filler words
                    comicname = re.sub(r'[\&\:\?\,\/\-]', '', self.query['comicname'])
                    comicname = re.sub(r'\band\b', '', comicname, flags=re.I)
                    comicname = re.sub(r'\bthe\b', '', comicname, flags=re.I)
                    queryline = re.sub(r'\s+', ' ', comicname)
                else:
                    if any([self.query['issue'] == 'None', self.query['issue'] is None]):
                        sf_issue = None
                    if sf.count('%s') == 3:
                        if sf == self.search_format[1]:
                            #don't modify the query that is wrapped in quotation marks.
                            if any([r'/' in self.query['comicname'], r':' in self.query['comicname']]):
                                self.query['comicname'] = re.sub(r'[/:]', ' ', self.query['comicname'])
                                self.query['comicname'] = re.sub(r'\s+', ' ', self.query['comicname'])
                        if sf_issue is None:
                            splits = sf.split(' ')
                            splits.pop(1)
                            queryline = ' '.join(splits) % (self.query['comicname'], self.query['year'])
                        else:
                            queryline = sf % (self.query['comicname'], sf_issue, self.query['year'])
                    else:
                        #logger.fdebug('[%s] self.search_format: %s' % (len(self.search_format), sf))
                        if len(self.search_format) == 5 and sf == self.search_format[4]:
                            splits = sf.split(' ')
                            splits.pop(1)
                            queryline = ' '.join(splits) % (self.query['comicname'])
                        else:
                            sf_count = sf.count('%s')
                            if sf_count == 0:
                                # this is the injected search format above that's already replaced values
                                queryline = sf
                            elif sf_count == 2:
                                queryline = sf % (self.query['comicname'], sf_issue)
                            elif sf_count == 3:
                                queryline = sf % (self.query['comicname'], sf_issue, self.query['year'])
                            else:
                                queryline = sf % (self.query['comicname'])

                logger.fdebug('[DDL-QUERY] Query set to: %s' % queryline)
                pause_the_search = mylar.CONFIG.DDL_QUERY_DELAY
                # only rate-limit the search queries - the other calls are direct and not as intensive
                diff = mylar.search.check_time(self.provider_stat['lastrun'])
                if diff < pause_the_search:
                    logger.warn('[PROVIDER-SEARCH-DELAY][DDL] Waiting %s seconds before we search again...' % (pause_the_search - int(diff)))
                    time.sleep(pause_the_search - int(diff))
                else:
                    logger.fdebug('[PROVIDER-SEARCH-DELAY][DDL] Last search took place %s seconds ago. We\'re clear...' % (int(diff)))

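                # fetch the results page (a /page/N url past page 1) and stream
                # the response body to a local file for parsing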
                if queryline:
                    gc_url = self.url
                    if pagenumber != 1 and pagenumber != total_pages:
                        gc_url = '%s/page/%s' % (self.url, pagenumber)
                        logger.fdebug('parsing for page %s' % pagenumber)
                    #logger.fdebug('session cookies: %s' % (self.session.cookies,))
                    t = self.session.get(
                        gc_url + '/',
                        params={'s': queryline},
                        verify=True,
                        headers=self.headers,
                        stream=True,
                        timeout=30,
                    )

                    # record this hit so the next query honours the search delay
                    write_time = time.time()
                    mylar.search.last_run_check(write={'DDL(GetComics)': {'id': 200, 'active': True, 'lastrun': write_time, 'type': 'DDL', 'hits': self.provider_stat['hits']+1}})
                    self.provider_stat['lastrun'] = write_time

                    with open(self.local_filename, 'wb') as f:
                        for chunk in t.iter_content(chunk_size=1024):
                            if chunk:  # filter out keep-alive new chunks
                                f.write(chunk)
                                f.flush()

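                # walk the parsed entries, tracking pagination info and skipping
                # duplicate links and unrequested 'Weekly' packs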
                for x in self.search_results(pagenumber, total_pages)['entries']:
                    if total_pages != 1:
                        total_pages = x['total_pages']
                    if pagenumber != 1:
                        pagenumber = x['page']
                    bb = next((item for item in resultset if item['link'] == x['link']), None)
                    try:
                        if 'Weekly' not in self.query['comicname'] and 'Weekly' in x['title']:
                            continue
                        elif bb is None:
                            resultset.append(x)
                    except Exception:
                        # if the title comparison fails for any reason, keep the result
                        resultset.append(x)

                logger.info('resultset: %s' % (resultset,))
                if len(resultset) >= 1:
                    results['entries'] = resultset
                    sfs = search_filer.search_check()
                    verified_matches = sfs.checker(results, is_info)
                    if verified_matches:
                        logger.fdebug('verified_matches: %s' % (verified_matches,))
                        break
                logger.fdebug('sleeping for %ss' % mylar.CONFIG.DDL_QUERY_DELAY)
                time.sleep(mylar.CONFIG.DDL_QUERY_DELAY)

        except requests.exceptions.Timeout as e:
            logger.warn(
                'Timeout occurred fetching data from DDL: %s' % e
            )
            return 'no results'
        except requests.exceptions.ConnectionError as e:
            logger.warn(
                '[WARNING] Connection refused to DDL site, stopped by a small tank.'
                ' Error returned as : %s' % e
            )
            # check the underlying errno (when present) before disabling the provider
            if getattr(e, 'errno', None) in (
                errno.ETIMEDOUT,
                errno.ECONNREFUSED,
                errno.EHOSTDOWN,
                errno.EHOSTUNREACH,
            ):
                helpers.disable_provider('DDL', 'Connection Refused.')
            return 'no results'
        except Exception as err:
            logger.warn(
                '[WARNING] Unable to scrape remote site, stopped by a small tank.'
                ' Error returned as : %s' % err
            )
            if 'Unable to identify Cloudflare IUAM' in str(err):
                helpers.disable_provider(
                    'DDL', 'Unable to identify Cloudflare IUAM Javascript on website'
                )

            # since we're capturing exceptions here, searches from the search module
            # won't get captured. So we need to do this so they get tracked.
            exc_type, exc_value, exc_tb = sys.exc_info()
            filename, line_num, func_name, err_text = traceback.extract_tb(
                exc_tb
            )[-1]
            tracebackline = traceback.format_exc()

            except_line = {
                'exc_type': exc_type,
                'exc_value': exc_value,
                'exc_tb': exc_tb,
                'filename': filename,
                'line_num': line_num,
                'func_name': func_name,
                'err': str(err),
                'err_text': err_text,
                'traceback': tracebackline,
                'comicname': None,
                'issuenumber': None,
                'seriesyear': None,
                'issueid': self.issueid,
                'comicid': self.comicid,
                'mode': None,
                'booktype': None,
            }

            helpers.log_that_exception(except_line)

            return 'no results'
        else:
            # with PACK_PRIORITY enabled, pack results sort to the front
            return sorted(verified_matches, key=itemgetter('pack'),
                          reverse=mylar.CONFIG.PACK_PRIORITY is True)
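
A note on the queryline dispatch above: each search format is a printf-style
template, and the branch taken depends on how many '%s' placeholders survive.
Below is a minimal, self-contained sketch of that idea - build_queryline is a
hypothetical helper, and the format strings and values are illustrative, not
Mylar's actual search formats:

    def build_queryline(sf, comicname, issue=None, year=None):
        # with no issue number, drop the issue placeholder (assumed to be the
        # second whitespace-separated token, as in the code above)
        if issue is None and sf.count('%s') > 1:
            splits = sf.split(' ')
            splits.pop(1)
            sf = ' '.join(splits)
        count = sf.count('%s')
        if count == 0:
            return sf                    # injected format, already substituted
        elif count == 1:
            return sf % (comicname,)
        elif count == 2:
            return sf % ((comicname, year) if issue is None else (comicname, issue))
        return sf % (comicname, issue, year)

    print(build_queryline('%s %s (%s)', 'Saga', '001', 2012))  # Saga 001 (2012)
    print(build_queryline('%s %s (%s)', 'Saga', None, 2012))   # Saga (2012)
    print(build_queryline('%s %s', 'Saga', '001'))             # Saga 001
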
Example #2
    def search(self):

        try:
            # keep the scraper session open for both the token grab and the search
            with cfscrape.create_scraper() as s:
                cf_cookievalue, cf_user_agent = s.get_tokens(
                    self.url, headers=self.headers)

                t = s.get(
                    self.url + '/',
                    params={'s': self.query},
                    verify=True,
                    cookies=cf_cookievalue,
                    headers=self.headers,
                    stream=True,
                    timeout=30,
                )

                with open(self.local_filename, 'wb') as f:
                    for chunk in t.iter_content(chunk_size=1024):
                        if chunk:  # filter out keep-alive new chunks
                            f.write(chunk)
                            f.flush()

        except requests.exceptions.Timeout as e:
            logger.warn('Timeout occurred fetching data from DDL: %s' % e)
            return 'no results'
        except requests.exceptions.ConnectionError as e:
            logger.warn(
                '[WARNING] Connection refused to DDL site, stopped by a small tank.'
                ' Error returned as : %s' % e)
            # check the underlying errno (when present) before disabling the provider
            if getattr(e, 'errno', None) in (
                    errno.ETIMEDOUT,
                    errno.ECONNREFUSED,
                    errno.EHOSTDOWN,
                    errno.EHOSTUNREACH,
            ):
                helpers.disable_provider('DDL', 'Connection Refused.')
            return 'no results'
        except Exception as err:
            logger.warn(
                '[WARNING] Unable to scrape remote site, stopped by a small tank.'
                ' Error returned as : %s' % err)
            if 'Unable to identify Cloudflare IUAM' in str(err):
                helpers.disable_provider(
                    'DDL',
                    'Unable to identify Cloudflare IUAM Javascript on website')

            # since we're capturing exceptions here, searches from the search module
            # won't get captured. So we need to do this so they get tracked.
            exc_type, exc_value, exc_tb = sys.exc_info()
            filename, line_num, func_name, err_text = traceback.extract_tb(
                exc_tb)[-1]
            tracebackline = traceback.format_exc()

            except_line = {
                'exc_type': exc_type,
                'exc_value': exc_value,
                'exc_tb': exc_tb,
                'filename': filename,
                'line_num': line_num,
                'func_name': func_name,
                'err': str(err),
                'err_text': err_text,
                'traceback': tracebackline,
                'comicname': None,
                'issuenumber': None,
                'seriesyear': None,
                'issueid': self.issueid,
                'comicid': self.comicid,
                'mode': None,
                'booktype': None,
            }

            helpers.log_that_exception(except_line)

            return 'no results'
        else:
            return self.search_results()
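
For reference, the token-then-fetch flow above in isolation - a minimal sketch
assuming the cfscrape package; url, dest and query here are placeholders:

    import cfscrape

    def fetch_search_page(url, dest, query):
        # create_scraper() returns a requests.Session subclass that transparently
        # solves Cloudflare's IUAM javascript challenge
        with cfscrape.create_scraper() as s:
            # get_tokens() primes the clearance cookies; passing them back on the
            # follow-up request avoids re-solving the challenge
            cookies, user_agent = s.get_tokens(url)
            r = s.get(url + '/', params={'s': query},
                      cookies=cookies, stream=True, timeout=30)
            # stream to disk in 1 KiB chunks rather than buffering the whole body
            with open(dest, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive chunks
                        f.write(chunk)

    # usage (placeholder values):
    # fetch_search_page('https://example.invalid', '/tmp/results.html', 'saga 001')
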
Example #3
    def search(self):
        results = {}
        resultset = []
        try:
            with cfscrape.create_scraper() as s:
                cf_cookievalue, cf_user_agent = s.get_tokens(
                    self.url, headers=self.headers)

                for sf in self.search_format:
                    sf_issue = self.query['issue']
                    if any([
                            self.query['issue'] == 'None',
                            self.query['issue'] is None
                    ]):
                        sf_issue = None
                    if sf.count('%s') == 3:
                        if sf == self.search_format[1]:
                            #don't modify the query that is wrapped in quotation marks.
                            if any([r'/' in self.query['comicname'],
                                    r':' in self.query['comicname']]):
                                self.query['comicname'] = re.sub(
                                    r'[/:]', ' ', self.query['comicname'])
                                self.query['comicname'] = re.sub(
                                    r'\s+', ' ', self.query['comicname'])
                        if sf_issue is None:
                            splits = sf.split(' ')
                            splits.pop(1)
                            queryline = ' '.join(splits) % (
                                self.query['comicname'], self.query['year'])
                        else:
                            queryline = sf % (self.query['comicname'],
                                              sf_issue, self.query['year'])
                    else:
                        if sf_issue is None:
                            splits = sf.split(' ')
                            splits.pop(1)
                            queryline = ' '.join(splits) % (
                                self.query['comicname'])
                        else:
                            queryline = sf % (self.query['comicname'],
                                              sf_issue)

                    logger.fdebug('[DDL-QUERY] Query set to: %s' % queryline)

                    t = s.get(
                        self.url + '/',
                        params={'s': queryline},
                        verify=True,
                        cookies=cf_cookievalue,
                        headers=self.headers,
                        stream=True,
                        timeout=30,
                    )

                    with open(self.local_filename, 'wb') as f:
                        for chunk in t.iter_content(chunk_size=1024):
                            if chunk:  # filter out keep-alive new chunks
                                f.write(chunk)
                                f.flush()

                    for x in self.search_results()['entries']:
                        bb = next((item for item in resultset
                                   if item['link'] == x['link']), None)
                        try:
                            if ('Weekly' not in self.query['comicname']
                                    and 'Weekly' in x['title']):
                                continue
                            elif bb is None:
                                resultset.append(x)
                        except Exception:
                            # keep the result if the title comparison fails
                            resultset.append(x)

                    if len(resultset) > 1:
                        break
                    time.sleep(2)

        except requests.exceptions.Timeout as e:
            logger.warn('Timeout occurred fetching data from DDL: %s' % e)
            return 'no results'
        except requests.exceptions.ConnectionError as e:
            logger.warn(
                '[WARNING] Connection refused to DDL site, stopped by a small tank.'
                ' Error returned as : %s' % e)
            # check the underlying errno (when present) before disabling the provider
            if getattr(e, 'errno', None) in (
                    errno.ETIMEDOUT,
                    errno.ECONNREFUSED,
                    errno.EHOSTDOWN,
                    errno.EHOSTUNREACH,
            ):
                helpers.disable_provider('DDL', 'Connection Refused.')
            return 'no results'
        except Exception as err:
            logger.warn(
                '[WARNING] Unable to scrape remote site, stopped by a small tank.'
                ' Error returned as : %s' % err)
            if 'Unable to identify Cloudflare IUAM' in str(err):
                helpers.disable_provider(
                    'DDL',
                    'Unable to identify Cloudflare IUAM Javascript on website')

            # since we're capturing exceptions here, searches from the search module
            # won't get captured. So we need to do this so they get tracked.
            exc_type, exc_value, exc_tb = sys.exc_info()
            filename, line_num, func_name, err_text = traceback.extract_tb(
                exc_tb)[-1]
            tracebackline = traceback.format_exc()

            except_line = {
                'exc_type': exc_type,
                'exc_value': exc_value,
                'exc_tb': exc_tb,
                'filename': filename,
                'line_num': line_num,
                'func_name': func_name,
                'err': str(err),
                'err_text': err_text,
                'traceback': tracebackline,
                'comicname': None,
                'issuenumber': None,
                'seriesyear': None,
                'issueid': self.issueid,
                'comicid': self.comicid,
                'mode': None,
                'booktype': None,
            }

            helpers.log_that_exception(except_line)

            return 'no results'
        else:
            results['entries'] = resultset
            return results
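
All three variants dedupe results via next() with a default over the list built
so far. The same idea as a standalone sketch - the entry dicts are made-up
sample data:

    def dedupe_by_link(entries):
        # keep the first entry seen for each 'link', preserving order
        resultset = []
        for x in entries:
            bb = next((item for item in resultset if item['link'] == x['link']), None)
            if bb is None:
                resultset.append(x)
        return resultset

    entries = [
        {'title': 'Saga #1', 'link': 'https://example.invalid/a'},
        {'title': 'Saga #1 (repost)', 'link': 'https://example.invalid/a'},
        {'title': 'Saga #2', 'link': 'https://example.invalid/b'},
    ]
    assert [e['title'] for e in dedupe_by_link(entries)] == ['Saga #1', 'Saga #2']

For large result sets a set of seen links would avoid the linear scan per entry,
but at typical page sizes the next() scan is fine.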