Ejemplo n.º 1
0
def delete_404_links():
    """Re-check download results that have not been checked for 60 days.

    Every non-deleted ``DownloadSourceResult`` whose ``last_check`` is older
    than 60 days is submitted to ``_check_download_result_existence`` on a
    three-worker thread pool, together with a map of the enabled download
    sources keyed by ``source_name``. Progress is reported through the
    ``log`` attribute attached to this function.

    An exception raised by any check is re-raised here by
    ``Future.result``.
    """
    log = delete_404_links.log

    utc_now = datetime.utcnow().replace(tzinfo=timezone.utc)
    cutoff = utc_now - timedelta(days=60)
    stale_results = DownloadSourceResult.search({
        'last_check__lt': cutoff,
        'deleted': False
    })
    log(f'{len(stale_results)} download results need to be checked')

    # Only enabled sources are handed to the checker.
    enabled_sources = {}
    for source in get_all_download_sources():
        if source.enabled:
            enabled_sources[source.source_name] = source

    with ThreadPoolExecutor(max_workers=3) as pool:
        pending = []
        for stale in stale_results:
            task = pool.submit(_check_download_result_existence, stale,
                               enabled_sources, log)
            # Carry the message on the future so it can be logged when the
            # task finishes.
            task.log_msg = f'Checking download result {stale.name}'
            pending.append(task)

        # Block until every submitted check has finished.
        for finished in concurrent.futures.as_completed(pending):
            log(finished.log_msg)
            finished.result(timeout=600)
Ejemplo n.º 2
0
    def handle(self, *args, **options):
        """Re-guess the language of every non-deleted download result.

        Walks ``DownloadSourceResult`` page by page (500 per page). For each
        result that is attached to an audiovisual record, the language is
        guessed again from the result name, with the lower-cased names of
        the record's directors, writers and stars passed as ``remove_first``.
        The result is saved only when the guessed language differs from the
        stored one.
        """
        page = 1
        page_size = 500
        has_next_page = True
        # Stop on any falsy "no more pages" marker (None, False, 0, missing
        # key). The previous ``is not None`` test looped forever on a falsy
        # marker other than None.
        while has_next_page:
            paginator = DownloadSourceResult.search({'deleted': False},
                                                    paginate=True,
                                                    page_size=page_size,
                                                    page=page)
            total_pages = paginator.get('total_pages')
            print(
                f'Checking downloads: {(page - 1) * page_size}/{page * page_size} / Page: {page}/{total_pages}'
            )
            for ds in paginator.get('results'):
                ar = ds.audiovisual_record
                if ar is None:
                    # Orphan results have no people to strip from the name.
                    continue

                people = ar.directors + ar.writers + ar.stars
                remove_first = [person['name'].lower() for person in people]

                new_lang = guess_language(ds.name, remove_first=remove_first)
                if new_lang != ds.lang:
                    ds.lang = new_lang
                    ds.save()
            has_next_page = paginator.get('next_page')
            page += 1
Ejemplo n.º 3
0
    def get_source_results(self, logger=None, sleep_between_requests=30):
        """Scrape the source page and return candidate download results.

        The page is fetched through ``self._get_http_response``. Every
        ``<a>`` element whose text has at least 4 characters and that
        carries a link becomes a ``DownloadSourceResult`` (not yet attached
        to an audiovisual record). Links that do not start with ``http`` are
        prefixed with ``self.base_url``. A candidate is dropped when the
        ratio returned by ``self._valid_result`` is below 0.8.

        :param logger: stored on ``self._logger``.
        :param sleep_between_requests: forwarded to
            ``self._get_http_response``.
        :return: ``[]`` when the response status is 404, otherwise whatever
            ``self.post_process_results`` returns for the kept candidates.
        :raises DownloadSourceException: when there is no response or the
            response has no content.
        """
        self._logger = logger

        response = self._get_http_response(sleep_between_requests)
        if response is None or response.content is None:
            raise DownloadSourceException('Response from session was None')
        if response.status_code == 404:
            return []

        candidates = []
        for anchor in HTML(html=response.content).find('a'):
            anchor_text = anchor.text
            if len(anchor_text) < 4:
                continue

            anchor_links = list(anchor.links)
            if not anchor_links or anchor_links[0] == '':
                continue
            link = anchor_links[0]

            # Detect the quality on the text with the record name removed.
            remover = RemoveAudiovisualRecordNameFromString(self._name)
            detector = VideoQualityInStringDetector(
                remover.replace_name_from_string(anchor_text))

            source_name = self.source_name
            name = anchor_text.strip()
            quality = detector.quality
            if not link.lower().startswith('http'):
                link = self.base_url + link

            candidate = DownloadSourceResult(
                source_name=source_name,
                name=name,
                link=link,
                quality=quality,
                lang=guess_language(name,
                                    default=self.language,
                                    remove_first=self._remove_first),
                audiovisual_record=None
            )

            _, ratio = self._valid_result(candidate)
            if ratio < 0.8:
                # Below the threshold: drop it without logging.
                continue
            self.log(f'??? Possible valid result {name} {link}. Ratio: {ratio}')
            candidates.append(candidate)

        return self.post_process_results(candidates)
Ejemplo n.º 4
0
def remove_download(request, object_id):
    """Delete one download result and send the user back where they came from.

    Only superusers may call this view; anyone else gets a 403 response.
    After deleting the download, its audiovisual record (when it has one)
    is flagged with ``metadata['recheck_downloads'] = True`` and saved.

    :param request: the incoming HTTP request.
    :param object_id: string form of the download's ``_id``.
    :return: a redirect to the ``Referer`` header, or to ``/`` when the
        header is missing or empty; a 403 response for non-superusers.
    """
    if not request.user.is_superuser:
        return HttpResponse(status=403)

    _id = ObjectId(object_id)
    # Keep the try body minimal: only the lookup can legitimately miss.
    try:
        download = DownloadSourceResult.search({'_id': _id})[0]
    except IndexError:
        download = None

    if download is not None:
        download.delete()
        record = download.audiovisual_record
        if record is not None:
            record.metadata['recheck_downloads'] = True
            record.save()

    # A missing header is a KeyError on a mapping, which the old
    # ``except IndexError`` never caught. The redirect also used to sit in a
    # ``finally`` block, so its ``return`` discarded any error raised while
    # deleting or saving.
    return redirect(request.META.get('HTTP_REFERER') or '/')
Ejemplo n.º 5
0
 def handle(self, *args, **options):
     """Re-detect the video quality of every non-deleted download result.

     Pages through ``DownloadSourceResult`` 500 at a time. For each result
     attached to an audiovisual record, the quality is detected again from
     the result name; the result is saved only when the detected quality
     differs from the stored one.
     """
     page_size = 500
     page = 1
     while True:
         paginator = DownloadSourceResult.search(
             {'deleted': False},
             paginate=True,
             page_size=page_size,
             page=page)

         range_start = (page - 1) * page_size
         range_end = page * page_size
         total_pages = paginator.get('total_pages')
         print(
             f'Checking downloads: {range_start}/{range_end} / Page: {page}/{total_pages}'
         )

         for download in paginator.get('results'):
             if download.audiovisual_record is None:
                 continue
             detected = VideoQualityInStringDetector(download.name).quality
             if detected == download.quality:
                 continue
             print(f'Processing {download}')
             download.quality = detected
             download.save()

         # A falsy or missing ``next_page`` means this was the last page.
         if not paginator.get('next_page', False):
             break
         page += 1
Ejemplo n.º 6
0
def _referer_filter_params(request):
    """Return the query-string parameters of the request's Referer as a dict.

    The query string is split on ``&`` and ``=`` *before* percent-decoding,
    so an encoded ``&`` or ``=`` inside a value cannot break the parse. A
    parameter without ``=`` maps to an empty string instead of discarding
    every parameter. ``+`` is left untouched (plain ``unquote``). A missing
    header or a URL without a query string yields ``{}``.
    """
    referer_uri = request.META.get('HTTP_REFERER', '')
    _, _, query = referer_uri.partition('?')
    params = {}
    for pair in query.split('&'):
        if not pair:
            continue
        key, _, value = pair.partition('=')
        params[urllib.parse.unquote(key)] = urllib.parse.unquote(value)
    return params


def details(request, slug=None):
    """Render the detail page of the audiovisual record identified by *slug*.

    :param request: the incoming HTTP request; its ``Referer`` query string
        is exposed to the template as ``filter_params``.
    :param slug: slug of the record to show.
    :return: the rendered ``web/details.html`` page, or ``web/404.html``
        with status 404 when no non-deleted record with downloads and
        fetched general information matches the slug.
    """
    get_params = _referer_filter_params(request)

    audiovisual_records = AudiovisualRecord.search({
        'deleted': False,
        'has_downloads': True,
        'general_information_fetched': True,
        'slug': slug,
    })
    if len(audiovisual_records) == 0:
        context = {'genres_names': _get_genres()}
        return render(request, 'web/404.html', status=404, context=context)

    audiovisual_record = audiovisual_records[0]

    # Give each score the URL of its page on the originating source.
    for score in audiovisual_record.scores:
        source = get_general_information_source_by_name(
            score.get('source_name'))
        detailed_pages = audiovisual_record.metadata['detailed_page']
        score['external_url'] = (
            source.base_url + detailed_pages[source.source_name])

    # Add to each person the search url to be used later in the template
    people = (audiovisual_record.directors + audiovisual_record.writers +
              audiovisual_record.stars)
    for person in people:
        person.search_url = f'/s/?ft=b&s="{person.name}"'.replace(' ', '+')

    # Related audiovisual records: other records sharing at least one star.
    # TODO: this query is a bit of a pain; it could also be extended with
    # records whose name is similar to this one ('name__simil'), excluding
    # the ids already found here.
    related_records = AudiovisualRecord.search(
        {
            'deleted': False,
            'has_downloads': True,
            'general_information_fetched': True,
            'name__neq': audiovisual_record.name,
            'stars__name__in':
            [person.name for person in audiovisual_record.stars],
        },
        page_size=10,
        page=1,
        paginate=True,
        sort_by=['-global_score']).get('results')

    # Downloads
    # TODO: this query is a real pain
    downloads = DownloadSourceResult.search(
        {
            'audiovisual_record': audiovisual_record,
            'deleted': False
        },
        sort_by='quality')

    # (language code, display name) pairs in the order they are shown.
    languages = (
        ('eng', 'English'),
        ('rus', 'Russian'),
        ('spa', 'Spanish'),
        ('deu', 'German'),
        ('fra', 'French'),
        ('ita', 'Italian'),
        ('gre', 'Greek'),
        ('pol', 'Polish'),
        ('hin', 'Hindi'),
        ('jpn', 'Japanese'),
        ('kor', 'Korean'),
    )
    # A download name is listed once across all languages; the first
    # language in display order wins. A set keeps the membership test O(1).
    names_used = set()
    lang_downloads = []
    for lang, lang_label in languages:
        unique_downloads = []
        for download in downloads:
            if download.lang == lang and download.name not in names_used:
                names_used.add(download.name)
                unique_downloads.append(download)
        if unique_downloads:
            # Show at most 10 downloads per language.
            lang_downloads.append((lang, unique_downloads[:10], lang_label))

    context = {
        'context_class': 'details',
        'is_landing': True,
        'audiovisual_record': audiovisual_record,
        'downloads': downloads,
        'lang_downloads': lang_downloads,
        'filter_params': get_params,
        'genres_names': _get_genres(),
        'qualities': VideoQualityInStringDetector.our_qualities,
        'related_records': related_records,
        'year_range': [
            str(year) for year in range(1970, datetime.utcnow().year + 1)
        ],
    }
    return render(request, 'web/details.html', context=context)
Ejemplo n.º 7
0
def _worker_get_download_links(source_class, audiovisual_record, logger):
    """Fetch download links for one record from one source and store new ones.

    Scrapes ``source_class`` for ``audiovisual_record``. On a scraping error
    the exception is logged and the function returns without marking the
    record; a ``PhantomBrowsingSession.DomainError`` additionally disables
    the source in its configuration. On success, every result that is not
    audio-only and is not already stored for this source (same link path or
    same name) is attached to the record and saved. Finally the record is
    marked as fetched for this source in ``metadata['downloads_fetch']``.

    Exceptions raised by ``_check_zero_results`` are not handled here and
    propagate to the caller.
    """
    source = source_class(audiovisual_record.name,
                          year=audiovisual_record.year)

    try:
        logger(f'get downloads links for {audiovisual_record.name}')
        results = source.get_source_results(logger=logger,
                                            sleep_between_requests=60)
        logger(f'{len(results)} for {audiovisual_record.name}')
    except DownloadSourceException as error:
        # TODO maybe increase an error counter?
        log_exception(error)
        return
    except PhantomBrowsingSession.DomainError as error:
        # The domain cannot be resolved to an IP address: disable the source.
        configuration = get_download_source_configuration(source_class)
        configuration.data['enabled'] = False
        configuration.save()
        log_exception(error)
        return
    except PhantomBrowsingSession.RemoteServerError as error:
        # Cannot connect to ports 80 / 443.
        # TODO maybe increase an error counter?
        log_exception(error)
        return

    # Keep the raw response of an empty search on disk.
    if len(results) == 0:
        last_response = source._last_response
        if last_response is not None:
            dump_path = _get_response_filename(audiovisual_record.name,
                                               source_class.source_name)
            with open(dump_path, 'wb') as dump_file:
                dump_file.write(last_response.content)

    # Tracks runs of zero-result searches; after enough of them the source
    # gets disabled.
    _check_zero_results(results, source_class, audiovisual_record, logger)

    for candidate in results:
        if candidate.quality == 'Audio':
            continue

        # Skip candidates already stored for this source, matched either by
        # link path or by name.
        link_path = urlparse(candidate.link).path
        already_stored = DownloadSourceResult.search({
            'source_name': source_class.source_name,
            'link__icontains': link_path
        })
        already_stored += DownloadSourceResult.search({
            'source_name': source_class.source_name,
            'name': candidate.name
        })
        if len(already_stored) > 0:
            continue

        candidate.audiovisual_record = audiovisual_record
        candidate.save()
        logger(f'+++ Valid result {candidate.name} {candidate.link}.')

    # Reload the record before touching its metadata.
    audiovisual_record.refresh()
    metadata = audiovisual_record.metadata
    if 'downloads_fetch' not in metadata:
        metadata['downloads_fetch'] = {}
    metadata['downloads_fetch'][source_class.source_name] = True
    audiovisual_record.save()
    logger(
        f'Marked {audiovisual_record.name} as reviewed for source {source_class.source_name}'
    )
Ejemplo n.º 8
0
def _reset_zero_results_counters(data):
    """Clear the zero-result streak stored in a source configuration dict."""
    data['zero_results_searches'] = 0
    data['audiovisual_names'] = []
    data['audiovisual_ids'] = []


def _check_zero_results(results, source_class, audiovisual_record, logger):
    """Track zero-result searches of a source and disable it when it looks broken.

    An empty ``results`` increments the source's ``zero_results_searches``
    counter (once per audiovisual record); a non-empty one resets the
    streak. When the counter exceeds 300, the search is repeated for a
    record that already has a stored download from this source. If that
    search also returns nothing, or there is no stored download to try, the
    source is disabled; otherwise it is enabled and the streak is reset.

    :param results: results of the search that was just run.
    :param source_class: download source class being checked.
    :param audiovisual_record: record the search was run for.
    :param logger: forwarded to the verification search.
    :raises DownloadSourceException: when the source configuration ends up
        (or already was) disabled.
    """
    configuration = get_download_source_configuration(source_class)

    if len(results) == 0:
        # Count each record only once towards the streak.
        if audiovisual_record.id not in configuration.data['audiovisual_ids']:
            configuration.data['zero_results_searches'] += 1
            configuration.data['audiovisual_names'].append(
                audiovisual_record.name)
            configuration.data['audiovisual_ids'].append(audiovisual_record.id)
    else:
        _reset_zero_results_counters(configuration.data)
    configuration.save()

    # After a long streak of empty searches, retry a search that is known
    # to have worked: the html structure of the site may have changed.
    if configuration.data['zero_results_searches'] > 300:
        previous_page = DownloadSourceResult.search(
            {
                'deleted': False,
                'source_name': source_class.source_name
            },
            paginate=True,
            page_size=1,
            page=1)
        # A paginated search returns a mapping whose hits live under
        # 'results'; indexing the mapping itself with [0] would fail.
        previous_results = previous_page.get('results') or []

        source_still_works = False
        if len(previous_results) > 0:
            # NOTE(review): audiovisual_record may be None for orphan
            # results - confirm whether that needs handling here.
            previous_record = previous_results[0].audiovisual_record
            ds = source_class(previous_record.name,
                              year=previous_record.year)

            people = (previous_record.directors + previous_record.writers +
                      previous_record.stars)
            remove_first = [person.name.lower() for person in people]
            results_check = ds.get_source_results(logger=logger,
                                                  remove_first=remove_first)
            source_still_works = len(results_check) > 0

        # Reload before writing: the verification search may take a while.
        configuration.refresh()
        configuration.data['enabled'] = source_still_works
        if source_still_works:
            _reset_zero_results_counters(configuration.data)
        configuration.save()

    if not configuration.data['enabled']:
        raise DownloadSourceException(
            f'Disabled {source_class.source_name} download source.')