Example no. 1
0
    def _index_files(self, decisions):
        """Download and extract the text of every file attached to each
        decision, storing the result in decision['decText'].

        A DocumentText row is created (or reused) per (decId, url) pair;
        already fully-indexed documents are not downloaded again.
        """
        for decision in decisions:
            dec_id = decision['decId']
            urls = decision.get('decFileUrls', [])
            text_parts = []

            if not urls:
                # Nothing to download: still record the decision so it
                # shows up as indexed.
                doc, _ = DocumentText.objects.get_or_create(
                    doc_id=dec_id, doc_type=COP_DECISION, url=None)
                doc.status = DocumentText.INDEXED
                doc.save()

            for url in urls:
                doc, _ = DocumentText.objects.get_or_create(
                    doc_id=dec_id, doc_type=COP_DECISION, url=url)

                if doc.status == DocumentText.FULL_INDEXED:
                    # Text already extracted on a previous run; reuse it.
                    text_parts.append(doc.text)
                    logger.info('Already indexed %s' % url)
                    continue

                logger.info('Downloading: %s' % url)
                file_obj = get_file_from_url(url)
                if file_obj:
                    logger.debug('Success downloading %s' % url)
                    doc.text = self.solr.extract(file_obj)
                    text_parts.append(doc.text)
                    doc.status = DocumentText.FULL_INDEXED
                    doc.doc_size = file_obj.getbuffer().nbytes
                    doc.save()
                    logger.info('Success extracting %s' % dec_id)
                else:
                    # Download failed; remember the attempt but leave the
                    # record only partially indexed.
                    logger.error('Error on file download %s' % dec_id)
                    doc.status = DocumentText.INDEXED
                    doc.save()

            decision['decText'] = ''.join(text_parts)
Example no. 2
0
    def _index_files(self, treaties):
        """Download every file referenced by each treaty and concatenate
        the extracted text into treaty['trText'].

        A treaty is reported as a success (_document_text_pdf_success)
        only when every one of its files was downloaded and extracted;
        any failure flags it via _document_text_pdf_error instead.
        """
        for treaty in treaties:
            full_index = True
            treaty['trText'] = ''

            for field in URL_FIELDS:
                urls = treaty.get(FIELD_MAP[field], [])
                for url in urls:
                    logger.info('Downloading: %s' % url)
                    file_obj = get_file_from_url(url)
                    if not file_obj:
                        # Download failed
                        full_index = False
                        self._document_text_pdf_error(treaty, url)
                        continue
                    # Download successful
                    try:
                        treaty['trText'] += self.solr.extract(file_obj)
                    except Exception:
                        # SOLR error at pdf extraction. Narrowed from a
                        # bare `except:` so KeyboardInterrupt/SystemExit
                        # still propagate.
                        full_index = False
                        self._document_text_pdf_error(treaty, url)
                        logger.error('Error extracting from doc %s' %
                                     treaty['trElisId'])

            if full_index:
                logger.info('Success on file download %s' % treaty['trElisId'])
                self._document_text_pdf_success(treaty)
Example no. 3
0
    def update_full_text(self):
        """Re-index the full text of treaties whose DocumentText records
        are still in the INDEXED state.

        For each record: download every attached file, extract its text
        with solr, merge it into the stored parsed treaty data and push
        the document to solr. Only completely successful records are
        promoted to FULL_INDEXED (and their parsed_data cleared).
        """
        logger.info('[Treaty] Update full text started.')
        objs = DocumentText.objects.filter(status=DocumentText.INDEXED,
                                           doc_type=TREATY)
        for obj in objs:
            treaty_data = json.loads(obj.parsed_data)
            treaty_data['trText'] = ''
            full_index = True

            for field in URL_FIELDS:
                urls = treaty_data.get(FIELD_MAP[field], [])
                for url in urls:
                    logger.info('Downloading: %s' % url)
                    file_obj = get_file_from_url(url)
                    if file_obj:
                        # Download successful
                        try:
                            treaty_data['trText'] += self.solr.extract(
                                file_obj)
                        except Exception:
                            # SOLR error at pdf extraction. Narrowed from
                            # a bare `except:` so KeyboardInterrupt and
                            # SystemExit still propagate.
                            full_index = False
                            obj.save()
                            logger.error('Error extracting from doc %s' %
                                         obj.doc_id)
                    else:
                        # Download failed
                        full_index = False
                        obj.save()
                        logger.error('Error downloading url from doc %s' %
                                     obj.doc_id)
            # Reuse the existing solr document id, if one exists, so the
            # add below updates rather than duplicates.
            try:
                treaty = self.solr.search(TREATY, obj.doc_id)
                if treaty:
                    treaty_data['id'] = treaty['id']
            except SolrError as e:
                logger.error('Error reading treaty %s' % obj.doc_id)
                if settings.DEBUG:
                    logging.getLogger('solr').exception(e)
                continue

            if full_index:
                resp = self.solr.add(treaty_data)
                logger.info('Insert on %s' % (treaty_data['trElisId']))
                if resp:
                    obj.status = DocumentText.FULL_INDEXED
                    obj.parsed_data = ''
                    obj.save()
        logger.info('[Treaty] Update full text finished.')
Example no. 4
0
def extract_text(solr, dec_id, urls):
    """Return the concatenated text of all files attached to a decision.

    For each unique url the text is taken from the sql database when a
    DocumentText entry already exists, otherwise the file is downloaded
    and extracted with solr. Newly extracted texts are persisted to sql
    before the combined text is returned for solr insertion.
    """
    # Tuples of (url, text, size, already_stored). The flag tells us
    # which entries still need a sql row created at the end.
    gathered = []

    unique_urls = set(urls)
    if not unique_urls:
        logger.info('Decision %s has no files!', dec_id)

    # Gather information about files.
    for url in unique_urls:
        logger.info('Extracting text from %s', url)
        document = has_document(dec_id, url)
        if document:
            # Text already exists in sql; grab it from there.
            logger.info('Using existing text.')
            gathered.append((url, document.text, document.doc_size, True))
            continue

        logger.info('Downloading file: %s', url)
        # Text not in sql yet: download and extract with solr.
        try:
            file_obj = get_file_from_url(url)
            if file_obj:
                logger.debug('Downloaded file: %s', url)
                extracted = solr.extract(file_obj)
                byte_count = file_obj.getbuffer().nbytes
                gathered.append((url, extracted, byte_count, False))
                logger.info('Extracted file: %s', url)
        except Exception:
            logger.exception('Error extracting file: %s', url)

    # Persist entries that were missing from the sql database.
    for url, text, size, already_stored in gathered:
        if not already_stored:
            create_document(dec_id, url, text, size)

    return ''.join(text for _, text, _, _ in gathered)
Example no. 5
0
    def update_full_text(self):
        """Download, extract and index the full text of legislation
        records, working through INDEXED DocumentText rows in batches
        of 100 until none remain.

        Every failure path marks the record FULL_INDEX_FAIL. This is
        required for termination: the `while True` loop re-queries the
        same INDEXED records each pass, so a record left INDEXED after a
        persistent failure (solr read error or rejected add) would be
        re-fetched forever.
        """
        logger.info('[Legislation] Update full text started.')
        while True:
            # Pending records: metadata indexed, full text still missing.
            pending = (DocumentText.objects.filter(
                status=DocumentText.INDEXED,
                doc_type=LEGISLATION).exclude(url__isnull=True))
            count = pending.count()
            objs = pending[:100]
            logger.info('%s records remaining' % (count, ))
            if count == 0:
                break
            for obj in objs:
                # Check if already parsed: reuse the stored text when the
                # remote file size is unchanged, avoiding a download.
                text = None
                if obj.doc_size and obj.text:
                    logger.info('Checking content length of %s (%s)' % (
                        obj.doc_id,
                        obj.url,
                    ))
                    doc_size = get_content_length_from_url(obj.url)
                    if doc_size == obj.doc_size:
                        # File not changed, reuse obj.text
                        logger.debug('Not changed: %s' % (obj.url, ))
                        text = obj.text

                # Download file
                if not text:
                    logger.info('Downloading: %s (%s)' % (
                        obj.doc_id,
                        obj.url,
                    ))
                    file_obj = get_file_from_url(obj.url)
                    if not file_obj:
                        logger.error('Failed downloading: %s' % (obj.url, ))
                        obj.status = DocumentText.FULL_INDEX_FAIL
                        obj.save()
                        continue
                    doc_size = file_obj.getbuffer().nbytes

                    # Extract text
                    logger.debug('Indexing: %s' % (obj.url, ))
                    text = self.solr.extract(file_obj)
                    if not text:
                        # logger.warn is deprecated; use warning().
                        logger.warning('Nothing to index for %s' %
                                       (obj.url, ))

                # Load the existing solr record to attach the text to.
                try:
                    legislation = self.solr.search(LEGISLATION, obj.doc_id)
                    if legislation:
                        legislation = cleanup_copyfields(legislation)
                except SolrError as e:
                    logger.error('Error reading legislation %s' %
                                 (obj.doc_id, ))
                    if settings.DEBUG:
                        logging.getLogger('solr').exception(e)
                    # Mark as failed; leaving the record INDEXED would
                    # make this loop re-fetch it forever.
                    obj.status = DocumentText.FULL_INDEX_FAIL
                    obj.save()
                    continue

                if not legislation:
                    logger.error('Failed to find legislation %s' %
                                 (obj.doc_id))
                    obj.status = DocumentText.FULL_INDEX_FAIL
                    obj.save()
                    continue

                legislation['legText'] = text
                result = self.solr.add(legislation)
                if result:
                    logger.info('Success download & indexed: %s' %
                                (obj.doc_id, ))
                    obj.status = DocumentText.FULL_INDEXED
                    obj.doc_size = doc_size
                    obj.text = text
                    obj.save()
                else:
                    logger.error('Failed doc extract %s %s' %
                                 (obj.url, legislation['id']))
                    # Same infinite-loop hazard as above: record the
                    # failure so the batch query eventually drains.
                    obj.status = DocumentText.FULL_INDEX_FAIL
                    obj.save()
        logger.info('[Legislation] Update full text finished.')
Example no. 6
0
    def get_solr_format(self, informea_id, solr_id):
        """Map the raw InforMEA court-decision json (self.data) onto a
        flat dict of solr fields.

        informea_id -- source (Leo) identifier of the decision.
        solr_id -- id of an existing solr document, or None for new ones.

        Returns the dict ready for solr insertion, with the extracted
        full text of every linked file accumulated in 'cdText'.
        """
        solr_decision = {
            'cdText': '',
            'type': COURT_DECISION,
            'cdLeoId': informea_id,
            'id': solr_id,
            'cdCountry_en': [],
            'cdCountry_es': [],
            'cdCountry_fr': [],
            'cdLinkToFullText': [],
        }
        for json_field, solr_field in FIELD_MAP.items():
            json_value = self.data.get(json_field, None)
            if not json_value:
                # Keep pre-initialised defaults (e.g. the country lists);
                # any other missing field becomes an explicit None.
                solr_decision[solr_field] = (None
                                             if solr_field not in solr_decision
                                             else solr_decision[solr_field])
            elif json_field in REFERENCE_FIELDS:
                if json_field in FALSE_MULTILINGUAL_FIELDS:
                    json_value = json_value['und']
                solr_decision[solr_field] = [
                    e.get(REFERENCE_FIELDS[json_field]) for e in json_value
                ]
            elif json_field in FALSE_MULTILINGUAL_FIELDS:
                # 'und' (undetermined) is the only language key here.
                solr_decision[solr_field] = get_value(json_field,
                                                      json_value['und'])
            elif json_field in FIELD_URL:
                # Collect unique full-text urls, preserving order.
                urls = [
                    x.get('url') for x in json_value.get('en', [])
                    if x.get('url')
                ]
                if solr_decision['cdLinkToFullText'] is None:
                    solr_decision['cdLinkToFullText'] = []
                for url in urls:
                    if url not in solr_decision['cdLinkToFullText']:
                        solr_decision['cdLinkToFullText'].append(url)
            elif json_field in TIMESTAMP_FIELDS:
                date_value = datetime.fromtimestamp(float(json_value))
                date_string = date_value.strftime(SOLR_DATE_FORMAT)
                solr_decision[solr_field] = date_string
            elif json_field in MULTILINGUAL_FIELDS:
                for lang, value in json_value.items():
                    if lang in settings.LANGUAGE_MAP:
                        key = '{}_{}'.format(solr_field, lang)
                        solr_decision[key] = get_value(json_field, value)
            elif json_field in COUNTRY_FIELDS:
                for country_code in json_value:
                    countries = self.countries.get(country_code, {})
                    for lang, country_name in countries.items():
                        key = '{}_{}'.format(solr_field, lang)
                        solr_decision[key].append(country_name)
            elif json_field in LANGUAGE_FIELDS:
                language_code = get_value(json_field, json_value['und'])
                if language_code not in self.languages:
                    logger.warning(
                        'Language code missing from languages.json: '
                        '{} ({})'.format(language_code, informea_id))
                    continue
                languages = self.languages[language_code]
                for lang in LANGUAGES:
                    field = '{}_{}'.format(solr_field, lang)
                    if languages:
                        solr_decision[field] = languages[lang]
                    else:
                        solr_decision[field] = language_code
            elif json_field in SUBDIVISION_FIELDS:
                subdivision_en = get_value(json_field, json_value)
                solr_decision[solr_field + '_en'] = subdivision_en
                values = self.subdivisions.get(subdivision_en.lower(), None)
                if values:
                    solr_decision[solr_field + '_es'] = values['es']
                    solr_decision[solr_field + '_fr'] = values['fr']
                else:
                    logger.warning('Subdivision missing from json: '
                                   '{} ({})'.format(subdivision_en,
                                                    informea_id))
            elif json_field in REGION_FIELDS:
                reg_dict = get_json_values(json_field, json_value,
                                           self.regions, 'regions',
                                           informea_id)
                for lang, regions in reg_dict.items():
                    solr_decision['{}_{}'.format(solr_field, lang)] = regions
            elif json_field in KEYWORD_FIELDS:
                kw_dict = get_json_values(json_field, json_value,
                                          self.keywords, 'keywords',
                                          informea_id)
                for lang, keywords in kw_dict.items():
                    # De-duplicate keywords per language.
                    keywords = list(set(keywords))
                    solr_decision['{}_{}'.format(solr_field, lang)] = keywords
            elif json_field in SUBJECT_FIELDS:
                sbj_dict = get_json_values(json_field, json_value,
                                           self.subjects, 'subjects',
                                           informea_id)
                for lang, subjects in sbj_dict.items():
                    subjects = list(set(subjects))
                    solr_decision['{}_{}'.format(solr_field, lang)] = subjects
            else:
                solr_decision[solr_field] = get_value(json_field, json_value)

            if json_field in FULL_TEXT_FIELDS and json_value:
                urls = [
                    replace_url(d.get('url')) for val in json_value.values()
                    for d in val
                ]
                files = [get_file_from_url(url) for url in urls if url]
                solr_decision['cdText'] += '\n'.join(
                    self.solr.extract(f) for f in files if f)
        # cdRegion fallback on field_ecolex_region
        if not solr_decision.get('cdRegion_en'):
            backup_field = 'field_ecolex_region'
            solr_field = 'cdRegion'
            json_value = self.data.get(backup_field, None)
            if json_value:
                reg_dict = get_json_values(backup_field, json_value,
                                           self.regions, 'regions',
                                           informea_id)
                for lang, regions in reg_dict.items():
                    solr_decision['{}_{}'.format(solr_field, lang)] = regions

        full_text_urls = solr_decision.get('cdLinkToFullText') or []
        if not full_text_urls and solr_decision.get('cdRelatedUrl_en'):
            # Fall back to the related url when no full-text link exists.
            url = solr_decision.pop('cdRelatedUrl_en')
            solr_decision['cdLinkToFullText'] = [url]
            full_text_urls.append(url)

        for url in full_text_urls:
            file_obj = get_file_from_url(url)
            if file_obj:
                # BUG FIX: solr.extract() returns a single string, so the
                # previous "'\n'.join(self.solr.extract(file_obj))" joined
                # the string's *characters* with newlines. Append the
                # extracted text as-is instead.
                solr_decision['cdText'] += self.solr.extract(file_obj)

        # Get Leo URL
        json_value = self.data.get(SOURCE_URL_FIELD, None)
        if json_value:
            solr_decision['cdLeoDefaultUrl'] = json_value.get('default', None)
            solr_decision['cdLeoEnglishUrl'] = json_value.get('en', None)

        # Build a slug from the first available title translation.
        title = (solr_decision.get('cdTitleOfText_en')
                 or solr_decision.get('cdTitleOfText_fr')
                 or solr_decision.get('cdTitleOfText_es')
                 or solr_decision.get('cdTitleOfText_other') or '')
        if not title:
            logger.warning('Title missing for {}'.format(informea_id))
        slug = '{} {}'.format(title, informea_id)
        solr_decision['slug'] = slugify(slug)
        solr_decision['updatedDate'] = (
            datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ'))

        # Fall back to InforMEA tags when no keywords were mapped.
        if 'cdKeyword_en' not in solr_decision:
            solr_decision.update({
                'cdKeyword_{}'.format(lang): self.informea_tags(lang)
                for lang in LANGUAGES
            })

        return solr_decision