def _index_files(self, decisions):
    for decision in decisions:
        url_list = decision.get('decFileUrls', [])
        decId = decision['decId']
        dec_text = ''

        if not url_list:
            # Nothing to download
            doc, _ = DocumentText.objects.get_or_create(
                doc_id=decId, doc_type=COP_DECISION, url=None)
            doc.status = DocumentText.INDEXED
            doc.save()

        for url in url_list:
            doc, _ = DocumentText.objects.get_or_create(
                doc_id=decId, doc_type=COP_DECISION, url=url)
            if doc.status == DocumentText.FULL_INDEXED:
                dec_text += doc.text
                logger.info('Already indexed %s' % url)
            else:
                logger.info('Downloading: %s' % url)
                file_obj = get_file_from_url(url)
                if file_obj:
                    logger.debug('Success downloading %s' % url)
                    doc.text = self.solr.extract(file_obj)
                    dec_text += doc.text
                    doc.status = DocumentText.FULL_INDEXED
                    doc.doc_size = file_obj.getbuffer().nbytes
                    doc.save()
                    logger.info('Success extracting %s' % decId)
                else:
                    logger.error('Error on file download %s' % decId)
                    doc.status = DocumentText.INDEXED
                    doc.save()

        decision['decText'] = dec_text
def _index_files(self, treaties):
    for treaty in treaties:
        full_index = True
        treaty['trText'] = ''

        for field in URL_FIELDS:
            urls = treaty.get(FIELD_MAP[field], [])
            for url in urls:
                logger.info('Downloading: %s' % url)
                file_obj = get_file_from_url(url)
                if file_obj:
                    # Download successful
                    try:
                        treaty['trText'] += self.solr.extract(file_obj)
                    except Exception:
                        # SOLR error at pdf extraction
                        full_index = False
                        self._document_text_pdf_error(treaty, url)
                        logger.error('Error extracting from doc %s' %
                                     treaty['trElisId'])
                else:
                    # Download failed
                    full_index = False
                    self._document_text_pdf_error(treaty, url)

        if full_index:
            logger.info('Success on file download %s' % treaty['trElisId'])
            self._document_text_pdf_success(treaty)
def update_full_text(self):
    logger.info('[Treaty] Update full text started.')
    objs = DocumentText.objects.filter(status=DocumentText.INDEXED,
                                       doc_type=TREATY)
    for obj in objs:
        treaty_data = json.loads(obj.parsed_data)
        treaty_data['trText'] = ''
        full_index = True

        for field in URL_FIELDS:
            urls = treaty_data.get(FIELD_MAP[field], [])
            for url in urls:
                logger.info('Downloading: %s' % url)
                file_obj = get_file_from_url(url)
                if file_obj:
                    # Download successful
                    try:
                        treaty_data['trText'] += self.solr.extract(file_obj)
                    except Exception:
                        # SOLR error at pdf extraction
                        full_index = False
                        obj.save()
                        logger.error('Error extracting from doc %s' %
                                     obj.doc_id)
                else:
                    # Download failed
                    full_index = False
                    obj.save()
                    logger.error('Error downloading url from doc %s' %
                                 obj.doc_id)

        try:
            treaty = self.solr.search(TREATY, obj.doc_id)
            if treaty:
                treaty_data['id'] = treaty['id']
        except SolrError as e:
            logger.error('Error reading treaty %s' % obj.doc_id)
            if settings.DEBUG:
                logging.getLogger('solr').exception(e)
            continue

        if full_index:
            resp = self.solr.add(treaty_data)
            logger.info('Insert on %s' % (treaty_data['trElisId']))
            if resp:
                obj.status = DocumentText.FULL_INDEXED
                obj.parsed_data = ''
                obj.save()

    logger.info('[Treaty] Update full text finished.')
def extract_text(solr, dec_id, urls):
    # Tuples consisting of: url, text, size and an "exists" flag
    # will be appended to this list. The entries will be used to
    # create missing entries in the sql database if the flag is False;
    # and to finally return the full concatenated text of all file content
    # for solr insertion.
    texts = []
    uniq_urls = set(urls)

    if not uniq_urls:
        logger.info('Decision %s has no files!', dec_id)

    # gather information about files
    for url in uniq_urls:
        logger.info('Extracting text from %s', url)
        document = has_document(dec_id, url)
        if document:
            # text exists, grab it from sql
            logger.info('Using existing text.')
            texts.append((url, document.text, document.doc_size, True))
        else:
            logger.info('Downloading file: %s', url)
            # text doesn't exist in sql, extract with solr
            try:
                file_obj = get_file_from_url(url)
                if file_obj:
                    logger.debug('Downloaded file: %s', url)
                    text = solr.extract(file_obj)
                    size = file_obj.getbuffer().nbytes
                    texts.append((url, text, size, False))
                    logger.info('Extracted file: %s', url)
            except Exception:
                logger.exception('Error extracting file: %s', url)

    for url, text, size, exists in texts:
        if not exists:
            create_document(dec_id, url, text, size)

    return ''.join(text for _, text, _, _ in texts)
def update_full_text(self):
    logger.info('[Legislation] Update full text started.')
    while True:
        count = (DocumentText.objects.filter(
            status=DocumentText.INDEXED,
            doc_type=LEGISLATION).exclude(url__isnull=True)).count()
        objs = (DocumentText.objects.filter(
            status=DocumentText.INDEXED,
            doc_type=LEGISLATION).exclude(url__isnull=True))[:100]
        logger.info('%s records remaining' % (count, ))
        if count == 0:
            break

        for obj in objs:
            # Check if already parsed
            text = None
            if obj.doc_size and obj.text:
                logger.info('Checking content length of %s (%s)' %
                            (obj.doc_id, obj.url))
                doc_size = get_content_length_from_url(obj.url)
                if doc_size == obj.doc_size:
                    # File not changed, reuse obj.text
                    logger.debug('Not changed: %s' % (obj.url, ))
                    text = obj.text

            # Download file
            if not text:
                logger.info('Downloading: %s (%s)' % (obj.doc_id, obj.url))
                file_obj = get_file_from_url(obj.url)
                if not file_obj:
                    logger.error('Failed downloading: %s' % (obj.url, ))
                    obj.status = DocumentText.FULL_INDEX_FAIL
                    obj.save()
                    continue
                doc_size = file_obj.getbuffer().nbytes

                # Extract text
                logger.debug('Indexing: %s' % (obj.url, ))
                text = self.solr.extract(file_obj)
                if not text:
                    logger.warning('Nothing to index for %s' % (obj.url, ))

            # Load record and store text
            try:
                legislation = self.solr.search(LEGISLATION, obj.doc_id)
                if legislation:
                    legislation = cleanup_copyfields(legislation)
            except SolrError as e:
                logger.error('Error reading legislation %s' % (obj.doc_id, ))
                if settings.DEBUG:
                    logging.getLogger('solr').exception(e)
                continue

            if not legislation:
                logger.error('Failed to find legislation %s' % (obj.doc_id, ))
                obj.status = DocumentText.FULL_INDEX_FAIL
                obj.save()
                continue

            legislation['legText'] = text
            result = self.solr.add(legislation)
            if result:
                logger.info('Success download & indexed: %s' % (obj.doc_id, ))
                obj.status = DocumentText.FULL_INDEXED
                obj.doc_size = doc_size
                obj.text = text
                obj.save()
            else:
                logger.error('Failed doc extract %s %s' %
                             (obj.url, legislation['id']))

    logger.info('[Legislation] Update full text finished.')
def get_solr_format(self, informea_id, solr_id):
    solr_decision = {
        'cdText': '',
        'type': COURT_DECISION,
        'cdLeoId': informea_id,
        'id': solr_id,
        'cdCountry_en': [],
        'cdCountry_es': [],
        'cdCountry_fr': [],
        'cdLinkToFullText': [],
    }
    for json_field, solr_field in FIELD_MAP.items():
        json_value = self.data.get(json_field, None)

        if not json_value:
            solr_decision[solr_field] = (
                None if solr_field not in solr_decision
                else solr_decision[solr_field])
        elif json_field in REFERENCE_FIELDS:
            if json_field in FALSE_MULTILINGUAL_FIELDS:
                json_value = json_value['und']
            solr_decision[solr_field] = [
                e.get(REFERENCE_FIELDS[json_field]) for e in json_value
            ]
        elif json_field in FALSE_MULTILINGUAL_FIELDS:
            solr_decision[solr_field] = get_value(json_field,
                                                  json_value['und'])
        elif json_field in FIELD_URL:
            urls = [
                x.get('url') for x in json_value.get('en', [])
                if x.get('url')
            ]
            if solr_decision['cdLinkToFullText'] is None:
                solr_decision['cdLinkToFullText'] = []
            for url in urls:
                if url not in solr_decision['cdLinkToFullText']:
                    solr_decision['cdLinkToFullText'].append(url)
        elif json_field in TIMESTAMP_FIELDS:
            date_value = datetime.fromtimestamp(float(json_value))
            date_string = date_value.strftime(SOLR_DATE_FORMAT)
            solr_decision[solr_field] = date_string
        elif json_field in MULTILINGUAL_FIELDS:
            for lang, value in json_value.items():
                if lang in settings.LANGUAGE_MAP:
                    key = '{}_{}'.format(solr_field, lang)
                    solr_decision[key] = get_value(json_field, value)
        elif json_field in COUNTRY_FIELDS:
            for country_code in json_value:
                countries = self.countries.get(country_code, {})
                for lang, country_name in countries.items():
                    key = '{}_{}'.format(solr_field, lang)
                    solr_decision[key].append(country_name)
        elif json_field in LANGUAGE_FIELDS:
            language_code = get_value(json_field, json_value['und'])
            if language_code not in self.languages:
                logger.warning(
                    'Language code missing from languages.json: '
                    '{} ({})'.format(language_code, informea_id))
                continue
            languages = self.languages[language_code]
            for lang in LANGUAGES:
                field = '{}_{}'.format(solr_field, lang)
                if languages:
                    solr_decision[field] = languages[lang]
                else:
                    solr_decision[field] = language_code
        elif json_field in SUBDIVISION_FIELDS:
            subdivision_en = get_value(json_field, json_value)
            solr_decision[solr_field + '_en'] = subdivision_en
            values = self.subdivisions.get(subdivision_en.lower(), None)
            if values:
                solr_decision[solr_field + '_es'] = values['es']
                solr_decision[solr_field + '_fr'] = values['fr']
            else:
                logger.warning('Subdivision missing from json: '
                               '{} ({})'.format(subdivision_en, informea_id))
        elif json_field in REGION_FIELDS:
            reg_dict = get_json_values(json_field, json_value, self.regions,
                                       'regions', informea_id)
            for lang, regions in reg_dict.items():
                solr_decision['{}_{}'.format(solr_field, lang)] = regions
        elif json_field in KEYWORD_FIELDS:
            kw_dict = get_json_values(json_field, json_value, self.keywords,
                                      'keywords', informea_id)
            for lang, keywords in kw_dict.items():
                keywords = list(set(keywords))
                solr_decision['{}_{}'.format(solr_field, lang)] = keywords
        elif json_field in SUBJECT_FIELDS:
            sbj_dict = get_json_values(json_field, json_value, self.subjects,
                                       'subjects', informea_id)
            for lang, subjects in sbj_dict.items():
                subjects = list(set(subjects))
                solr_decision['{}_{}'.format(solr_field, lang)] = subjects
        else:
            solr_decision[solr_field] = get_value(json_field, json_value)

        if json_field in FULL_TEXT_FIELDS and json_value:
            urls = [
                replace_url(d.get('url'))
                for val in json_value.values()
                for d in val
            ]
            files = [get_file_from_url(url) for url in urls if url]
            solr_decision['cdText'] += '\n'.join(
                self.solr.extract(f) for f in files if f)

    # cdRegion fallback on field_ecolex_region
    if not solr_decision.get('cdRegion_en'):
        backup_field = 'field_ecolex_region'
        solr_field = 'cdRegion'
        json_value = self.data.get(backup_field, None)
        if json_value:
            reg_dict = get_json_values(backup_field, json_value,
                                       self.regions, 'regions', informea_id)
            for lang, regions in reg_dict.items():
                solr_decision['{}_{}'.format(solr_field, lang)] = regions

    full_text_urls = solr_decision.get('cdLinkToFullText') or []
    if not full_text_urls and solr_decision.get('cdRelatedUrl_en'):
        url = solr_decision.pop('cdRelatedUrl_en')
        solr_decision['cdLinkToFullText'] = [url]
        full_text_urls.append(url)
    for url in full_text_urls:
        file_obj = get_file_from_url(url)
        if file_obj:
            # extract() returns a single string, so append it directly
            # instead of joining its characters with newlines
            solr_decision['cdText'] += self.solr.extract(file_obj)

    # Get Leo URL
    json_value = self.data.get(SOURCE_URL_FIELD, None)
    if json_value:
        solr_decision['cdLeoDefaultUrl'] = json_value.get('default', None)
        solr_decision['cdLeoEnglishUrl'] = json_value.get('en', None)

    title = (solr_decision.get('cdTitleOfText_en') or
             solr_decision.get('cdTitleOfText_fr') or
             solr_decision.get('cdTitleOfText_es') or
             solr_decision.get('cdTitleOfText_other') or '')
    if not title:
        logger.warning('Title missing for {}'.format(informea_id))
    slug = '{} {}'.format(title, informea_id)
    solr_decision['slug'] = slugify(slug)
    solr_decision['updatedDate'] = (
        datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ'))

    if 'cdKeyword_en' not in solr_decision:
        solr_decision.update({
            'cdKeyword_{}'.format(lang): self.informea_tags(lang)
            for lang in LANGUAGES
        })

    return solr_decision