def extract_description(self, line):
    """Assemble the description from the baseline, intro and body columns.

    The baseline and the "introduction_us" values are wrapped in ``<p>``
    tags before being prettified; the "we" value is prettified as is. The
    three prettified fragments are concatenated in that order.
    """
    baseline_markup = '<p>{}</p>'.format(line['baseline'])
    baseline_html = content_prettify(baseline_markup)

    intro_markup = '<p>Nous (la BPI) {}</p>'.format(line['introduction_us'])
    intro_html = content_prettify(intro_markup)

    body_html = content_prettify(line['we'])

    return baseline_html + intro_html + body_html
def extract_description(self, line):
    """Return the chapo followed by the text preceding the eligibility heading.

    When "field_paragraphes" contains the 'Qui peut en bénéficier ?' heading,
    only the text before that heading is kept; otherwise the whole field is
    used. Both parts are prettified separately and concatenated.
    """
    heading = 'Qui peut en bénéficier ?'

    # Per the original comment, both fields already carry <p></p> tags.
    chapo_html = content_prettify(line.get('field_chapo', ''))

    paragraphs = line.get('field_paragraphes', '')
    if heading in paragraphs:
        body = paragraphs.partition(heading)[0]
    else:
        body = paragraphs

    return chapo_html + content_prettify(body)
def clean(self, value):
    """Apply the parent cleaning, then sanitize the HTML.

    The tags listed below are allowed in addition to the prettifier's
    defaults; the result is stripped of surrounding whitespace.
    """
    extra_tags = ['a', 'blockquote', 'br', 'header', 'footer']
    parent_cleaned = super().clean(value)
    sanitized = content_prettify(parent_cleaned, more_allowed_tags=extra_tags)
    return sanitized.strip()
def custom_prettify(raw_text):
    """Remove download boxes from the markup, then prettify what is left."""
    tree = bs(raw_text, features='html.parser')

    # Drop every "boite_telechargements_libre" div from the parsed tree
    # before it is serialized again.
    for download_box in tree.select('div.boite_telechargements_libre'):
        download_box.decompose()

    serialized = tree.prettify()
    return content_prettify(serialized)
def aid_parse(self, response):
    """Yield a single item dict built from an aid detail page.

    Title, description and contact are extracted from the response and
    prettified; category, call-for-project flag and submission deadline are
    copied from ``response.meta``.
    """
    raw_title = response.xpath('//meta[@property="og:title"]/@content').get()
    raw_description = response.css('div.pf-content').get()
    raw_contact = response.css('div.know').get()

    page_url = response.request.url
    # The identifier is the second-to-last "/"-separated segment of the url.
    url_segments = page_url.split('/')

    item = {
        'title': content_prettify(raw_title),
        'description': content_prettify(raw_description),
        'current_url': page_url,
        'uniqueid': url_segments[-2],
        'contact': content_prettify(raw_contact),
        'category': response.meta['category'],
        'is_call_for_project': response.meta['is_call_for_project'],
        'submission_deadline': response.meta['submission_deadline'],
    }
    yield item
def extract_description(self, line):
    """Return the prettified description with its leading reference removed.

    The "Référence + Description" column holds the reference and the
    description merged together, so the reference has to be cut off first.
    """
    merged = line["Référence + Description"]

    if re.match(r'^[A-Z]{3}_\d_\d', merged):
        # The pattern matches exactly 7 characters: skip them.
        text = merged[7:]
    else:
        # No recognised reference: drop everything up to the first space.
        text = merged.partition(' ')[2]

    return content_prettify(text)
def clean(self, value):
    """Apply the parent cleaning, then sanitize the HTML.

    Extra tags (including ``img``) and the ``style`` attribute are allowed on
    top of the prettifier's defaults; the result is stripped of surrounding
    whitespace.
    """
    parent_cleaned = super().clean(value)
    sanitized = content_prettify(
        parent_cleaned,
        more_allowed_tags=['a', 'blockquote', 'br', 'header', 'footer', 'img'],
        more_allowed_attrs=['style'],
    )
    return sanitized.strip()
def extract_contact(self, line):
    """Build the HTML contact block for an aid.

    The result is the concatenation of three fragments:

    1. a paragraph with direction, service and pole, one per line;
    2. a paragraph with the contact's name, email and phone, one per line;
    3. the free-form "informations_contact" value, prettified.

    Missing keys default to an empty string.
    """
    direction = line.get('direction', '')
    service = line.get('service', '')
    pole = line.get('pole', '')
    contact_1 = '<p>' + '<br />'.join([direction, service, pole]) + '</p>'

    first_name = line.get('contact_prenom', '')
    last_name = line.get('contact_nom', '')
    # Separate first and last name with a space (they used to be glued
    # together); empty parts are skipped so no stray space is produced.
    contact_name = ' '.join(part for part in (first_name, last_name) if part)
    contact_email = line.get('contact_email', '')
    contact_phone = line.get('tel', '')
    contact_2 = '<p>' + '<br />'.join(
        [contact_name, contact_email, contact_phone]) + '</p>'

    contact_detail = content_prettify(line.get('informations_contact', ''))
    return contact_1 + contact_2 + contact_detail
def aid_parse(self, response):
    """Yield a single item dict built from a publication page.

    The title is taken as is from the page heading; the description comes
    from the ``description`` meta tag and is prettified.
    """
    page_url = response.request.url
    # The identifier is the third-from-last "/"-separated segment of the url.
    url_segments = page_url.split('/')

    heading = response.css('h1.weaurmc-publication-title::text').get()
    meta_description = response.xpath(
        '//meta[@name="description"]/@content').get()

    item = {
        'title': heading,
        'description': content_prettify(meta_description),
        'current_url': page_url,
        'uniqueid': url_segments[-3],
    }
    yield item
def test_prettify_creates_absolute_urls():
    """Check how link targets are rewritten when ``base_url`` is given."""
    base_url = 'https://www.example.org'
    cases = [
        # Relative urls are made absolute
        (
            ''' This is a <a href="/toto.html">long text with a link</a> ''',
            '<a href="https://www.example.org/toto.html">',
        ),
        # Absolute urls are left untouched
        (
            ''' This is a <a href="https://www.example.org/toto.html">long text with a link</a> ''',
            '<a href="https://www.example.org/toto.html">',
        ),
        # Urls for other domains are left untouched
        (
            ''' This is a <a href="https://www.example.com/toto.html">long text with a link</a> ''',
            '<a href="https://www.example.com/toto.html">',
        ),
    ]

    for html, expected_tag in cases:
        text = content_prettify(html, base_url=base_url)
        assert expected_tag in text
def aid_parse(self, response):
    """Yield a single item dict built from an aid detail page.

    Page content provides the title and description; everything else comes
    from ``response.meta`` ("fields" and "uniqueid"). The modification date
    is parsed from its '%Y-%m-%d %H:%M:%S' string form into a ``datetime``.
    """
    raw_title = response.css('h1.main-content__title::text').get()
    raw_description = response.css('div.main-content__texte').get()

    fields = response.meta['fields']
    updated_at = datetime.strptime(
        fields['date_modification'], '%Y-%m-%d %H:%M:%S')

    item = {
        'title': content_prettify(raw_title),
        'description': custom_prettify(raw_description),
        'url': fields['url'],
        'uniqueid': response.meta['uniqueid'],
        'thematique': fields.get('thematique'),
        'type': fields.get('type'),
        'date_updated': updated_at,
    }
    yield item
def extract_eligibility(self, line):
    """Return the eligibility section cut out of "field_paragraphes".

    The section starts at the 'Qui peut en bénéficier ?' heading (heading
    included) and stops before the first trailing heading found, checked in
    the order listed below. Without the starting heading, an empty string is
    prettified and returned.
    """
    start_heading = 'Qui peut en bénéficier ?'
    end_headings = (
        'Liens utiles et contacts',
        'Documents utiles',
        'Liens utiles',
    )

    paragraphs = line.get('field_paragraphes', '')
    if start_heading not in paragraphs:
        return content_prettify('')

    _, heading, remainder = paragraphs.partition(start_heading)
    section = heading + remainder

    # Only the first matching trailing heading is used as the cut point.
    for end_heading in end_headings:
        if end_heading in section:
            section = section.partition(end_heading)[0]
            break

    return content_prettify(section)
def extract_contact(self, line):
    """Return the contact section cut out of "field_paragraphes".

    The section is the text following the first starting heading found
    (checked in the order listed below), truncated before the first
    publication/update mention found. Without any starting heading, an empty
    string is prettified and returned.
    """
    start_headings = (
        'Liens utiles et contacts',
        'Documents utiles',
        'Liens utiles',
    )
    end_mentions = ('Mise en ligne', 'Mis à jour')

    paragraphs = line.get('field_paragraphes', '')

    section = ''
    for start_heading in start_headings:
        if start_heading in paragraphs:
            section = paragraphs.partition(start_heading)[2]
            break

    for end_mention in end_mentions:
        if end_mention in section:
            section = section.partition(end_mention)[0]
            break

    return content_prettify(section)
def clean(self, value):
    """Apply the parent cleaning, then sanitize the HTML.

    Only the ``a`` tag is allowed on top of the prettifier's defaults; the
    result is stripped of surrounding whitespace.
    """
    parent_cleaned = super().clean(value)
    sanitized = content_prettify(parent_cleaned, more_allowed_tags=['a'])
    return sanitized.strip()
def extract_description(self, line):
    """Return the prettified "description" value of the line."""
    raw_description = line['description']
    return content_prettify(raw_description)
def extract_description(self, line):
    """Return the prettified "objet" value of the line.

    The value is passed through ``content_prettify`` exactly once, like the
    other extractors do; the previous implementation prettified the already
    prettified result a second time.
    """
    return content_prettify(line['objet'])
def extract_eligibility(self, line):
    """Return the prettified "publicsBeneficiairesDetails" value of the line."""
    return content_prettify(line['publicsBeneficiairesDetails'])
def extract_description(self, line):
    """Return the prettified "post_content" value (empty string if missing)."""
    raw_content = line.get('post_content', '')
    return content_prettify(raw_content)
def extract_eligibility(self, line):
    """Assemble the eligibility text from the intro and body columns.

    The "introduction_you" value is wrapped in a ``<p>Vous …</p>`` paragraph
    before being prettified; the "you" value is prettified as is. The two
    prettified fragments are concatenated in that order.
    """
    intro_markup = '<p>Vous {}</p>'.format(line['introduction_you'])
    intro_html = content_prettify(intro_markup)
    body_html = content_prettify(line['you'])
    return intro_html + body_html
def extract_description(self, line):
    """Return the prettified text of the line's 'presentation' element."""
    presentation = line.find('presentation')
    return content_prettify(presentation.text)
def extract_eligibility(self, line):
    """Return the prettified "aidconditions" value (empty string if missing)."""
    # Per the original comment, this field already carries <p></p> tags.
    conditions = line.get('aidconditions', '')
    return content_prettify(conditions)
def extract_description(self, line):
    """Return the prettified "aid_objet" followed by "aid_operations_ei".

    Missing keys default to an empty string; each value is prettified
    separately before concatenation.
    """
    # Per the original comment, both fields already carry <p></p> tags.
    purpose_html = content_prettify(line.get('aid_objet', ''))
    operations_html = content_prettify(line.get('aid_operations_ei', ''))
    return purpose_html + operations_html
def aid_parse(self, response):
    """Yield a single item dict built from an aid detail page.

    The item contains the title, a description (chapo + objectives), the
    breadcrumb category, two boolean flags derived from the presence of tag
    elements, the header values, the detail sections, the contact block, the
    publication date from ``response.meta`` and the page url.

    Header values and detail sections are matched to their keys by position
    on the page. Blocks beyond the known keys are ignored instead of raising
    ``IndexError``.
    """
    title = response.css('h1.headline-aide::text').get().strip()
    subtitle = response.css('div.mod-chapo').get().strip()
    categorie = response.css(
        'ul.m-breadcrumb__list > li:nth-child(2) > span::text').get()

    is_call_for_project = bool(
        response.css('span.tag-appel-projet').get()
        or response.css('span.tag-appel-manifestation').get())
    is_dispositif_europe = bool(response.css('span.tag-europe').get())

    aid_header = {
        'publics_concernes': '',  # targeted_audiences
        'domaines_secondaires': '',  # categories
        'date_de_fin_de_publication': '',  # submission_deadline
    }
    # zip() stops at the shorter sequence, so extra "categories" blocks on
    # the page can no longer index past the known keys.
    for header_key, item in zip(
            list(aid_header), response.css('div.categories')):
        # NOTE(review): as written, the second replace removes every space,
        # so the final ' , ' replacement cannot match — confirm the intended
        # literals against the upstream source.
        aid_header[header_key] = content_prettify(
            item.css('p::text').get()).strip().replace('\n', '').replace(
                ' ', '').replace(' , ', ';')

    aid_details = {
        'objectifs': '',
        'calendrier': '',
        'beneficiaires': '',
        'montant': '',
        'criteres': '',
        'modalites': '',
    }
    for details_key, item in zip(
            list(aid_details),
            response.css('div.dispositif-aide > div.mod-textSimple')):
        aid_details[details_key] = content_prettify(item.get())

    contact = response.css('div.contact-adresses').get()
    current_url = response.request.url

    yield {
        'title': title,
        'description': subtitle + '<br />' + content_prettify(
            aid_details['objectifs']),
        'categorie': categorie,
        'is_call_for_project': is_call_for_project,
        'is_dispositif_europe': is_dispositif_europe,
        **aid_header,
        **aid_details,
        'contact': content_prettify(contact),
        'pub_date': response.meta['pub_date'],
        'current_url': current_url,
    }