def extract_description(self, line):
    """Build the description from the `baseline`, `introduction_us` and
    `we` entries of `line`.

    The first two values are wrapped in a paragraph (the second one
    prefixed with "Nous (la BPI) "), the third is used as-is. Each
    fragment goes through `content_prettify` on its own and the three
    results are concatenated in that order.
    """
    baseline = content_prettify(f"<p>{line['baseline']}</p>")
    intro = content_prettify(
        f"<p>Nous (la BPI) {line['introduction_us']}</p>")
    body = content_prettify(line['we'])
    return baseline + intro + body
 def extract_description(self, line):
     """Combine the intro with the leading part of the paragraphs field.

     `field_chapo` is prettified as-is. `field_paragraphes` is cut just
     before the 'Qui peut en bénéficier ?' heading when that heading is
     present, otherwise it is used whole. Missing keys count as ''.
     """
     # Both fragments already come with their own <p></p> tags.
     heading = 'Qui peut en bénéficier ?'
     intro = content_prettify(line.get('field_chapo', ''))
     paragraphs = line.get('field_paragraphes', '')
     if heading in paragraphs:
         # Keep only what precedes the heading.
         paragraphs = str(paragraphs.partition(heading)[0])
     return intro + content_prettify(paragraphs)
    def clean(self, value):
        """Run the parent class cleaning, then sanitize the resulting html.

        The tags listed below are passed to `content_prettify` as
        additional allowed tags. The returned string is stripped of
        leading and trailing whitespace.
        """
        extra_tags = ['a', 'blockquote', 'br', 'header', 'footer']
        sanitized = content_prettify(
            super().clean(value), more_allowed_tags=extra_tags)
        return sanitized.strip()
Exemple #4
0
def custom_prettify(raw_text):
    """Remove download boxes from the markup, then prettify the rest.

    Every element matching `div.boite_telechargements_libre` is removed
    from the parsed tree; the re-serialized tree is then handed to
    `content_prettify`.
    """
    tree = bs(raw_text, features='html.parser')
    for download_box in tree.select('div.boite_telechargements_libre'):
        download_box.decompose()
    return content_prettify(tree.prettify())
Exemple #5
0
    def aid_parse(self, response):
        """Yield one item built from an aid detail page.

        Title, description and contact are read from the page and passed
        through `content_prettify`. The unique id is the second-to-last
        '/'-separated segment of the request url. `category`,
        `is_call_for_project` and `submission_deadline` are copied from
        `response.meta`.
        """
        og_title = response.xpath(
            '//meta[@property="og:title"]/@content').get()
        description_html = response.css('div.pf-content').get()
        contact_html = response.css('div.know').get()

        page_url = response.request.url
        url_segments = page_url.split('/')

        item = {
            'title': content_prettify(og_title),
            'description': content_prettify(description_html),
            'current_url': page_url,
            'uniqueid': url_segments[-2],
            'contact': content_prettify(contact_html),
            'category': response.meta['category'],
            'is_call_for_project': response.meta['is_call_for_project'],
            'submission_deadline': response.meta['submission_deadline'],
        }
        yield item
Exemple #6
0
 def extract_description(self, line):
     """Split the reference off the merged column and prettify the rest.

     The "Référence + Description" column holds the reference directly
     followed by the description. When the value starts with a reference
     shaped like `ABC_1_2`, those 7 characters are dropped; otherwise
     everything up to and including the first space is dropped.
     """
     merged = line["Référence + Description"]
     if re.match(r'^[A-Z]{3}_\d_\d', merged):
         # The pattern always matches exactly 7 characters.
         text = merged[7:]
     else:
         # No recognizable reference: discard the first word.
         text = merged.partition(' ')[2]
     return content_prettify(text)
Exemple #7
0
    def clean(self, value):
        """Run the parent class cleaning, then sanitize the resulting html.

        The tags and attributes listed below are passed to
        `content_prettify` as additional allowed tags / attributes. The
        returned string is stripped of leading and trailing whitespace.
        """
        prettify_options = {
            'more_allowed_tags': [
                'a', 'blockquote', 'br', 'header', 'footer', 'img'],
            'more_allowed_attrs': ['style'],
        }
        sanitized = content_prettify(super().clean(value), **prettify_options)
        return sanitized.strip()
Exemple #8
0
 def extract_contact(self, line):
     """Assemble the contact html from the separate contact fields.

     The result is made of three concatenated parts:

     1. a paragraph with `direction`, `service` and `pole`, one per line;
     2. a paragraph with the contact's full name, email and phone, one
        per line;
     3. the `informations_contact` value, passed through
        `content_prettify`.

     Missing keys count as ''. Only the third part is passed through
     `content_prettify`; the first two are inserted as found.
     """
     organisation = [
         line.get(key, '') for key in ('direction', 'service', 'pole')]
     contact_1 = '<p>' + '<br />'.join(organisation) + '</p>'

     # First and last name live in separate fields: join them with a
     # space (skipping an empty one) so they do not run together.
     name_parts = (
         line.get('contact_prenom', ''), line.get('contact_nom', ''))
     contact_name = ' '.join(part for part in name_parts if part)
     contact_email = line.get('contact_email', '')
     contact_phone = line.get('tel', '')
     contact_2 = '<p>' + '<br />'.join(
         [contact_name, contact_email, contact_phone]) + '</p>'

     contact_detail = content_prettify(line.get('informations_contact', ''))
     return contact_1 + contact_2 + contact_detail
Exemple #9
0
    def aid_parse(self, response):
        """Yield one item built from an aid detail page.

        The title is the text of `h1.weaurmc-publication-title` (used as
        found); the description is the content of the `description` meta
        tag, passed through `content_prettify`. The unique id is the
        third-from-last '/'-separated segment of the request url.
        """
        page_url = response.request.url
        heading = response.css('h1.weaurmc-publication-title::text').get()
        meta_description = response.xpath(
            '//meta[@name="description"]/@content').get()

        item = {
            'title': heading,
            'description': content_prettify(meta_description),
            'current_url': page_url,
            'uniqueid': page_url.split('/')[-3],
        }
        yield item
def test_prettify_creates_absolute_urls():
    """Links are rewritten against `base_url` only when they are relative."""
    base_url = 'https://www.example.org'
    cases = [
        # Relative urls are made absolute
        (
            '\n    This is a <a href="/toto.html">long text with a link</a>'
            '\n    ',
            '<a href="https://www.example.org/toto.html">',
        ),
        # Absolute urls are left untouched
        (
            '\n    This is a <a href="https://www.example.org/toto.html">'
            'long text\n    with a link</a>\n    ',
            '<a href="https://www.example.org/toto.html">',
        ),
        # Urls for other domains are left untouched
        (
            '\n    This is a <a href="https://www.example.com/toto.html">'
            'long text\n    with a link</a>\n    ',
            '<a href="https://www.example.com/toto.html">',
        ),
    ]

    for html, expected_link in cases:
        text = content_prettify(html, base_url=base_url)
        assert expected_link in text
Exemple #11
0
    def aid_parse(self, response):
        """Yield one item built from an aid detail page.

        Title and description are read from the page (the description
        goes through `custom_prettify`, the title through
        `content_prettify`). Url, theme, type and modification date come
        from `response.meta['fields']`; the date string is parsed with
        the '%Y-%m-%d %H:%M:%S' format.
        """
        heading = response.css('h1.main-content__title::text').get()
        body_html = response.css('div.main-content__texte').get()
        fields = response.meta['fields']
        date_updated = datetime.strptime(
            fields['date_modification'], '%Y-%m-%d %H:%M:%S')

        item = {
            'title': content_prettify(heading),
            'description': custom_prettify(body_html),
            'url': fields['url'],
            'uniqueid': response.meta['uniqueid'],
            'thematique': fields.get('thematique'),
            'type': fields.get('type'),
            'date_updated': date_updated,
        }
        yield item
    def extract_eligibility(self, line):
        """Extract the eligibility section out of `field_paragraphes`.

        The section starts at the 'Qui peut en bénéficier ?' heading
        (heading included) and stops before the first of the closing
        headings listed below that is found in it. Without the opening
        heading the section is empty. The result is passed through
        `content_prettify`.
        """
        opening = 'Qui peut en bénéficier ?'
        # Order matters: 'Liens utiles' is a prefix of the first entry.
        closings = (
            'Liens utiles et contacts',
            'Documents utiles',
            'Liens utiles',
        )

        paragraphs = line.get('field_paragraphes', '')
        section = ''
        if opening in paragraphs:
            _, found, remainder = paragraphs.partition(opening)
            section = str(found) + str(remainder)

        for closing in closings:
            if closing in section:
                section = str(section.partition(closing)[0])
                break

        return content_prettify(section)
    def extract_contact(self, line):
        """Extract the contact section out of `field_paragraphes`.

        The section is whatever follows the first matching heading among
        those listed below (heading excluded), cut before a
        'Mise en ligne' or 'Mis à jour' mention when there is one.
        Without any heading the section is empty. The result is passed
        through `content_prettify`.
        """
        # Order matters: 'Liens utiles' is a prefix of the first entry.
        headings = (
            'Liens utiles et contacts',
            'Documents utiles',
            'Liens utiles',
        )
        trailers = ('Mise en ligne', 'Mis à jour')

        paragraphs = line.get('field_paragraphes', '')
        section = ''
        for heading in headings:
            if heading in paragraphs:
                section = str(paragraphs.partition(heading)[2])
                break

        for trailer in trailers:
            if trailer in section:
                section = str(section.partition(trailer)[0])
                break

        return content_prettify(section)
Exemple #14
0
    def clean(self, value):
        """Run the parent class cleaning, then sanitize the resulting html.

        `a` is passed to `content_prettify` as the only additional
        allowed tag. The returned string is stripped of leading and
        trailing whitespace.
        """
        sanitized = content_prettify(
            super().clean(value), more_allowed_tags=['a'])
        return sanitized.strip()
 def extract_description(self, line):
     """Return the `description` entry of `line`, passed through
     `content_prettify`.
     """
     return content_prettify(line['description'])
 def extract_description(self, line):
     """Return the `objet` entry of `line`, passed through
     `content_prettify`.
     """
     # A single pass is enough: the value used to be prettified twice,
     # unlike every other extractor in the file.
     return content_prettify(line['objet'])
 def extract_eligibility(self, line):
     """Return the `publicsBeneficiairesDetails` entry of `line`, passed
     through `content_prettify`.
     """
     return content_prettify(line['publicsBeneficiairesDetails'])
 def extract_description(self, line):
     """Return the `post_content` entry of `line` ('' when missing),
     passed through `content_prettify`.

     Only `post_content` is used; an earlier variant that also
     prepended `gui_introduction` is disabled.
     """
     post_content = line.get('post_content', '')
     return content_prettify(post_content)
 def extract_eligibility(self, line):
     """Build the eligibility text from `introduction_you` and `you`.

     The introduction is wrapped in a paragraph prefixed with "Vous ";
     the `you` value is used as-is. Each part goes through
     `content_prettify` on its own before concatenation.
     """
     intro = content_prettify(f"<p>Vous {line['introduction_you']}</p>")
     body = content_prettify(line['you'])
     return intro + body
 def extract_description(self, line):
     """Return the text of the `presentation` child of `line`, passed
     through `content_prettify`.
     """
     presentation = line.find('presentation')
     return content_prettify(presentation.text)
Exemple #21
0
 def extract_eligibility(self, line):
     """Return the `aidconditions` entry of `line` ('' when missing),
     passed through `content_prettify`.
     """
     # The source value already carries its own <p></p> tags.
     conditions = line.get('aidconditions', '')
     return content_prettify(conditions)
Exemple #22
0
 def extract_description(self, line):
     """Concatenate the prettified `aid_objet` and `aid_operations_ei`
     entries of `line` (each '' when missing), in that order.
     """
     # Both source values already carry their own <p></p> tags.
     purpose = content_prettify(line.get('aid_objet', ''))
     operations = content_prettify(line.get('aid_operations_ei', ''))
     return purpose + operations
Exemple #23
0
    def aid_parse(self, response):
        """Yield one item built from an aid detail page.

        The item holds:

        - the title and category texts read from the page;
        - a description made of the page intro (`div.mod-chapo`)
          followed by the `objectifs` section;
        - two flags telling whether the call-for-project / europe tags
          are present on the page;
        - the header values (`div.categories` blocks) and the detail
          sections (`div.mod-textSimple` blocks), both mapped to fixed
          keys by position;
        - the prettified contact block, the `pub_date` found in
          `response.meta` and the request url.

        Page blocks in excess of the known keys are ignored; keys with
        no matching block keep the value ''.
        """
        title = response.css('h1.headline-aide::text').get().strip()
        subtitle = response.css('div.mod-chapo').get().strip()

        categorie = response.css(
            'ul.m-breadcrumb__list > li:nth-child(2) > span::text').get()

        is_call_for_project = bool(
            response.css('span.tag-appel-projet').get()
            or response.css('span.tag-appel-manifestation').get())
        is_dispositif_europe = bool(response.css('span.tag-europe').get())

        # Header blocks are matched to these keys by position.
        header_keys = (
            'publics_concernes',  # targeted_audiences
            'domaines_secondaires',  # categories
            'date_de_fin_de_publication',  # submission_deadline
        )
        aid_header = dict.fromkeys(header_keys, '')
        # zip() stops at the shortest input, so an unexpected extra block
        # on the page is skipped instead of raising IndexError.
        for key, item in zip(header_keys, response.css('div.categories')):
            aid_header[key] = content_prettify(
                item.css('p::text').get()).strip().replace('\n', '').replace(
                    '   ', '').replace('  ,  ', ';')

        # Detail sections are matched to these keys by position as well.
        detail_keys = (
            'objectifs',
            'calendrier',
            'beneficiaires',
            'montant',
            'criteres',
            'modalites',
        )
        aid_details = dict.fromkeys(detail_keys, '')
        detail_blocks = response.css(
            'div.dispositif-aide > div.mod-textSimple')
        for key, item in zip(detail_keys, detail_blocks):
            aid_details[key] = content_prettify(item.get())

        contact = response.css('div.contact-adresses').get()

        current_url = response.request.url

        # NOTE(review): `objectifs` was already prettified in the loop
        # above, so it goes through content_prettify a second time here;
        # left unchanged, confirm whether that is intended.
        description = (
            subtitle + '<br />' + content_prettify(aid_details['objectifs']))

        yield {
            'title': title,
            'description': description,
            'categorie': categorie,
            'is_call_for_project': is_call_for_project,
            'is_dispositif_europe': is_dispositif_europe,
            **aid_header,
            **aid_details,
            'contact': content_prettify(contact),
            'pub_date': response.meta['pub_date'],
            'current_url': current_url,
        }