def _format_body_content(self, article, body_content):
        """Build the plain-text body of the article and store it in a ``<pre>`` element.

        The text is assembled, in order, from the editorial note, the byline,
        the body (with the dateline injected into the first paragraph unless
        the item was auto published), the body footer and the sign off.

        :param article: article dict the text is read from
        :param body_content: parent element that receives the new ``pre`` child
        """
        chunks = []

        ednote = article.get('ednote')
        if ednote:
            chunks.append(to_ascii(self._format_line(ednote)))

        byline = article.get(BYLINE)
        if byline:
            chunks.append(to_ascii(self._format_line(get_text(byline))))

        if article.get(FORMAT) == FORMATS.PRESERVED:
            # preserved content: footer is appended first, then markup is stripped
            preserved_text = get_text(self.append_body_footer(article), content='html')
            chunks.append(to_ascii(preserved_text))
        else:
            body = article.get('body_html', '')

            # the dateline goes in front of the text of the first paragraph
            wants_dateline = article.get('dateline', {}).get('text') and not article.get('auto_publish', False)
            if wants_dateline:
                root = parse_html(article.get('body_html'))
                first_paragraph = root.find('.//p')
                if first_paragraph is not None:
                    first_paragraph.text = article['dateline']['text'] + ' ' + (first_paragraph.text or '')
                    body = to_string(root)

            chunks.append(self.get_text_content(body))

            footer = article.get('body_footer')
            if footer:
                chunks.append(self.get_text_content(footer))

        source = article.get('source') or ''
        signature = article.get('sign_off') or ''
        sign_off = '{} {}'.format(source, signature).strip()
        if sign_off:
            chunks.append(to_ascii(self._format_line(sign_off)))

        pre_element = SubElement(body_content, 'pre')
        pre_element.text = ''.join(chunks)
    def parse(self, file_path, provider=None):
        """Parse an AsiaNet media release text file into an ingest item.

        :param file_path: path of the windows-1252 encoded file to read
        :param provider: ingest provider, not used by this parser
        :return: item dict with the header fields, ``body_html`` and ``word_count``
        :raises: the error built by ``AAPParserError.AsiaNetParserError`` on any failure
        """
        try:
            item = {
                'guid': '{}-{}'.format(file_path, uuid.uuid4()),
                'pubstatus': 'usable',
                'versioncreated': utcnow(),
                ITEM_TYPE: CONTENT_TYPE.TEXT,
                FORMAT: FORMATS.HTML,
            }

            with open(file_path, 'r', encoding='windows-1252') as source_file:
                raw = source_file.read()
            data = raw.replace('\r', '')

            # the file must hold at least three blank-line separated sections;
            # only the first one (the header) is used here
            header, _dateline_section, _body_section = data.split('\n\n', 2)
            self._process_header(item, header)

            # the body is everything from the release marker onwards
            start_of_body = 'MEDIA RELEASE '
            _preamble, remainder = data.split(start_of_body, 1)
            data = start_of_body + remainder

            item['anpa_category'] = [{'qcode': 'j'}]
            item['original_source'] = 'AsiaNet'

            # blank lines become paragraph breaks, single newlines become spaces
            escaped = to_ascii(html.escape(data))
            paragraphs = escaped.replace('\n\n', '</p><p>')
            item['body_html'] = '<p>' + paragraphs.replace('\n', ' ') + '</p>'
            item['word_count'] = get_word_count(item['body_html'])

            return item
        except Exception as ex:
            raise AAPParserError.AsiaNetParserError(file_path, ex)
Exemple #3
0
    def parse(self, file_path, provider=None):
        """Parse an AsiaNet media release text file into an ingest item.

        :param file_path: path of the windows-1252 encoded file to read
        :param provider: ingest provider, not used by this parser
        :return: item dict with the header fields, ``body_html`` and ``word_count``
        :raises: the error built by ``AAPParserError.AsiaNetParserError`` on any failure
        """
        try:
            item = {
                'guid': '{}-{}'.format(file_path, uuid.uuid4()),
                'pubstatus': 'usable',
                'versioncreated': utcnow(),
                ITEM_TYPE: CONTENT_TYPE.TEXT,
                FORMAT: FORMATS.HTML,
            }

            with open(file_path, 'r', encoding='windows-1252') as source_file:
                raw = source_file.read()
            data = raw.replace('\r', '')

            # the file must hold at least three blank-line separated sections;
            # only the first one (the header) is used here
            header, _dateline_section, _body_section = data.split('\n\n', 2)
            self._process_header(item, header)

            # the body is everything from the release marker onwards
            start_of_body = 'MEDIA RELEASE '
            _preamble, remainder = data.split(start_of_body, 1)
            data = start_of_body + remainder

            item['anpa_category'] = [{'qcode': 'j'}]
            item['original_source'] = 'AsiaNet'

            # blank lines become paragraph breaks, single newlines become spaces
            escaped = to_ascii(html.escape(data))
            paragraphs = escaped.replace('\n\n', '</p><p>')
            item['body_html'] = '<p>' + paragraphs.replace('\n', ' ') + '</p>'
            item['word_count'] = get_word_count(item['body_html'])

            return item
        except Exception as ex:
            raise AAPParserError.AsiaNetParserError(file_path, ex)
    def get_text_content(self, content):
        """Convert an HTML fragment to ASCII text with one line per block element.

        :param content: HTML string to convert
        :return: result of ``to_ascii`` on the flattened text, where each
            non-empty child of ``/html/div`` is prefixed with ``self.line_prefix``
            and runs of line-break characters are collapsed to ``self.line_feed``
        """
        # normalise <br> variants, then drop control characters other than
        # LF and CR and turn non-breaking spaces into plain spaces
        cleaned = content.replace('<br>', '<br/>').replace('</br>', '')
        cleaned = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', cleaned)
        cleaned = cleaned.replace('\xA0', ' ')

        root = parse_html(cleaned, content='html')

        # every <br> becomes a CRLF in front of whatever text followed it
        for line_break in root.xpath('//br'):
            line_break.tail = '\r\n' + (line_break.tail or '')
        etree.strip_elements(root, 'br', with_tail=False)

        for child in root.xpath('/html/div/child::*'):
            if child.tag == 'br' or not (child.text and child.text.strip()):
                continue
            # newlines not preceded by CR are soft wraps: fold them into spaces
            unwrapped = re.sub('(?<!\r)\n+', ' ', child.text)
            child.text = self.line_prefix + re.sub(' +', ' ', unwrapped)
            child.tail = '\r\n' + (child.tail or '')

        flattened = ''.join(root.itertext())
        # multiple line breaks to one line break
        line_break_run = '[{}]+'.format(self.line_feed)
        return to_ascii(re.sub(line_break_run, self.line_feed, flattened))
Exemple #5
0
    def _update(self, provider, update):
        """Poll the AP Media API and copy new picture renditions to the configured FTP server.

        The request either follows the ``next_link`` saved in the provider
        config or, when none is saved, runs a search restricted to the
        configured ``productList`` and to items newer than ``recoverytime``
        hours ago (1 hour when unset or empty).

        The config dict (the provider's own, when present) is updated with the
        new ``next_link``, an emptied ``recoverytime`` and the current
        ``sequence``, and is assigned to ``update['config']``. This happens
        even when the response contains no items.

        :param provider: ingest provider dict; settings are read from ``provider['config']``
        :param update: dict that receives the updated ``config``
        :return: ``[]`` when the API returns the same next link as last time,
            otherwise ``None``
        :raises: the error built by ``IngestApiError.apiRequestError`` when the
            response body cannot be parsed as JSON; whatever
            ``raise_for_status()`` raises for a failed feed request
        """
        config = provider.get('config', {})
        self.HTTP_URL = config.get('api_url', '')
        self.provider = provider

        # Set the apikey parameter we're going to use it on all calls
        params = {'apikey': config.get('apikey')}

        # NOTE(review): verify=False is passed through to get_url in both
        # branches - presumably this disables TLS certificate verification;
        # confirm that is intended.
        previous_link = config.get('next_link')
        if previous_link:
            # Use the next link if one is available in the config
            r = self.get_url(url=previous_link, params=params, verify=False, allow_redirects=True)
            r.raise_for_status()
        else:
            id_list = config.get('productList', '').strip()
            recovery_time = config.get('recoverytime', '1').strip() or '1'
            start = (utcnow() - timedelta(hours=int(recovery_time))).isoformat()[:19] + 'Z'
            # If there has been a list of products defined then we format them for the request,
            # if not all allowed products will be returned.
            if id_list:
                # we remove spaces and empty values from id_list to do a clean list
                products = ' OR '.join(id_.strip() for id_ in id_list.split(',') if id_.strip())
                params['q'] = 'productid:(' + products + ') AND mindate:>{}'.format(start)
            else:
                params['q'] = 'mindate:>{}'.format(start)
            params['page_size'] = '100'

            r = self.get_url(params=params, verify=False, allow_redirects=True)
            r.raise_for_status()

        try:
            response = json.loads(r.text)
        except Exception:
            raise IngestApiError.apiRequestError(Exception('error parsing response'))

        next_link = response.get('data', {}).get('next_page')
        # Got the same next link as last time so nothing new
        if next_link == previous_link:
            logger.info('Nothing new from AP Media')
            return []

        # The sequence number must be bound before the item loop: it is written
        # back to the config below even when there are no items to process.
        try:
            sequence_number = int(config.get('sequence') or 0)
        except (TypeError, ValueError):
            logger.warning('Invalid sequence %r in provider config, restarting at 0',
                           config.get('sequence'))
            sequence_number = 0

        items = response.get('data', {}).get('items', [])
        if items:
            ftp_settings = {
                'username': config.get('ftp_user', ''),
                'password': config.get('ftp_password', ''),
                'host': config.get('ftp_server', ''),
                'path': config.get('ftp_path', ''),
            }
            filename_template = config.get('filenametemplate', '')
            try:
                with ftp_connect(ftp_settings) as ftp:
                    for item in items:
                        try:
                            entry = item['item']
                            if entry['type'] != 'picture':
                                continue

                            main_rendition = entry['renditions']['main']
                            image_ref = main_rendition['href']
                            if not filename_template:
                                filename = to_ascii(main_rendition['originalfilename'])
                            else:
                                # The filename is generated by applying the date format string
                                # in the template and appending the sequence number
                                filename = datetime.now().strftime(filename_template)
                                filename += '-' + str(sequence_number).zfill(4) + '.jpg'
                                sequence_number = (sequence_number + 1) % 10000

                            logger.info('file: {} versioncreated: {}'.format(
                                filename, entry['versioncreated']))

                            # NOTE(review): no timeout is passed to requests.get here
                            image_response = requests.get(
                                url=image_ref, params={'apikey': config.get('apikey')})
                            image_response.raise_for_status()
                            try:
                                ftp.storbinary('STOR {}'.format(filename),
                                               BytesIO(image_response.content))
                            except ftplib.all_errors as e:
                                logger.error(e)

                        # Any exception processing an individual item is swallowed
                        except Exception as ex:
                            logger.exception(ex)
            except Exception as ex:
                # A failed FTP connection must not prevent the next link being saved
                logger.exception(ex)

        # Save the link for next time
        config['next_link'] = next_link
        config['recoverytime'] = ''
        config['sequence'] = str(sequence_number)
        update['config'] = config

        return None
Exemple #6
0
 def format_text_content(tag):
     """Normalise, in place, the ``text`` of every element yielded by ``tag.iter()``.

     Each ``text`` that is not ``None`` is stripped and passed through
     ``to_ascii``; a single trailing space is then appended unless the text
     ends with the ``__##NBSP##__`` marker.

     :param tag: object exposing ``iter()`` whose items have a ``text``
         attribute - presumably an lxml/ElementTree element; TODO confirm
         against the caller
     """
     for x in tag.iter():
         if x.text is not None:
             # the right-hand side is evaluated before the assignment, so the
             # endswith() check sees the original, unstripped text
             x.text = to_ascii(x.text.strip()) + (
                 ' ' if not x.text.endswith('__##NBSP##__') else '')