Exemple #1
0
    def _retrieve_and_parse(self, ftp, config, filename, provider,
                            registered_parser):
        """Download ``filename`` over ``ftp`` and parse it.

        The file is written below ``config['dest_path']``. When that key is
        missing, a new temporary directory is created and stored back into
        ``config`` so later calls reuse it.

        :param ftp: FTP client object providing ``retrbinary``
        :param config: provider configuration dict; may gain ``dest_path``
        :param filename: name of the remote file to retrieve
        :param provider: ingest provider, handed to the feed parser
        :param registered_parser: parser registered for the provider; if it
            is an ``XMLFeedParser`` the file is parsed as XML first,
            otherwise the parser receives the local file path
        :return: a one-element list containing the parse result (a single
            dict result is wrapped in a list before being appended)
        :raises Exception: when the download fails; the partial local file
            is removed and the FTP error is attached as ``__cause__``
        """
        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')
        local_file_path = os.path.join(config['dest_path'], filename)

        download_error = None
        with open(local_file_path, 'wb') as f:
            try:
                ftp.retrbinary('RETR %s' % filename, f.write)
            except ftplib.all_errors as ex:
                download_error = ex

        if download_error is not None:
            # Clean up only after the handle is closed: deleting a file that
            # is still open fails on some platforms and would hide the real
            # FTP error behind the deletion error.
            os.remove(local_file_path)
            raise Exception(
                'Exception retrieving file from FTP server ({filename})'.
                format(filename=filename)) from download_error

        if isinstance(registered_parser, XMLFeedParser):
            xml = etree.parse(local_file_path).getroot()
            parser = self.get_feed_parser(provider, xml)
            parsed = parser.parse(xml, provider)
        else:
            parser = self.get_feed_parser(provider, local_file_path)
            parsed = parser.parse(local_file_path, provider)

        if isinstance(parsed, dict):
            parsed = [parsed]

        return [parsed]
Exemple #2
0
    def _update(self, provider):
        """Fetch and parse new files from the provider's FTP directory.

        Connects to ``config['host']``, logs in, changes to ``config['path']``
        and walks the ``mlsd`` listing. Entries whose name does not end with
        ``self.FILE_SUFFIX`` are skipped, as are entries modified before
        ``provider['last_updated']`` and entries that already exist in the
        local ``dest_path`` directory.

        :param provider: ingest provider dict (reads ``config`` and
            ``last_updated``)
        :return: list with one parsed message per downloaded file
        :raises: any FTP or parsing error propagates to the caller; a file
            whose download failed is removed first so it can be retried
        """
        # Local import: this block did not previously depend on ``os``.
        import os

        config = provider.get('config', {})
        last_updated = provider.get('last_updated')

        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

        items = []
        with ftplib.FTP(config.get('host')) as ftp:
            ftp.login(config.get('username'), config.get('password'))
            ftp.cwd(config.get('path', ''))

            for filename, facts in ftp.mlsd():
                if not filename.endswith(self.FILE_SUFFIX):
                    continue

                if last_updated:
                    item_last_updated = datetime.strptime(facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                    if item_last_updated < last_updated:
                        continue

                dest = os.path.join(config['dest_path'], filename)

                # 'x' mode makes an already-downloaded file raise instead of
                # being overwritten; such files are simply skipped.
                try:
                    out = open(dest, 'xb')
                except FileExistsError:
                    continue

                try:
                    with out:
                        ftp.retrbinary('RETR %s' % filename, out.write)
                except ftplib.all_errors:
                    # Without this cleanup the partial file would make every
                    # later run hit FileExistsError and skip the file forever.
                    os.remove(dest)
                    raise

                xml = etree.parse(dest).getroot()
                items.append(get_xml_parser(xml).parse_message(xml))
        return items
Exemple #3
0
    def _update(self, provider, update):
        """Download and parse new files from the provider's FTP directory.

        Only ``mlsd`` entries of type ``file`` whose lower-cased name ends
        with ``self.FILE_SUFFIX`` are considered. When the provider has a
        ``last_updated`` value, entries modified before it are skipped and
        the newest modification time seen is written to
        ``update[LAST_UPDATED]``.

        A file that fails to download is removed locally, logged and
        skipped; a file that already exists locally is logged and skipped.

        :param provider: ingest provider dict (reads ``config`` and
            ``last_updated``)
        :param update: dict receiving ``LAST_UPDATED`` when a newer file
            was seen
        :return: list of parse results, one entry per processed file (a
            single dict result is wrapped in a list)
        :raises IngestFtpError: for any other failure during the run
        """
        config = provider.get('config', {})
        since = provider.get('last_updated')
        newest_seen = None

        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

        try:
            with ftp_connect(config) as ftp:
                items = []
                for filename, facts in ftp.mlsd():
                    is_file = facts.get('type', '') == 'file'
                    if not (is_file and filename.lower().endswith(self.FILE_SUFFIX)):
                        continue

                    if since:
                        modified = datetime.strptime(facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                        if modified < since:
                            continue
                        if not newest_seen or modified > newest_seen:
                            newest_seen = modified

                    target = os.path.join(config['dest_path'], filename)
                    try:
                        with open(target, 'xb') as out:
                            try:
                                ftp.retrbinary('RETR %s' % filename, out.write)
                                fetched = True
                            except ftplib.all_errors:
                                os.remove(target)
                                logger.exception('Exception retrieving from FTP server')
                                fetched = False
                    except FileExistsError:
                        logger.exception('Exception retrieving from FTP server, file already exists')
                        fetched = False

                    if not fetched:
                        continue

                    # XML parsers receive the parsed root element, all other
                    # parsers receive the path of the downloaded file.
                    if isinstance(self.get_feed_parser(provider), XMLFeedParser):
                        source = etree.parse(target).getroot()
                    else:
                        source = target
                    parsed = self.get_feed_parser(provider, source).parse(source, provider)

                    items.append([parsed] if isinstance(parsed, dict) else parsed)
            if newest_seen:
                update[LAST_UPDATED] = newest_seen
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
Exemple #4
0
    def _update(self, provider):
        """Download and parse new files from the provider's FTP directory.

        The ``mlsd`` listing is filtered to entries of type ``file`` whose
        lower-cased name ends with ``self.FILE_SUFFIX`` and, when the
        provider has a ``last_updated`` value, that were not modified before
        it. Files already present locally are skipped silently; files whose
        download fails are removed locally, logged and skipped.

        :param provider: ingest provider dict (reads ``config`` and
            ``last_updated``)
        :return: list of parse results, one entry per processed file (a
            single dict result is wrapped in a list)
        :raises IngestFtpError: for any other failure during the run
        """
        config = provider.get('config', {})
        since = provider.get('last_updated')

        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

        try:
            with ftp_connect(config) as ftp:
                items = []
                for filename, facts in ftp.mlsd():
                    if facts.get('type', '') != 'file':
                        continue
                    if not filename.lower().endswith(self.FILE_SUFFIX):
                        continue

                    if since:
                        stamp = datetime.strptime(facts['modify'], self.DATE_FORMAT)
                        if stamp.replace(tzinfo=utc) < since:
                            continue

                    target = os.path.join(config['dest_path'], filename)
                    try:
                        with open(target, 'xb') as out:
                            try:
                                ftp.retrbinary('RETR %s' % filename, out.write)
                                fetched = True
                            except ftplib.all_errors:
                                os.remove(target)
                                logger.exception(
                                    'Exception retrieving from FTP server')
                                fetched = False
                    except FileExistsError:
                        fetched = False

                    if not fetched:
                        continue

                    # XML parsers work on the parsed root element, any other
                    # parser is given the local file path.
                    registered_parser = self.get_feed_parser(provider)
                    if isinstance(registered_parser, XMLFeedParser):
                        source = etree.parse(target).getroot()
                    else:
                        source = target
                    parser = self.get_feed_parser(provider, source)
                    parsed = parser.parse(source, provider)

                    items.append(
                        [parsed] if isinstance(parsed, dict) else parsed)
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
Exemple #5
0
    def _update(self, provider):
        """Download and parse new files from the provider's FTP directory.

        Connects to ``config["host"]``, logs in, changes to
        ``config["path"]`` and sets passive mode from ``config["passive"]``
        (off when the key is missing). Only ``mlsd`` entries of type
        ``file`` whose lower-cased name ends with ``self.FILE_SUFFIX`` are
        considered; entries modified before ``provider["last_updated"]`` are
        skipped. Files already present locally are skipped silently; files
        whose download fails are removed locally, logged and skipped.

        :param provider: ingest provider dict (reads ``config`` and
            ``last_updated``)
        :return: list of parse results, one entry per processed file (a
            single dict result is wrapped in a list)
        :raises IngestFtpError: for any other failure during the run
        """
        config = provider.get("config", {})
        since = provider.get("last_updated")

        if "dest_path" not in config:
            config["dest_path"] = tempfile.mkdtemp(prefix="superdesk_ingest_")

        try:
            with ftplib.FTP(config.get("host")) as ftp:
                ftp.login(config.get("username"), config.get("password"))
                ftp.cwd(config.get("path", ""))
                ftp.set_pasv(config.get("passive", False))

                items = []
                for filename, facts in ftp.mlsd():
                    is_file = facts.get("type", "") == "file"
                    if not (is_file and filename.lower().endswith(self.FILE_SUFFIX)):
                        continue

                    if since:
                        modified = datetime.strptime(facts["modify"], self.DATE_FORMAT).replace(tzinfo=utc)
                        if modified < since:
                            continue

                    target = os.path.join(config["dest_path"], filename)
                    try:
                        with open(target, "xb") as out:
                            try:
                                ftp.retrbinary("RETR %s" % filename, out.write)
                                fetched = True
                            except ftplib.all_errors:
                                os.remove(target)
                                logger.exception("Exception retrieving from FTP server")
                                fetched = False
                    except FileExistsError:
                        fetched = False

                    if not fetched:
                        continue

                    # XML parsers get the parsed root element, every other
                    # parser gets the path of the downloaded file.
                    if isinstance(self.get_feed_parser(provider), XMLFeedParser):
                        source = etree.parse(target).getroot()
                    else:
                        source = target
                    parsed = self.get_feed_parser(provider, source).parse(source, provider)

                    items.append([parsed] if isinstance(parsed, dict) else parsed)
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
Exemple #6
0
    def _update(self, provider):
        """Download and parse new XML files from the provider's FTP directory.

        Connects to ``config['host']``, logs in, changes to
        ``config['path']`` and sets passive mode from ``config['passive']``
        (off when the key is missing). Only ``mlsd`` entries of type
        ``file`` whose lower-cased name ends with ``self.FILE_SUFFIX`` are
        considered; entries modified before ``provider['last_updated']`` and
        files already present locally are skipped.

        :param provider: ingest provider dict (reads ``config`` and
            ``last_updated``)
        :return: list of parse results, one entry per processed file (a
            single dict result is wrapped in a list)
        :raises IngestFtpError: when no parser matches a file, or wrapping
            any other failure (a file whose download failed is removed
            locally first so that a later run can retry it)
        """
        config = provider.get('config', {})
        last_updated = provider.get('last_updated')

        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

        items = []
        try:
            with ftplib.FTP(config.get('host')) as ftp:
                ftp.login(config.get('username'), config.get('password'))
                ftp.cwd(config.get('path', ''))
                ftp.set_pasv(config.get('passive', False))

                for filename, facts in ftp.mlsd():
                    if facts.get('type', '') != 'file':
                        continue

                    if not filename.lower().endswith(self.FILE_SUFFIX):
                        continue

                    if last_updated:
                        item_last_updated = datetime.strptime(
                            facts['modify'],
                            self.DATE_FORMAT).replace(tzinfo=utc)
                        if item_last_updated < last_updated:
                            continue

                    dest = os.path.join(config['dest_path'], filename)

                    # 'x' mode refuses to overwrite: an existing local file
                    # means it was fetched before, so it is skipped.
                    try:
                        out = open(dest, 'xb')
                    except FileExistsError:
                        continue

                    try:
                        with out:
                            ftp.retrbinary('RETR %s' % filename, out.write)
                    except ftplib.all_errors:
                        # A partial file left behind would trigger the
                        # FileExistsError skip on every later run.
                        os.remove(dest)
                        raise

                    xml = etree.parse(dest).getroot()
                    parser = get_xml_parser(xml)
                    if not parser:
                        raise IngestFtpError.ftpUnknownParserError(
                            Exception('Parser not found'), provider, filename)
                    parsed = parser.parse_message(xml, provider)
                    if isinstance(parsed, dict):
                        parsed = [parsed]

                    items.append(parsed)
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
Exemple #7
0
    def _update(self, provider):
        """Download and parse new files from the provider's FTP directory.

        Connects to ``config['host']``, logs in, changes to
        ``config['path']`` and sets passive mode from ``config['passive']``
        (off when the key is missing). Only ``mlsd`` entries of type
        ``file`` whose lower-cased name ends with ``self.FILE_SUFFIX`` are
        considered; entries modified before ``provider['last_updated']`` and
        files already present locally are skipped.

        :param provider: ingest provider dict (reads ``config`` and
            ``last_updated``)
        :return: list of parse results, one entry per processed file (a
            single dict result is wrapped in a list)
        :raises IngestFtpError: wrapping any failure during the run (a file
            whose download failed is removed locally first so that a later
            run can retry it)
        """
        config = provider.get('config', {})
        last_updated = provider.get('last_updated')

        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

        try:
            with ftplib.FTP(config.get('host')) as ftp:
                ftp.login(config.get('username'), config.get('password'))
                ftp.cwd(config.get('path', ''))
                ftp.set_pasv(config.get('passive', False))

                items = []
                for filename, facts in ftp.mlsd():
                    if facts.get('type', '') != 'file':
                        continue

                    if not filename.lower().endswith(self.FILE_SUFFIX):
                        continue

                    if last_updated:
                        item_last_updated = datetime.strptime(facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                        if item_last_updated < last_updated:
                            continue

                    local_file_path = os.path.join(config['dest_path'], filename)

                    # 'x' mode refuses to overwrite: an existing local file
                    # means it was fetched before, so it is skipped.
                    try:
                        out = open(local_file_path, 'xb')
                    except FileExistsError:
                        continue

                    try:
                        with out:
                            ftp.retrbinary('RETR %s' % filename, out.write)
                    except ftplib.all_errors:
                        # A partial file left behind would trigger the
                        # FileExistsError skip on every later run.
                        os.remove(local_file_path)
                        raise

                    # XML parsers get the parsed root element, every other
                    # parser gets the path of the downloaded file.
                    registered_parser = self.get_feed_parser(provider)
                    if isinstance(registered_parser, XMLFeedParser):
                        xml = etree.parse(local_file_path).getroot()
                        parser = self.get_feed_parser(provider, xml)
                        parsed = parser.parse(xml, provider)
                    else:
                        parser = self.get_feed_parser(provider, local_file_path)
                        parsed = parser.parse(local_file_path, provider)

                    if isinstance(parsed, dict):
                        parsed = [parsed]

                    items.append(parsed)
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
Exemple #8
0
    def _update(self, provider):
        """Download and parse new XML files from the provider's FTP directory.

        Connects to ``config['host']``, logs in and changes to
        ``config['path']``. The directory listing comes from
        ``self._get_items(ftp)``; each entry is unpacked as a
        ``(filename, facts)`` pair. Only entries of type ``file`` whose
        lower-cased name ends with ``self.FILE_SUFFIX`` are considered;
        entries modified before ``provider['last_updated']`` and files
        already present locally are skipped.

        :param provider: ingest provider dict (reads ``config`` and
            ``last_updated``)
        :return: list with one parsed message per downloaded file
        :raises IngestFtpError: when no parser matches a file, or wrapping
            any other failure (a file whose download failed is removed
            locally first so that a later run can retry it)
        """
        config = provider.get('config', {})
        last_updated = provider.get('last_updated')

        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

        items = []
        try:
            with ftplib.FTP(config.get('host')) as ftp:
                ftp.login(config.get('username'), config.get('password'))
                ftp.cwd(config.get('path', ''))

                # Keep the listing separate from the results: appending
                # parsed messages to the collection being iterated would
                # feed them back into this loop as directory entries and
                # mix raw listing rows into the return value.
                listing = self._get_items(ftp)
                for filename, facts in listing:
                    if facts.get('type', '') != 'file':
                        continue

                    if not filename.lower().endswith(self.FILE_SUFFIX):
                        continue

                    if last_updated:
                        item_last_updated = datetime.strptime(facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                        if item_last_updated < last_updated:
                            continue

                    dest = os.path.join(config['dest_path'], filename)

                    # 'x' mode refuses to overwrite: an existing local file
                    # means it was fetched before, so it is skipped.
                    try:
                        out = open(dest, 'xb')
                    except FileExistsError:
                        continue

                    try:
                        with out:
                            ftp.retrbinary('RETR %s' % filename, out.write)
                    except ftplib.all_errors:
                        # A partial file left behind would trigger the
                        # FileExistsError skip on every later run.
                        os.remove(dest)
                        raise

                    xml = etree.parse(dest).getroot()
                    parser = get_xml_parser(xml)
                    if not parser:
                        raise IngestFtpError.ftpUnknownParserError(Exception('Parser not found'),
                                                                   provider, filename)
                    items.append(parser.parse_message(xml, provider))
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
Exemple #9
0
    def _retrieve_and_parse(self, ftp, config, filename, provider, registered_parser):
        """Download ``filename`` over ``ftp``, parse it and log timings.

        The file is written below ``config['dest_path']``. When that key is
        missing, a new temporary directory is created and stored back into
        ``config``. Progress is reported through ``self._log_msg`` using the
        ``'retrieve_parse'`` timer of ``self._timer``.

        :param ftp: FTP client object providing ``retrbinary``
        :param config: provider configuration dict; may gain ``dest_path``
        :param filename: name of the remote file to retrieve
        :param provider: ingest provider, handed to the feed parser
        :param registered_parser: parser registered for the provider; if it
            is an ``XMLFeedParser`` the file is parsed as XML first,
            otherwise the parser receives the local file path
        :return: the parse result; a single dict is wrapped in a list
        :raises Exception: when the download fails; the partial local file
            is removed and the FTP error is attached as ``__cause__``
        """
        self._timer.start('retrieve_parse')

        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')
        local_file_path = os.path.join(config['dest_path'], filename)

        download_error = None
        with open(local_file_path, 'wb') as f:
            try:
                ftp.retrbinary('RETR %s' % filename, f.write)
            except ftplib.all_errors as ex:
                download_error = ex

        if download_error is not None:
            self._log_msg(
                "Download failed. Exec time: {:.4f} secs. File: {}.".format(
                    self._timer.stop('retrieve_parse'),
                    filename
                )
            )
            # The handle is closed by now: deleting a file that is still
            # open fails on some platforms and would hide the FTP error.
            os.remove(local_file_path)
            raise Exception('Exception retrieving file from FTP server ({filename})'.format(
                            filename=filename)) from download_error

        # The size is read after the file is closed; while it is open the
        # last buffered chunk may not be on disk yet and the reported size
        # would be too small.
        self._log_msg(
            "Download finished. Exec time: {:.4f} secs. Size: {} bytes. File: {}.".format(
                self._timer.split('retrieve_parse'),
                os.path.getsize(local_file_path),
                filename
            )
        )

        if isinstance(registered_parser, XMLFeedParser):
            xml = etree.parse(local_file_path).getroot()
            parser = self.get_feed_parser(provider, xml)
            parsed = parser.parse(xml, provider)
        else:
            parser = self.get_feed_parser(provider, local_file_path)
            parsed = parser.parse(local_file_path, provider)

        self._log_msg(
            "Parsing finished. Exec time: {:.4f} secs. File: {}.".format(
                self._timer.stop('retrieve_parse'),
                filename
            )
        )

        return [parsed] if isinstance(parsed, dict) else parsed
Exemple #10
0
    def _retrieve_and_parse(self, ftp, config, filename, provider,
                            registered_parser):
        """Download ``filename`` over ``ftp``, parse it and log timings.

        The file is written below ``config["dest_path"]``. When that key is
        missing, a new temporary directory is created and stored back into
        ``config``. Progress is reported through ``self._log_msg`` using the
        ``"retrieve_parse"`` timer of ``self._timer``.

        :param ftp: FTP client object providing ``retrbinary``
        :param config: provider configuration dict; may gain ``dest_path``
        :param filename: name of the remote file to retrieve
        :param provider: ingest provider, handed to the feed parser
        :param registered_parser: parser registered for the provider; if it
            is an ``XMLFeedParser`` the file is parsed as XML first,
            otherwise the parser receives the local file path
        :return: the parse result; a single dict is wrapped in a list
        :raises EmptyFile: when ``self._is_empty`` reports the downloaded
            file as empty
        :raises Exception: when the download fails; the partial local file
            is removed and the FTP error is attached as ``__cause__``
        """
        self._timer.start("retrieve_parse")

        if "dest_path" not in config:
            config["dest_path"] = tempfile.mkdtemp(prefix="superdesk_ingest_")
        local_file_path = os.path.join(config["dest_path"], filename)

        download_error = None
        with open(local_file_path, "wb") as f:
            try:
                ftp.retrbinary("RETR %s" % filename, f.write)
            except ftplib.all_errors as ex:
                download_error = ex

        if download_error is not None:
            self._log_msg(
                "Download failed. Exec time: {:.4f} secs. File: {}.".
                format(self._timer.stop("retrieve_parse"), filename))
            # The handle is closed by now: deleting a file that is still
            # open fails on some platforms and would hide the FTP error.
            os.remove(local_file_path)
            raise Exception(
                "Exception retrieving file from FTP server ({filename})".
                format(filename=filename)) from download_error

        # The size is read after the file is closed; while it is open the
        # last buffered chunk may not be on disk yet and the reported size
        # would be too small.
        self._log_msg(
            "Download finished. Exec time: {:.4f} secs. Size: {} bytes. File: {}."
            .format(self._timer.split("retrieve_parse"),
                    os.path.getsize(local_file_path), filename))

        if self._is_empty(local_file_path):
            logger.info(
                "ignoring empty file {filename}".format(filename=filename))
            raise EmptyFile(local_file_path)

        if isinstance(registered_parser, XMLFeedParser):
            xml = etree.parse(local_file_path).getroot()
            parser = self.get_feed_parser(provider, xml)
            parsed = parser.parse(xml, provider)
        else:
            parser = self.get_feed_parser(provider, local_file_path)
            parsed = parser.parse(local_file_path, provider)

        self._log_msg(
            "Parsing finished. Exec time: {:.4f} secs. File: {}.".format(
                self._timer.stop("retrieve_parse"), filename))

        return [parsed] if isinstance(parsed, dict) else parsed
Exemple #11
0
    def _update(self, provider, update):
        """Download, parse and optionally move files from the FTP directory.

        Only ``mlsd`` entries of type ``file`` accepted by
        ``self._is_allowed`` are processed; the allowed extensions come from
        the registered parser's ``ALLOWED_EXT`` or, when it has none, from
        ``self.ALLOWED_EXT_DEFAULT``. When the provider has a
        ``last_updated`` value, entries not newer than it are skipped and
        the newest modification time seen is written to
        ``update[LAST_UPDATED]``.

        When ``config['move']`` is truthy, processed files are moved on the
        server to the success directory and files that raised an error to
        the failure directory. If those directories cannot be created, a
        warning is logged and moving is disabled for this run.

        Errors raised while handling a single file are logged and do not
        stop the loop.

        :param provider: ingest provider dict (reads ``config`` and
            ``last_updated``)
        :param update: dict receiving ``LAST_UPDATED`` when a newer file
            was seen
        :return: list of parse results, one entry per processed file (a
            single dict result is wrapped in a list)
        :raises IngestFtpError: for failures outside the per-file handling
        """
        config = provider.get('config', {})
        last_updated = provider.get('last_updated')
        registered_parser = self.get_feed_parser(provider)
        try:
            allowed_ext = registered_parser.ALLOWED_EXT
        except AttributeError:
            allowed_ext = self.ALLOWED_EXT_DEFAULT
        newest_seen = None

        do_move = bool(config.get('move', False))
        if do_move:
            remote_base = config.get('path', '')

            success_dir = config.get('move_path')
            if not success_dir:
                logger.debug('missing move_path, default will be used')
            move_dest_path = os.path.join(
                remote_base, success_dir or DEFAULT_SUCCESS_PATH)

            failure_dir = config.get('move_path_error')
            if not failure_dir:
                logger.debug('missing move_path_error, default will be used')
            move_dest_path_error = os.path.join(
                remote_base, failure_dir or DEFAULT_FAILURE_PATH)

        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

        try:
            with ftp_connect(config) as ftp:
                if do_move:
                    try:
                        self._create_if_missing(ftp, move_dest_path)
                        self._create_if_missing(ftp, move_dest_path_error)
                    except ftplib.all_errors as e:
                        logger.warning(
                            "Can't create move directory, files will not be moved: {reason}"
                            .format(reason=e))
                        do_move = False
                items = []
                for filename, facts in ftp.mlsd():
                    if facts.get('type', '') != 'file':
                        continue
                    try:
                        if not self._is_allowed(filename, allowed_ext):
                            logger.info(
                                'ignoring file {filename} because of file extension'
                                .format(filename=filename))
                            continue

                        if last_updated:
                            modified = datetime.strptime(
                                facts['modify'],
                                self.DATE_FORMAT).replace(tzinfo=utc)
                            if modified <= last_updated:
                                continue
                            if not newest_seen or modified > newest_seen:
                                newest_seen = modified

                        target = os.path.join(config['dest_path'], filename)
                        with open(target, 'wb') as out:
                            try:
                                ftp.retrbinary('RETR %s' % filename, out.write)
                            except ftplib.all_errors:
                                os.remove(target)
                                raise Exception(
                                    'Exception retrieving file from FTP server ({filename})'
                                    .format(filename=filename))

                        # XML parsers get the parsed root element, every
                        # other parser gets the path of the downloaded file.
                        if isinstance(registered_parser, XMLFeedParser):
                            source = etree.parse(target).getroot()
                        else:
                            source = target
                        parser = self.get_feed_parser(provider, source)
                        parsed = parser.parse(source, provider)

                        items.append(
                            [parsed] if isinstance(parsed, dict) else parsed)
                        if do_move:
                            self._move(ftp, filename,
                                       os.path.join(move_dest_path, filename))
                    except Exception as e:
                        logger.error(
                            "Error while parsing {filename}: {msg}".format(
                                filename=filename, msg=e))
                        if do_move:
                            self._move(
                                ftp, filename,
                                os.path.join(move_dest_path_error, filename))
            if newest_seen:
                update[LAST_UPDATED] = newest_seen
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)
Exemple #12
0
    def _update(self, provider, update):
        """Download, parse and optionally move files from the FTP directory.

        Every ``mlsd`` entry of type ``file`` is handled. When the provider
        has a ``last_updated`` value, entries modified before it are skipped
        and the newest modification time seen is written to
        ``update[LAST_UPDATED]``.

        When ``config['move']`` is truthy, processed files are moved on the
        server to the success directory and files that raised an error to
        the failure directory. If those directories cannot be created, a
        warning is logged and moving is disabled for this run.

        Errors raised while handling a single file are logged and do not
        stop the loop. This includes a name that does not end with
        ``self.FILE_SUFFIX``, a failed download and a file that already
        exists locally.

        :param provider: ingest provider dict (reads ``config`` and
            ``last_updated``)
        :param update: dict receiving ``LAST_UPDATED`` when a newer file
            was seen
        :return: list of parse results, one entry per processed file (a
            single dict result is wrapped in a list)
        :raises IngestFtpError: for failures outside the per-file handling
        """
        config = provider.get('config', {})
        last_updated = provider.get('last_updated')
        crt_last_updated = None
        if config.get('move', False):
            do_move = True
            if not config.get('move_path'):
                logger.debug('missing move_path, default will be used')
            move_dest_path = os.path.join(config.get('path', ''), config.get('move_path') or DEFAULT_SUCCESS_PATH)
            if not config.get('move_path_error'):
                logger.debug('missing move_path_error, default will be used')
            move_dest_path_error = os.path.join(config.get('path', ''),
                                                config.get('move_path_error') or DEFAULT_FAILURE_PATH)
        else:
            do_move = False

        if 'dest_path' not in config:
            config['dest_path'] = tempfile.mkdtemp(prefix='superdesk_ingest_')

        try:
            with ftp_connect(config) as ftp:
                if do_move:
                    try:
                        self._create_if_missing(ftp, move_dest_path)
                        self._create_if_missing(ftp, move_dest_path_error)
                    except ftplib.all_errors as e:
                        logger.warning("Can't create move directory, files will not be moved: {reason}".format(
                            reason=e))
                        do_move = False
                items = []
                for filename, facts in ftp.mlsd():
                    if facts.get('type', '') != 'file':
                        continue
                    try:
                        if not filename.lower().endswith(self.FILE_SUFFIX):
                            # Raise a real exception so the handler below
                            # logs a meaningful reason (and moves the file
                            # to the failure directory). A bare ``raise``
                            # here has no active exception to re-raise.
                            raise Exception('Unsupported file extension ({filename})'.format(
                                            filename=filename))

                        if last_updated:
                            item_last_updated = datetime.strptime(facts['modify'], self.DATE_FORMAT).replace(tzinfo=utc)
                            if item_last_updated < last_updated:
                                continue
                            elif not crt_last_updated or item_last_updated > crt_last_updated:
                                crt_last_updated = item_last_updated

                        local_file_path = os.path.join(config['dest_path'], filename)
                        try:
                            with open(local_file_path, 'xb') as f:
                                try:
                                    ftp.retrbinary('RETR %s' % filename, f.write)
                                except ftplib.all_errors as ex:
                                    os.remove(local_file_path)
                                    raise Exception('Exception retrieving file from FTP server ({filename})'.format(
                                                    filename=filename)) from ex
                        except FileExistsError as e:
                            # The placeholder must be closed with '}';
                            # otherwise str.format raises ValueError and the
                            # intended message is never produced.
                            raise Exception('Exception retrieving from FTP server, file already exists ({filename})'
                                            .format(filename=local_file_path)) from e

                        # XML parsers get the parsed root element, every
                        # other parser gets the path of the downloaded file.
                        registered_parser = self.get_feed_parser(provider)
                        if isinstance(registered_parser, XMLFeedParser):
                            xml = etree.parse(local_file_path).getroot()
                            parser = self.get_feed_parser(provider, xml)
                            parsed = parser.parse(xml, provider)
                        else:
                            parser = self.get_feed_parser(provider, local_file_path)
                            parsed = parser.parse(local_file_path, provider)

                        if isinstance(parsed, dict):
                            parsed = [parsed]

                        items.append(parsed)
                        if do_move:
                            move_dest_file_path = os.path.join(move_dest_path, filename)
                            self._move(ftp, filename, move_dest_file_path)
                    except Exception as e:
                        logger.error("Error while parsing {filename}: {msg}".format(filename=filename, msg=e))
                        if do_move:
                            move_dest_file_path_error = os.path.join(move_dest_path_error, filename)
                            self._move(ftp, filename, move_dest_file_path_error)
            if crt_last_updated:
                update[LAST_UPDATED] = crt_last_updated
            return items
        except IngestFtpError:
            raise
        except Exception as ex:
            raise IngestFtpError.ftpError(ex, provider)