Example #1
0
 def __init__(self, feed_path=None, metadata_path=None, description=None):
     """Load configuration, read the feed, and initialise empty state.

     :param feed_path: forwarded to ``FeedReader`` as ``feed_path``.
     :param metadata_path: stored unchanged on ``self.feed_metadata``.
     :param description: stored unchanged on ``self.description``.
     """
     # Configuration and the value looked up from it.
     config = Config()
     self.config = config
     self.source_data = config.path.get('source_data', None)

     # Date stamp taken once, at construction time.
     self.process_date = arrow.now().format('YYYY_MM_DD')
     self.description = description

     # The feed is read eagerly, as part of building the instance.
     reader = FeedReader(feed_path=feed_path)
     self.reader = reader
     self.feed = reader.read_feed()

     # Remaining attributes; this constructor does not populate them further.
     self.feed_metadata = metadata_path
     self.metadata = None
     self.travel_feed = None
Example #2
0
class TravelFeedReader(object):
    """Read a travel feed and its JSON metadata, then save both as CSV.

    The feed itself is read through ``FeedReader`` when the instance is
    constructed. ``process()`` then loads the JSON metadata file, parses the
    feed entries, and writes three CSV files through ``save_rows``.

    NOTE: ``self.feed_metadata`` holds the metadata *path* until
    ``read_feed_metadata()`` runs, after which it holds the parsed metadata
    dict. ``process()`` therefore expects to be called on a fresh instance.
    """

    # arrow format string used for every timestamp written by this class.
    _TIMESTAMP_FORMAT = 'YYYY-MM-DD HH:mm:ss'
    # Values substituted when a summary link has no scheme / no host.
    _DEFAULT_SCHEME = 'http'
    _DEFAULT_NETLOC = 'travel.state.gov'

    def __init__(self,
                 feed_path=None,
                 metadata_path=None,
                 description=None):
        """Load configuration and read the feed.

        :param feed_path: forwarded to ``FeedReader`` as ``feed_path``.
        :param metadata_path: path of the JSON metadata file; kept on
            ``self.feed_metadata`` until ``read_feed_metadata()`` replaces it.
        :param description: label inserted into the output file names.
        """
        self.config = Config()
        self.source_data = self.config.path.get('source_data', None)
        self.process_date = arrow.now().format('YYYY_MM_DD')
        self.description = description
        self.reader = FeedReader(feed_path=feed_path)
        self.feed = self.reader.read_feed()
        self.feed_metadata = metadata_path
        self.metadata = None
        self.travel_feed = None

    def _timestamp(self):
        """Return the current time formatted with ``_TIMESTAMP_FORMAT``."""
        return arrow.now().format(self._TIMESTAMP_FORMAT)

    def _output_file(self, suffix):
        """Return the CSV path for one output; *suffix* follows ``feed``."""
        return '{source_data}/travel_{description}_feed{suffix}_{date}.csv'.format(
            source_data=self.source_data,
            description=self.description,
            suffix=suffix,
            date=self.process_date,
        )

    def process(self):
        """Load metadata, parse the feed, and write the three CSV outputs."""
        self.read_feed_metadata(metadata_path=self.feed_metadata)
        self.read_feed()
        # Feed-level metadata as delivered with the feed.
        self.save_feed(
            feed=self.travel_feed['metadata'],
            field_names=['title', 'subtitle', 'value', 'description',
                         'source', 'etag', 'feed_id', 'created', 'modified'],
            file=self._output_file('_metadata'),
        )
        # One row per parsed entry.
        self.save_feed(
            feed=self.travel_feed['entries'],
            field_names=['id', 'title', 'published', 'source', 'dc_identifier',
                         'summary', 'links', 'feed_id', 'created', 'modified'],
            file=self._output_file('_entries'),
        )
        # The metadata dict built from the JSON file, as a single row.
        self.save_feed(
            feed=[self.feed_metadata],
            field_names=['identifier', 'title', 'description', 'published', 'publisher_name',
                         'source', 'bureau_code', 'start', 'end', 'modified'],
            file=self._output_file(''),
        )

    def read_feed(self):
        """Populate ``self.travel_feed`` with ``metadata`` and ``entries``."""
        self.travel_feed = {
            'metadata': self.get_metadata(),
            'entries': self.get_parsed_entries(entries=self.feed['entries']),
        }

    def get_metadata(self):
        """Stamp the feed's metadata dict and return it wrapped in a list.

        The dict at ``self.feed['metadata']`` is modified in place and also
        stored on ``self.metadata``.
        """
        metadata = self.feed['metadata']
        # Take the time once so ``created`` and ``modified`` always agree.
        now = self._timestamp()
        metadata['created'] = now
        metadata['modified'] = now
        metadata['feed_id'] = self.feed_metadata['identifier']
        self.metadata = metadata
        return [metadata]

    def get_parsed_entries(self, entries=None):
        """Return ``parse_entry(entry)`` for each item of *entries*.

        :param entries: iterable of feed entries; ``None`` yields ``[]``.
        """
        if entries is None:
            return []
        return [self.parse_entry(entry) for entry in entries]

    def parse_entry(self, entry=None):
        """Convert one feed entry into a flat dict of CSV column values.

        :param entry: object exposing ``id``, ``title``, ``published_parsed``,
            ``link``, ``dc_identifier`` and ``summary_detail`` attributes.
        """
        summary_detail = entry.summary_detail
        published = arrow.get(entry.published_parsed).format(self._TIMESTAMP_FORMAT)
        # Take the time once so ``created`` and ``modified`` always agree.
        now = self._timestamp()

        parsed_entry = dict()
        parsed_entry['id'] = entry.id
        parsed_entry['title'] = entry.title
        parsed_entry['published'] = published
        parsed_entry['source'] = entry.link
        parsed_entry['dc_identifier'] = entry.dc_identifier
        parsed_entry['summary'] = self.parse_summary_detail(summary_detail=summary_detail)
        parsed_entry['links'] = self.get_parsed_summary_links(value=summary_detail.value)
        parsed_entry['created'] = now
        parsed_entry['modified'] = now
        parsed_entry['feed_id'] = self.feed_metadata['identifier']
        return parsed_entry

    def parse_summary_detail(self, summary_detail=None):
        """Return ``{'value': <parsed text>}`` for an entry's summary detail."""
        return {'value': self.parse_summary_value(value=summary_detail.value)}

    def parse_summary_value(self, value=None):
        """Join the text of every non-empty ``<p>`` in *value* with newlines.

        :returns: the joined text, or ``None`` if parsing raised (the error
            is logged and parsing of the feed continues).
        """
        try:
            soup = BeautifulSoup(value, 'lxml')
            paragraphs = [
                remove_escape_characters(value=p.text).strip()
                for p in soup.find_all('p')
                if p.text
            ]
            return '\n'.join(paragraphs)
        except Exception as e:
            # Best-effort: one bad summary must not abort the whole feed.
            # The message is pre-formatted because passing ``e`` as an extra
            # positional argument without a placeholder breaks stdlib logging.
            logger.error('Not valid xml format: {error}'.format(error=e))
            return None

    def get_parsed_summary_links(self, value=None):
        """Return a ``{'title', 'link'}`` dict for every ``<a>`` in *value*.

        :returns: list of link dicts, or ``None`` if parsing raised (the
            error is logged and parsing of the feed continues).
        """
        try:
            soup = BeautifulSoup(value, 'lxml')
            return [
                {'title': a.text.strip(), 'link': self.parse_link(link=a['href'])}
                for a in soup.find_all('a')
            ]
        except Exception as e:
            # Best-effort, see parse_summary_value for the message format.
            logger.error('Not valid xml format: {error}'.format(error=e))
            return None

    def parse_link(self, link=None):
        """Rebuild *link*, filling in a default scheme and host if missing.

        A missing scheme becomes ``http`` and a missing host becomes
        ``travel.state.gov``; the remaining URL parts are passed through.
        """
        parts = parse_url(link).__dict__
        return build_url_from_parts([
            parts['scheme'] or self._DEFAULT_SCHEME,
            parts['netloc'] or self._DEFAULT_NETLOC,
            parts['path'],
            parts['params'],
            parts['query'],
            parts['fragment'],
        ])

    def read_feed_metadata(self, metadata_path=None):
        """Load the JSON metadata file and store a flattened dict of it.

        Replaces ``self.feed_metadata`` (until now the path) with the dict.

        :param metadata_path: path of the JSON file to read.
        :raises KeyError: if an expected key is absent from the JSON.
        """
        with open(metadata_path) as metadata_json:
            metadata = json.load(metadata_json)

        feed_metadata = dict()
        feed_metadata['title'] = metadata['title']
        feed_metadata['source'] = metadata['describedBy']
        feed_metadata['publisher_name'] = metadata['publisher']['name']
        feed_metadata['bureau_code'] = metadata['bureauCode']
        feed_metadata['start'], feed_metadata['end'] = self.parse_temporal(metadata['temporal'])
        feed_metadata['identifier'] = metadata['identifier']
        feed_metadata['modified'] = metadata['modified']
        feed_metadata['description'] = metadata['description']
        self.feed_metadata = feed_metadata

    def parse_temporal(self, temporal=None):
        """Split a ``start/end`` string into two formatted timestamps.

        :returns: ``(start, end)``; ``end`` is the string ``'Null'`` when the
            end part is missing or cannot be parsed.
        """
        parts = temporal.split('/')
        start = arrow.get(parts[0]).format(self._TIMESTAMP_FORMAT)
        try:
            end = arrow.get(parts[1]).format(self._TIMESTAMP_FORMAT)
        except Exception:
            # Open-ended or unparsable end; unlike a bare ``except`` this
            # lets KeyboardInterrupt / SystemExit propagate.
            end = 'Null'
        return start, end

    def save_feed(self, file=None, feed=None, field_names=None):
        """Write *feed* rows to *file* via ``save_rows`` using *field_names*."""
        save_rows(file=file, rows=feed, field_names=field_names)