-
Notifications
You must be signed in to change notification settings - Fork 0
/
news.py
238 lines (203 loc) · 10.4 KB
/
news.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
#/usr/bin/python3
import logging, pathlib, lxml.etree, lxml.html
from collections import Counter
from datetime import datetime
from zipfile import ZipFile
from shutil import copy2
class SafDriver:
def __init__(self, input_files, path=None, log_level=logging.DEBUG):
# Set base path if it doesn't exist.
if path is not None and pathlib.Path(path).expanduser().exists():
self.base_path = pathlib.Path(path).expanduser()
if not self.base_path.is_absolute():
self.base_path = self.base_path.resolve()
else:
self.base_path = pathlib.Path(__file__).parent
# Set up logging
self.log_level = log_level
log_path = self.base_path / 'vt_news.log'
logging.basicConfig(filename=str(log_path), level=log_level)
# Create package
self.create_saf_package(input_files)
def create_saf_package(self, input_files):
author_counter, spatial_counter = Counter(), Counter()
for rss_file in input_files:
logging.debug('Parsing {}'.format(rss_file))
feed = lxml.etree.parse(str(self.base_path / rss_file))
for item in self.find_items(feed):
news_item = NewsItem(item, self.base_path)
author_counter[news_item.author] += 1
spatial_counter[news_item.spatial_coverage] += 1
news_item.create_saf_item()
logging.info('')
if self.log_level <= logging.DEBUG:
logging.debug('Instrumentation: Authors: {}'.format(author_counter))
logging.debug('Instrumentation: Spatial: {}'.format(spatial_counter))
def find_items(self, feed):
items = feed.findall('.//item')
# Return all items, unless we're below debug mode. Then, return first, middle, and last.
if self.log_level < logging.DEBUG:
return [items[0], items[len(items)//2], items[-1]]
return items
class NewsItem:
__slots__ = ['author', 'spatial_coverage', 'date_issued', 'description', 'title', 'feed_url',
'image_urls', 'keywords', 'error_urls', 'base_path', 'directory']
def __getattr__(self, name):
return None
def __init__(self, item=None, path=None):
self.keywords = set()
self.image_urls = set()
self.error_urls = set()
# Set base path if it doesn't exist.
if path is not None and pathlib.Path(path).expanduser().exists():
self.base_path = pathlib.Path(path).expanduser()
if not self.base_path.is_absolute():
self.base_path = self.base_path.resolve()
else:
self.base_path = pathlib.Path(__file__).parent
# Parse the item, if given.
if item is not None:
self.parse(item)
def parse(self, item):
self.parse_atom(item)
if self.feed_url is not None:
self.parse_html(self.feed_url)
def parse_atom(self, item):
# Find and add metadata.
self.title = item.find('./title').text
logging.debug('Title: {}'.format(self.title))
self.feed_url = item.find('./link').text
logging.debug('Feed Url: {}'.format(self.feed_url))
self.description = item.find('./description').text
logging.debug('Description: {}'.format(self.description))
self.date_issued = item.find('./pubDate').text
self.date_issued = datetime.strptime(self.date_issued, '%a, %d %b %Y %H:%M:%S %z')
logging.debug('Date Issued: {}'.format(self.date_issued.strftime('%Y-%m-%d')))
for keyword in item.iterfind('./category'):
self.keywords.add(item.find('./category').text)
if len(self.keywords) > 0:
logging.debug('Keywords: {}'.format(self.keywords))
for enclosure in item.iterfind('./enclosure'):
self.image_urls.add(enclosure.get('url'))
if len(self.image_urls) > 0:
logging.debug('Images from feed: {}'.format(self.image_urls))
def parse_html(self, url):
page = url.split('articles/')[-1]
if self.base_path.joinpath(page).exists():
html = lxml.html.parse(page)
logging.info('HTML page `{}` exists, and parses.'.format(url))
# Dateline is in the first p, unless that is an image, then it is in the third.
dateline = html.find('.//{*}p')
if dateline.text is None:
dateline = html.findall('.//{*}p')[2]
if 'BLACKSBURG, Va.' in dateline.text:
self.spatial_coverage = 'Blacksburg, Va.'
else:
date_issued = self.date_issued.strftime(', %b')
self.spatial_coverage = dateline.text.split(date_issued)[0].title()
if len(self.spatial_coverage) > 25 or '\n' in self.spatial_coverage or ' ' == self.spatial_coverage:
# Sanity check: These are symptoms of errors. Change them to Blacksburg.
self.spatial_coverage = 'Blacksburg, Va.'
logging.debug('Spatial Coverage: {}'.format(self.spatial_coverage))
# Author is in the first li of the last ul, or the one before that, if it exists.
html_lists = html.findall('.//{*}ul')
author = html_lists[-1].find('./{*}li').text
if author is None:
try:
author = html_lists[-2].find('./{*}li').text
except IndexError as e:
logging.error('No author found.')
if author is not None:
author = ' '.join(author.split())
self.author = author
logging.debug('Author: {}'.format(self.author))
# Any img tag is a related file.
for image in html.iterfind('.//{*}img'):
self.image_urls.add(image.get('src'))
if len(self.image_urls) > 0:
logging.debug('All image urls: {}'.format(self.image_urls))
else:
logging.error('Url `{}` does not map to an HTML file in the archive.'.format(url))
self.error_urls.add(url)
def create_saf_item(self):
if self.feed_url not in self.error_urls:
# Create needed directories
self.directory = self.base_path / 'vt-news-saf' / pathlib.Path(self.feed_url).stem
self.directory.mkdir(parents=True, exist_ok=True)
logging.debug('Created a directory: {}'.format(self.directory))
# Add files to directory
html_path = self.base_path / self.feed_url.split('articles/')[-1]
copy2(str(html_path), str(self.directory))
self.copy_images()
logging.debug('All files copied')
if len(self.error_urls) > 0:
logging.info('Urls that created errors: {}'.format(self.error_urls))
# Write the contents file and the DC metadata file.
self.write_contents()
self.write_metadata()
logging.info('Created SAF item.')
# Add all files to the zipped package.
self.add_to_package()
logging.info('Added item to zipped package.')
else:
logging.error('No valid HTML page was found for this item! Cannot create SAF package.')
def copy_images(self):
# Deduplicate and standardize self.image_urls
image_path_partials = set()
for image_url in self.image_urls:
image_path_partial = None
if 'articles/' in image_url:
image_path_partial = image_url.split('articles/')[-1]
else:
if 'http' not in image_url:
date_issued = self.date_issued.strftime('%Y/%m')
image_path_partial = '{}/{}'.format(date_issued, image_url)
else:
logging.error('Image url `{}` not in news archive. Skipped.'.format(image_url))
if image_path_partial is not None:
image_path_partials.add(image_path_partial)
if len(image_path_partials) > 0:
logging.debug('Canonical image urls: {}'.format(image_path_partials))
# Add images to SAF location.
for image_path_partial in image_path_partials:
image_path = self.base_path / image_path_partial
if image_path.exists():
copy2(str(image_path), str(self.directory))
else:
logging.error('No file found at `{}`. Expected an image.'.format(image_path))
self.error_urls.add(image_path)
def write_contents(self):
# Add all files that are not the contents file to the contents file, each on a line.
with self.directory.joinpath('contents').open(mode='wt', encoding='utf-8') as contents:
for item in self.directory.iterdir():
if item.is_file() and item.name != 'contents':
contents.write('{}\n'.format(item.name))
def write_metadata(self):
# Create the XML structure
dc_root = lxml.etree.Element('dublin_core')
dc_root.append(DCValue('title', value=self.title))
dc_root.append(DCValue('description', qualifier='abstract', value=self.description))
dc_root.append(DCValue('contributor', qualifier='author', value=self.author))
dc_root.append(DCValue('publisher', value='Virginia Tech. University Relations'))
date_issued = self.date_issued.strftime('%Y-%m-%d')
dc_root.append(DCValue('date', qualifier='issued', value=date_issued))
dc_root.append(DCValue('coverage', qualifier='spatial', value=self.spatial_coverage))
for keyword in self.keywords:
dc_root.append(DCValue('subject', value=keyword))
# Write it to dublin_core.xml
with self.directory.joinpath('dublin_core.xml').open(mode='wt', encoding='utf-8') as dc_file:
dc_file.write(lxml.etree.tostring(dc_root, encoding=str, pretty_print=True))
def add_to_package(self):
zip_path = self.base_path / 'vt-news-saf.zip'
with ZipFile(str(zip_path), 'a') as zip_file:
for item in self.directory.iterdir():
zip_file.write(str(item), str(item.relative_to(self.base_path)))
def DCValue(element, qualifier='none', value=None, language='en_US'):
# Return an dcvalue element, with it's text and attributes set.
element = lxml.etree.Element('dcvalue', element=element, qualifier=qualifier)
element.text = value
if language is not None:
element.set('language', language)
return element
if __name__ == '__main__':
SafDriver(['2006.xml', '2007.xml', '2008.xml', '2009.xml', '2010.xml'])