/
meta2atom.py
205 lines (159 loc) · 5.67 KB
/
meta2atom.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
meta2atom is a Python script that generates Atom feeds from the meta data
in HTML pages."""
import sys
import lxml.html
import feedparser
import datetime
import lxml.etree as ET
# http://henry.precheur.org/2008/9/3/RFC 3339 formatting in Python.html
from rfc3339 import rfc3339
import logging
class Page(object):
    """An HTML page whose ``<head>`` meta data is exposed as attributes.

    The document is parsed once in the constructor. The properties look
    their value up in the collected ``<meta>`` elements on every access and
    return None when the page does not provide the information.
    """

    def __init__(self, filename_or_url, url):
        """Extract html meta data from a filename, url or file like object.

        filename_or_url: source where to read content from
        url: url for this page, this should be the url for the element
        """
        self.tree = lxml.html.parse(filename_or_url)
        # Paths are resolved relative to the root <html> element.
        self.meta = self.tree.findall("./head/meta")
        self.url = url

    def _get_meta(self, name):
        """Return the content of the first usable ``<meta name=...>`` element.

        Names are compared case-insensitively. Elements that have no
        ``content`` attribute, or an empty one, are skipped so that a later
        element with the same name can still provide the value.
        Returns None if nothing usable is found.
        """
        wanted = name.lower()
        for elem in self.meta:
            if elem.get("name", "").lower() != wanted:
                continue
            content = elem.get("content")
            if content:
                return content
        return None

    def _get_meta_title(self):
        """Return the stripped text of ``<head><title>``, or None.

        An empty or whitespace-only title counts as missing.
        """
        title = self.tree.find("./head/title")
        if title is None:
            return None
        text = title.text_content().strip()
        return text or None

    def _get(self, *names):
        """for name in names try to get meta information.

        Returns None if no meta information is found"""
        for name in names:
            meta = self._get_meta(name)
            if meta:
                return meta
        return None

    @property
    def title(self):
        """Text of the ``<title>`` element, else the ``DC.title`` meta value."""
        title = self._get_meta_title()
        if not title:
            title = self._get_meta("DC.title")
        return title

    @property
    def author(self):
        """``DC.creator`` meta value, else ``author``; None if both are absent."""
        return self._get("DC.creator", "author")

    @property
    def description(self):
        """``DC.description`` meta value, else ``description``; None if absent."""
        return self._get("DC.description", "description")

    @property
    def keywords(self):
        """List of comma separated keywords, or None if there are none.

        Surrounding whitespace is stripped and empty items (as produced by
        ``"a,,b,"``) are dropped.
        """
        #TODO handle DC.Subject.Keywords
        raw = self._get("DC.subject", "keywords")
        if not raw:
            return None
        keywords = [item.strip() for item in raw.split(",")]
        keywords = [item for item in keywords if item]
        return keywords or None

    @property
    def date(self):
        """``DC.date`` / ``date`` meta value as a naive datetime, or None.

        None is returned both when the page has no date and when the value
        cannot be parsed.
        """
        #TODO use http://labix.org/python-dateutil instead of feedparser
        raw = self._get("DC.date", "date")
        if not raw:
            return None
        #TODO ??? timezone
        timetuple = feedparser._parse_date(raw)
        if timetuple is None:
            # No date handler understood the value.
            return None
        # Only the first six fields (year .. second) belong to the datetime
        # constructor; the seventh field of a time tuple is the weekday and
        # must not be passed on as microseconds.
        return datetime.datetime(*timetuple[:6])
def page2element(page, baseurl):
    """Build an Atom ``entry`` element for *page*.

    page: object providing ``author``, ``title``, ``url``, ``description``,
        ``date`` and ``keywords`` attributes
    baseurl: used as ``scheme`` attribute of every ``category`` element

    Returns the new ``entry`` element. If the page has no description the
    title is used as summary, and if it has no date the current time is
    used; both cases are logged as warnings.
    """
    # Read every attribute once: they may be properties that are recomputed
    # on each access.
    author = page.author
    title = page.title
    url = page.url
    description = page.description
    date = page.date
    keywords = page.keywords

    root = ET.Element("entry")

    if author:
        author_elem = ET.SubElement(root, "author")
        ET.SubElement(author_elem, "name").text = author

    ET.SubElement(root, "title").text = title
    ET.SubElement(root, "link",
                  attrib={"rel": "alternate", "type": "text/html", "href": url})

    #TODO generate id
    ET.SubElement(root, "id").text = url

    summary_elem = ET.SubElement(root, "summary")
    if description:
        summary_elem.text = description
    else:
        logging.warning(
            "Missing summary for element %s using title instead", url)
        summary_elem.text = title

    updated_elem = ET.SubElement(root, "updated")
    if date:
        #TODO check if right format
        updated_elem.text = rfc3339(date)
    else:
        logging.warning(
            "Missing date for element %s using now() instead", url)
        updated_elem.text = rfc3339(datetime.datetime.now())

    for keyword in keywords or ():
        ET.SubElement(root, "category",
                      attrib={"scheme": baseurl, "term": keyword})
    return root
class AtomGenerator(object):
    """Generate an Atom feed document from a collection of pages."""

    def __init__(self, baseurl, feedurl, title, name, pages, email=None,
                 summary=None):
        """Store the feed level meta data.

        baseurl: href of the plain feed ``link`` and scheme of the categories
        feedurl: url of the feed itself, also used as feed ``id``
        title: title of the feed
        name: name of the feed author
        pages: iterable of page objects, one feed entry is created for each
        email: optional email address of the feed author
        summary: optional short description of the feed
        """
        self.baseurl = baseurl
        self.feedurl = feedurl
        self.title = title
        self.name = name
        self.pages = pages
        self.email = email
        self.summary = summary

    def gen_atom(self):
        """Build and return the ``feed`` element including all entries.

        The ``updated`` element of the feed is always set to the current
        time.
        """
        feed = ET.Element("feed", nsmap={None: 'http://www.w3.org/2005/Atom'})
        #TODO set namespace
        # NOTE(review): the tags themselves are created without a namespace,
        # only the default namespace is declared - confirm the serialized
        # output is what feed readers expect.
        ET.SubElement(feed, "title").text = self.title

        #TODO generate id
        ET.SubElement(feed, "id").text = self.feedurl
        ET.SubElement(feed, "updated").text = rfc3339(datetime.datetime.now())

        ET.SubElement(feed, "link", attrib={"href": self.feedurl, "rel": "self"})
        ET.SubElement(feed, "link", attrib={"href": self.baseurl})

        author = ET.SubElement(feed, "author")
        ET.SubElement(author, "name").text = self.name
        if self.email:
            ET.SubElement(author, "email").text = self.email

        if self.summary:
            # The Atom format has no feed level "summary" element; the
            # description of a feed is carried by "subtitle".
            ET.SubElement(feed, "subtitle").text = self.summary

        for page in self.pages:
            feed.append(page2element(page, self.baseurl))
        return feed

    def to_string(self):
        """Return the generated feed serialized as pretty printed XML."""
        return ET.tostring(self.gen_atom(), pretty_print=True)
def test():
    """Generate a feed from the html files in ``test/`` and print it.

    The pages are processed in sorted file name order so that the output
    is reproducible between runs.
    """
    import glob

    baseurl = "http://peter-hoffmann.com"
    feedurl = "http://peter-hoffmann.com/test/atom.xml"
    title = "Peter Hoffmanns Feed"
    name = "Peter Hoffmann"
    email = "tosh54@gmail.com"

    # glob.glob() returns the files in arbitrary order.
    pages = [Page(path, "http://peter-hoffmann.com/" + path)
             for path in sorted(glob.glob("test/*.html"))]

    generator = AtomGenerator(baseurl, feedurl, title, name, pages, email)
    #feed = generator.gen_atom()
    output = generator.to_string()
    if not isinstance(output, str):
        # The serializer may hand back bytes; decode them so that print()
        # shows the document itself and not a bytes literal.
        output = output.decode("utf-8")
    # Function call syntax works as print statement and as print function.
    print(output)
def main(argv=None):
    """Script entry point, runs test().

    argv: argument list, sys.argv is used if omitted. The arguments are
        not evaluated any further.
    """
    argv = sys.argv if argv is None else argv
    test()
# Run main() only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()