-
Notifications
You must be signed in to change notification settings - Fork 0
/
mining.py
137 lines (113 loc) · 3.71 KB
/
mining.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import feedparser
import os
import datetime
from BeautifulSoup import BeautifulSoup as Soup
from xml.etree.ElementTree import Element, SubElement, Comment, tostring
from lxml import etree
from StringIO import StringIO
import HTMLParser
import re
from story import Story
from optparse import OptionParser
def strip_tags(html):
    """Return *html* as plain text: entities decoded, then tags removed.

    Args:
        html: Markup string; may be ``None`` or empty.

    Returns:
        The input with character references decoded and every ``<...>``
        span removed. An empty string is returned for falsy input and also
        when the conversion fails, so callers always receive a string.
    """
    if not html:
        return ''
    try:
        try:
            # Python 2: use the module imported at the top of this file.
            unescape = HTMLParser.HTMLParser().unescape
        except (NameError, AttributeError):
            # Python 3 has no ``HTMLParser`` module, and the ``unescape``
            # method was removed in 3.9; ``html.unescape`` replaces it.
            # The import resolves the module by name, so the ``html``
            # parameter does not interfere with it.
            from html import unescape
        return re.sub('<[^<]+?>', '', unescape(html))
    except Exception as e:
        # Best effort: report the offending input and keep going rather
        # than abort the whole run over one bad summary.
        print('-' * 80)
        print('exception')
        print(e)
        print(html)
        print('-' * 80)
        return ''
class NSTree(object):
    """Wrap an XML node so XPath queries use the document's default namespace.

    The default namespace is registered under the prefix ``n``. Attributes
    that the wrapper does not define itself are looked up on the wrapped
    node.
    """

    @classmethod
    def load(cls, path):
        """Parse the XML file at *path* and wrap its root element.

        The parser runs with ``recover=True``.

        Raises:
            KeyError: if the root element declares no default namespace.
        """
        # Read inside a context manager so the handle is closed promptly.
        with open(path) as source:
            content = source.read()
        parser = etree.XMLParser(recover=True)
        root = etree.parse(StringIO(content), parser).getroot()
        return cls(root, {'n': root.nsmap[None]})

    def __init__(self, node, ns):
        """Store the wrapped *node* and the prefix-to-URI mapping *ns*."""
        self.node = node
        self.ns = ns

    def __getattr__(self, name):
        """Delegate attribute lookups that fail on the wrapper to the node."""
        # Only reached when normal lookup fails. If ``node`` itself is
        # missing (instance created without __init__), delegating would
        # recurse forever, so fail cleanly instead.
        if name == 'node':
            raise AttributeError(name)
        return getattr(self.node, name)

    # TODO: is there a better way?
    def xpath(self, path):
        """Run ``.//n:<path>`` below this node and wrap what comes back.

        Returns:
            A list of wrappers when the query yields a list, otherwise a
            single wrapper around the result.
        """
        query = './/n:%s' % path
        result = self.node.xpath(query, namespaces=self.ns)
        wrap = type(self)
        if isinstance(result, list):
            return [wrap(item, self.ns) for item in result]
        return wrap(result, self.ns)
class GRXMLFeed(object):
    """An XML feed file whose ``entry`` elements are turned into stories."""

    def __init__(self, path):
        """Load the feed at *path*, printing the path and the parse time."""
        print(path)
        started = datetime.datetime.now()
        self.root = NSTree.load(path)
        finished = datetime.datetime.now()
        print('-' * 80)
        print('(%s-%s) : %s' % (started, finished, finished - started))

    def make_tag(self, tag):
        """Return *tag* qualified as ``{namespace}tag``."""
        # The namespace URI is kept on the loaded tree under the ``n``
        # prefix; there is no module-level ``ns`` to read it from.
        return '{%s}%s' % (self.root.ns['n'], tag)

    def extract_story(self, e):
        """Build a Story from one feed entry.

        Args:
            e: Wrapped ``entry`` node supporting ``xpath``.

        Returns:
            A Story with ``title`` and ``summary`` set, or ``None`` when
            the entry has no link with a non-empty ``href``.
        """
        links = e.xpath('link')
        url = links[0].get('href', '') if links else ''
        if not url:
            return None

        story = Story(url)

        titles = e.xpath('title')
        # A missing title becomes None, not the empty result list.
        story.title = titles[0].text if titles else None

        # Prefer the summary; fall back to the full content.
        bodies = e.xpath('summary') or e.xpath('content')
        story.summary = strip_tags(bodies[0].text if bodies else '')

        # TODO: category, published, updated, author and source are present
        # on entries but are not stored on the story yet.
        return story

    def extract_stories(self):
        """Return a Story for every entry that has a link URL."""
        print('extract_stories')
        candidates = (self.extract_story(entry)
                      for entry in self.root.xpath('entry'))
        # Entries without a URL yield None; drop them so callers can use
        # every item as a Story.
        return [story for story in candidates if story is not None]
def process_file(filename):
    """Extract the stories from the feed file *filename* and save each one."""
    for story in GRXMLFeed(filename).extract_stories():
        # extract_story returns None for an entry without a link URL;
        # there is nothing to save for those.
        if story is None:
            continue
        story.save()
def process(path):
    """Process the feed file at *path*, or every file below that directory.

    Prints a message and returns when *path* does not exist.
    """
    if os.path.isfile(path):
        return process_file(path)

    if not os.path.exists(path):
        print('%s not exists' % path)
        return

    for folder, _subfolders, names in os.walk(path):
        for name in names:
            process_file(os.path.join(folder, name))
if __name__ == '__main__':
    # Command-line entry point.
    #   -p/--parse  parse stories from the file or directory given by -i
    #   -l/--list   go through the stored stories and flag those with no URL
    parser = OptionParser()
    parser.add_option('-i', "--input", dest="path", help="input file, or path")
    parser.add_option("-l", "--list", action="store_true", dest="list_stories", default=False, help="list stories")
    parser.add_option("-p", "--parse", action="store_true", dest="parse_stories", default=False, help="parse stories from input")
    (options, args) = parser.parse_args()

    if options.parse_stories:
        # Without -i the path is None; report a usage error instead of
        # letting the path checks fail with a traceback.
        if not options.path:
            parser.error('--parse requires an input path (-i/--input)')
        process(options.path)
    elif options.list_stories:
        for story in Story.get_all():
            if not story.url:
                print('-' * 80)
                print('no link')