forked from iamlemec/wikidiff
/
wikidiff.py
211 lines (190 loc) · 5.65 KB
/
wikidiff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
# extract birth of article from wiki data
# fast parse: Liza Daly, http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
import re
import sys
import argparse
import difflib
import html
from lxml.etree import iterparse, XMLPullParser
import mwparserfromhell as mw
# parse input arguments
# (description previously said 'USPTO patent parser' -- a copy-paste error;
# this script diffs revisions in a MediaWiki XML dump)
parser = argparse.ArgumentParser(description='Wikipedia dump diff parser.')
parser.add_argument('source', type=str, help='path to xml file to parse')
parser.add_argument('output', type=str, help='path to csv output')
parser.add_argument('--limit', type=int, default=None, help='number of articles to parse')
args = parser.parse_args()
# MediaWiki export XML namespace, prefixed onto every element tag
ns = '{http://www.mediawiki.org/xml/export-0.10/}'
# fully-qualified tag names for the elements we care about
page_tag, revn_tag, ts_tag, id_tag, title_tag, ns_tag, text_tag = (
    ns + local for local in
    ('page', 'revision', 'timestamp', 'id', 'title', 'ns', 'text')
)
# get descendent text
def get_text(parent, tag, default=''):
    """Return the text of the first *tag* child of *parent*, or *default*.

    *default* is also returned when the child exists but has empty/None text.
    """
    child = parent.find(tag)
    if child is None:
        return default
    return child.text or default
# preserve memory
def clear(elem):
    """Clear *elem* and delete all of its preceding siblings from the parent.

    Standard lxml iterparse memory trick: fully-processed elements are
    released so the tree does not grow with the input.
    """
    elem.clear()
    parent = elem.getparent()
    while elem.getprevious() is not None:
        del parent[0]
# revert html codes
def html_unescape(text):
    """Decode HTML entities and normalize non-breaking spaces to plain spaces."""
    return html.unescape(text).replace('\xa0', ' ')
# recursively extract text from nodes
def parse_node(node):
    """Flatten a mwparserfromhell node (or whole Wikicode tree) to plain text.

    Dispatches on the node's exact type; an unrecognized type raises so that
    new node kinds surface loudly instead of being silently dropped.
    """
    t = type(node)
    if t is mw.wikicode.Wikicode:
        # container: recurse into all children and join with spaces
        return ' '.join([parse_node(n) for n in node.nodes])
    elif t is mw.nodes.Template:
        name = node.name.strip()
        if name.startswith('cite'):
            # keep citation title/author fields; node.get(f) yields a Parameter node
            return ' '.join([parse_node(node.get(f) if node.has(f) else '') for f in ['title', 'last1', 'last2']])
        else:
            # all other templates (infoboxes, navboxes, ...) contribute no prose
            return ''
    elif t is mw.nodes.Wikilink:
        if node.title.startswith('Image:') or node.title.startswith('Category:') or node.title.startswith('File:'):
            # media/category links are markup, not article text
            return ''
        else:
            return parse_node(node.title)
    elif t is mw.nodes.Heading:
        return parse_node(node.title)
    elif t is mw.nodes.ExternalLink:
        if node.title:
            # keep only the display text; bare URLs have no title
            return parse_node(node.title)
        else:
            return ''
    elif t is mw.nodes.extras.Parameter:
        return parse_node(node.value)
    elif t is mw.nodes.Tag:
        if node.tag == 'gallery' or node.contents is None:
            # galleries are image lists; self-closing tags carry no contents
            return ''
        else:
            return parse_node(node.contents)
    elif t is mw.nodes.Comment:
        # NOTE(review): this keeps HTML comment text in the output (contents
        # falls through to the str branch below) -- confirm that is intended
        return parse_node(node.contents)
    elif t is mw.nodes.HTMLEntity:
        # e.g. '&amp;' -> '&'
        return node.normalize()
    elif t is mw.nodes.Text:
        return node.value
    elif t is mw.nodes.Argument:
        # template arguments like {{{1}}} cannot be resolved here
        return ''
    elif t is str:
        return node
    else:
        raise(Exception('Unrecognized Type %s: %s' % (t, node)))
def parse_wiki(wiki):
    """Unescape HTML entities in raw wikitext and flatten the markup to text."""
    return parse_node(mw.parse(html_unescape(wiki)))
# regularize to token list
def reduce_wiki(text):
    """Keep only ASCII letters separated by single spaces, lowercased and trimmed."""
    letters_only = re.sub(r'[^a-zA-Z ]', ' ', text)   # non-letters become spaces
    squeezed = re.sub(r' {2,}', ' ', letters_only)    # collapse space runs
    return squeezed.lower().strip()
def tokenize_wiki(text):
    """Convert raw wikitext into a list of lowercase word tokens.

    Best-effort: if parsing fails, the error and the offending text are
    printed and an empty token list is returned instead of aborting.
    """
    try:
        wiki = parse_wiki(text)
    except Exception as e:
        # previously the caught exception was bound but never reported;
        # include it so failures are diagnosable without re-running
        print()
        print('PARSE ERROR: %s' % e)
        print(text)
        print()
        wiki = ''
    red = reduce_wiki(wiki)
    return red.split()
# set up files
# NOTE(review): fin is never closed; harmless for a one-shot script, but a
# 'with' block would be tidier
fin = open(args.source, encoding='utf-8')
fout = open(args.output, 'w', encoding='utf-8')
# create differ
sm = difflib.SequenceMatcher()
# this parser is bad and wrong
# Hand-rolled line-oriented scan of the XML dump: it relies on the dump's
# one-tag-per-line layout and tag indentation depth rather than a real parser.
in_art = None  # None = unknown yet, True = namespace-0 article, False = skip page
n_art = 0  # count of fully-processed articles
text = None  # accumulator for a multi-line <text> body (None when not inside one)
for (i, line) in enumerate(fin):
    if i % 1000000 == 0:
        print(i)  # progress heartbeat, once per million lines
    # match an opening tag and capture its indentation depth
    ret = re.match('( *)<([^>]*?)>', line)
    if ret:
        (ind, tag) = ret.groups()
        ind = len(ind)
        body = line[ret.end():]
        # one-line element? capture body text before the closing tag
        ret = re.match('([^<]*?)</[^>]*?>', body)
        if ret:
            (body,) = ret.groups()
            oner = True
        else:
            oner = False
    else:
        tag = None
        # NOTE(review): a non-tag line outside a <text> body leaves tag=None,
        # which reaches tag.startswith('text') below and would raise
        # AttributeError -- confirm the dump never produces such lines
    if text is not None:
        # currently accumulating a multi-line <text> body
        if line.endswith('</text>\n'):
            text += line[:-8]  # drop the 8-char closing '</text>\n'
            try:
                toks = tokenize_wiki(text)
            except:
                # best-effort: log identifiers, emit no tokens for this revision
                print('PARSE ERROR: %s, %s, %s' % (aid, rid, title))
                toks = []
            text = None
        else:
            text += line
        continue
    if tag == '/page':
        if in_art:
            n_art += 1
            if args.limit and n_art >= args.limit:
                break
        in_art = None
    if in_art == False:
        # inside a non-article page: ignore everything until </page>
        continue
    if tag == 'page':
        in_art = None
        last_toks = []  # diff baseline resets at each new page
    elif tag == 'ns':
        # namespace 0 is the main/article space
        if body == '0':
            print(title)
            in_art = True
        else:
            in_art = False
    elif tag == 'id':
        # disambiguate page id vs revision id by indentation depth
        if ind == 4:
            aid = body
        elif ind == 6:
            rid = body
    elif tag == 'title':
        title = body
    elif tag == 'timestamp':
        date = body
    elif tag.startswith('text'):
        # <text> carries attributes, hence startswith rather than equality
        if oner:
            try:
                toks = tokenize_wiki(body)
            except:
                print('PARSE ERROR: %s, %s, %s' % (aid, rid, title))
                toks = []
            text = None
        else:
            text = body  # begin accumulating a multi-line body
    elif tag == '/revision':
        # NOTE(review): 'revn' is never read anywhere -- this looks like it was
        # meant to be 'toks = []' to guard a revision without a <text> element;
        # as written the branch has no effect and stale (or never-assigned)
        # 'toks' from a previous revision would be reused
        if toks is None:
            revn = []
        # diff against the previous revision, keeping only inserted words
        sm.set_seqs(last_toks, toks)
        plus = []
        for (op, s1, e1, s2, e2) in sm.get_opcodes():
            if op == 'insert' or op == 'replace':
                plus += toks[s2:e2]
        if len(plus) > 0:
            fout.write('%s,%s,%s,%s,%s,"%s"\n' % (aid, rid, date, len(toks), len(plus), ' '.join(plus)))
        last_toks = toks
# clean up
fout.close()
print(n_art)