forked from funginstitute/patentprocessor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
consolidate.py
executable file
·80 lines (76 loc) · 3.6 KB
/
consolidate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python
"""
Takes the existing database (as indicated by the alchemy configuration file) and appends
tab-separated rows, one per raw inventor, to `disambiguator.csv` with the columns needed
for the disambiguation, in this order:
inventor uuid, inventor first name, inventor middle name, inventor last name, patent doc number,
main class, sub class, city, state, country, assignee, raw assignee
"""
import codecs
from lib import alchemy
from lib.assignee_disambiguation import get_assignee_id
from lib.handlers.xml_util import normalize_utf8
from sqlalchemy.orm import joinedload, subqueryload
from sqlalchemy import extract
from datetime import datetime
import sys
def ROW(x):
    """
    Render one output line from a dictionary. Use `ROW(dictionary)`.

    `x` must provide every key named in the template below; extra keys are
    ignored. Fields are tab-separated and the line ends with a newline.
    """
    return u'{uuid}\t{name_first}\t{name_middle}\t{name_last}\t{number}\t{mainclass}\t{subclass}\t{city}\t{state}\t{country}\t{assignee}\t{rawassignee}\n'.format(**x)
def main(year, doctype='grant'):
    """
    Append one row per raw inventor to `disambiguator.csv`.

    year: when truthy, only records whose `date` falls in that year are
        exported; otherwise every record is exported.
    doctype: passed to `alchemy.fetch_session` as `dbtype`. The value
        'application' selects the `App_Application` schema; any other value
        uses `Patent`.

    Rows are formatted with `ROW`, passed through `normalize_utf8` and
    appended to `disambiguator.csv` in the current working directory. If
    anything raises while a record is being processed, the exception is
    printed and the rest of that record is skipped.
    """
    session = alchemy.fetch_session(dbtype=doctype)
    schema = alchemy.schema.Patent
    if doctype == 'application':
        schema = alchemy.schema.App_Application

    query = session.query(schema)
    if year:
        # Filter on the `year` argument itself, not on a module-level global,
        # so the function also works when imported and called directly.
        query = query.filter(extract('year', schema.date) == year)
    # get patents as iterator to save memory
    # use subqueryload to get better performance by using less queries on the backend:
    # --> http://docs.sqlalchemy.org/en/latest/orm/tutorial.html#eager-loading
    patents = query.options(
        subqueryload('rawinventors'),
        subqueryload('rawassignees'),
        subqueryload('classes'),
    ).yield_per(1)

    # Open the output once for the whole run instead of once per written row.
    with codecs.open('disambiguator.csv', 'a', encoding='utf-8') as outfile:
        for i, patent in enumerate(patents, 1):
            if i % 100000 == 0:
                # progress marker: running count and wall-clock time
                print('%s %s' % (i, datetime.now()))
            try:
                # create common dict for this patent; the location is taken
                # from the first raw inventor (raises, and the record is
                # skipped, when there are no raw inventors)
                loc = patent.rawinventors[0].rawlocation
                mainclass = patent.classes[0].mainclass_id if patent.classes else ''
                subclass = patent.classes[0].subclass_id if patent.classes else ''
                row = {
                    'number': patent.number,
                    'mainclass': mainclass,
                    'subclass': subclass,
                    'state': loc.state if loc else '',
                    'country': loc.country if loc else '',
                    'city': loc.city if loc else '',
                }
                row['assignee'] = get_assignee_id(patent.assignees[0]) if patent.assignees else ''
                row['rawassignee'] = get_assignee_id(patent.rawassignees[0]) if patent.rawassignees else ''
                # generate a row for each of the inventors on a patent
                for ri in patent.rawinventors:
                    namedict = {'name_first': ri.name_first, 'uuid': ri.uuid}
                    raw_name = ri.name_last.split(' ')
                    # name_last is the last space-delimited word. Middle name is everything before that
                    name_middle, name_last = ' '.join(raw_name[:-1]), raw_name[-1]
                    namedict['name_middle'] = name_middle
                    namedict['name_last'] = name_last
                    tmprow = row.copy()
                    tmprow.update(namedict)
                    outfile.write(normalize_utf8(ROW(tmprow)))
            except Exception as e:
                # best effort: report the problem and move on to the next record
                print(e)
                continue
if __name__ == '__main__':
    # Command line: consolidate.py [doctype [year]]
    #   no arguments   -> every record, default doctype
    #   doctype        -> every record of that doctype
    #   doctype year   -> only records from that year
    # NOTE: `doctype` and `gyear` are deliberately bound at module level, as
    # before; other code in this file may refer to them.
    argc = len(sys.argv)
    if argc >= 3:
        gyear = sys.argv[2]
        doctype = sys.argv[1]
        banner = 'Running ' + str(gyear) + ' ' + doctype
        print(banner)
        main(gyear, doctype)
    elif argc == 2:
        doctype = sys.argv[1]
        banner = 'Running ' + doctype
        print(banner)
        main(None, doctype)
    else:
        main(None)