def Stats(self, patns, nQuarters):
    '''Collect all the statistics needed.

    patns     -- mapping of patent number -> patent object.  Each patent must
                 supply pno, isq, rawcites, cites, citedby and the methods
                 Activity(patns) / ActivityApd(patns).
    nQuarters -- number of quarters; the per-quarter tables on self are
                 indexed 0 .. nQuarters-1.

    Accumulates per-quarter counts and activity totals (keyed by patn.isq),
    derives the per-quarter averages, then builds the cumulative series
    cumIsd, cumCitesMade and cumCitesPerPatn.  Everything is stored on self;
    nothing is returned.
    '''
    import sys  # local import: only used for the progress indicator below

    for patn in patns.values():
        if patn.pno % 100000 == 0:
            # progress indicator, rewritten in place on one terminal line
            sys.stdout.write('\r%s' % patn.pno)
        isq = patn.isq
        self.nPatns[isq] += 1
        self.nRawCitesMade[isq] += len(patn.rawcites)
        self.nCitesMade[isq] += len(patn.cites)
        self.nCitesRecd[isq] += len(patn.citedby)
        # activity stuff: element-wise running sums for this quarter
        self.totAct[isq] = [a + b for a, b in
                            zip(self.totAct[isq], patn.Activity(patns))]
        self.totActApd[isq] = [a + b for a, b in
                               zip(self.totActApd[isq], patn.ActivityApd(patns))]
    sys.stdout.write('\n')  # finish the progress line

    for isq in range(nQuarters):
        nPatns = self.nPatns[isq]
        if nPatns < 1:
            # mostly for testing: a quarter without patents keeps its defaults
            continue
        self.avgRawCitesMade[isq] = 1.0 * self.nRawCitesMade[isq] / nPatns
        self.avgCitesMade[isq] = 1.0 * self.nCitesMade[isq] / nPatns
        self.avgCitesRecd[isq] = 1.0 * self.nCitesRecd[isq] / nPatns
        for i in range(nQuarters):
            self.avgAct[isq][i] = 1.0 * self.totAct[isq][i] / nPatns
            self.avgActApd[isq][i] = 1.0 * self.totActApd[isq][i] / nPatns

    self.cumIsd = Patent.Cumulate(self.nPatns)
    self.cumCitesMade = Patent.Cumulate(self.nCitesMade)
    # Leading quarters may have a cumulative patent count of zero (the same
    # situation the nPatns guard above allows for); report 0.0 for those
    # instead of raising ZeroDivisionError.
    self.cumCitesPerPatn = [1.0 * c / n if n else 0.0
                            for c, n in zip(self.cumCitesMade, self.cumIsd)]
def parseAPD(self, match, fPatn):
    """Handle an APD record, then consume the TTL and ISD records after it.

    match -- regex match for the APD line; group(1) is the raw date string,
             parsed as YYYYMMDD.
    fPatn -- line iterator over the patent file, positioned just after the
             APD line.

    Side effects: sets self.patn['apd'], ['apq'] and ['title'], and when the
    ISD line parses, ['isd'] and ['isq'].  Patents with a missing title line,
    a missing ISD line or an unparseable ISD date are recorded in
    self.badPatns.  Finally sets self.state to 'ASSG/CLAS'.

    Dates are stored as datetime objects at midnight rather than as dates.
    """
    rawApd = match.group(1)
    if re.search(r'[12][0-9]{5}00', rawApd):
        # day-of-month given as "00": occurs not infrequently
        # (original author trusts that this parse will never go wrong)
        apd = datetime.datetime.strptime(rawApd, "%Y%m00").date()
    else:
        try:
            apd = datetime.datetime.strptime(rawApd, "%Y%m%d").date()
        except ValueError:
            # this happens frequently, often subtly wrong: 1980-06-31 &c
            # so we'll chop the end off to preserve some date
            apd = datetime.datetime.strptime(rawApd[0:6], "%Y%m").date()
            logging.warning("Bad apd date: '%s' in %d in %s",
                            rawApd, self.patn['pno'], self.fn)
    self.patn['apd'] = datetime.datetime.combine(apd, datetime.datetime.min.time())
    self.patn['apq'] = Patent.d2q(self.patn['apd'])

    # built-in next() instead of the Python-2-only iterator method .next()
    line = next(fPatn)
    match = re.match(self.reTTL, line)
    self.patn['title'] = ''
    if match:
        self.patn['title'] = make_unicode(str(match.group(1).rstrip()))
        line = next(fPatn)  # only advance if matched
    else:
        # happens to 5001050 in 1991.dat (only)
        logging.warning("APD w/o TTL in %d in %s", self.patn['pno'], self.fn)
        self.badPatns[self.patn['pno']] = self.patn

    # title continuation lines
    match = re.match(self.reMoreTTL, line)
    while match:
        self.patn['title'] += str(match.group(1).rstrip())
        line = next(fPatn)
        if re.match(self.rePATN, line) or re.match(self.reWKU, line):
            # has never happened
            logging.warning("Found PATN/WKU looking for title in %d in %s",
                            self.patn['pno'], self.fn)
            self.badPatns[self.patn['pno']] = self.patn
            break
        match = re.match(self.reMoreTTL, line)

    match = re.match(self.reISD, line)
    if match:
        try:
            isd = datetime.datetime.strptime(match.group(1), "%Y%m%d").date()
            self.patn['isd'] = datetime.datetime.combine(isd, datetime.datetime.min.time())
        except ValueError:
            # never happens
            logging.warning("Bad isd date: '%s' in %d in %s",
                            match.group(1), self.patn['pno'], self.fn)
            self.badPatns[self.patn['pno']] = self.patn
        else:
            self.patn['isq'] = Patent.d2q(self.patn['isd'])
    else:
        # never happens
        logging.warning("TTL w/o ISD in %d in %s", self.patn['pno'], self.fn)
        self.badPatns[self.patn['pno']] = self.patn
    self.state = 'ASSG/CLAS'
def handFix(patns):
    """Patch application dates (apd) that are known to be wrong in the data.

    patns -- mapping of patent number -> patent object; objects are modified
             in place through their apd / apq attributes.

    For every patent in the correction table that is present in patns, apd is
    replaced by the corrected date and apq is recomputed with Patent.d2q.
    Patents missing from patns are skipped, so the function also works on a
    partial data set.  Patent 4469216 has a date that cannot be repaired, so
    its apd and apq attributes are removed instead.
    """
    # bad, but fixable, apds: (pno, corrected date)  # garbled year in the data
    corrections = (
        (3943504, datetime.date(1975, 2, 25)),   # not 2975
        (3964954, datetime.date(1973, 5, 31)),   # not 9173
        (3969699, datetime.date(1975, 4, 11)),   # not 9175
        (4010353, datetime.date(1974, 9, 11)),   # not 9174
        (4020425, datetime.date(1976, 3, 26)),   # not 2976
        (4032532, datetime.date(1973, 3, 1)),    # not 9173
        (4041523, datetime.date(1976, 6, 1)),    # not 9176
        (4135654, datetime.date(1977, 4, 11)),   # not 9177
        (4198308, datetime.date(1978, 7, 21)),   # not 7978
        (4255928, datetime.date(1978, 12, 11)),  # not 9178
        (4474874, datetime.date(1983, 3, 11)),   # not 9183
        (4542062, datetime.date(1982, 1, 20)),   # not 2982
        (4596904, datetime.date(1984, 5, 25)),   # not 2984
        (4709214, datetime.date(1986, 4, 28)),   # not 2986
        (4725260, datetime.date(1987, 3, 24)),   # not 2987
        (4732727, datetime.date(1986, 4, 3)),    # not 9186
        (4739365, datetime.date(1987, 5, 28)),   # not 2987
    )
    for pno, apd in corrections:
        if pno not in patns:
            continue  # patent not loaded (e.g. partial run): nothing to fix
        patn = patns[pno]
        patn.apd = apd
        patn.apq = Patent.d2q(patn.apd)

    # 4469216: datetime.date(8198, 4, 5) ???? even on patn image!
    if 4469216 in patns:
        unfixable = patns[4469216]
        if hasattr(unfixable, 'apd'):
            del unfixable.apd
            # apq may be absent even when apd was set; only delete if present
            if hasattr(unfixable, 'apq'):
                del unfixable.apq
def getfrompatenttype():
    """Store the 2016 figures returned by Patent.getstat() in data.db.

    Creates table patent2016 (type, patentnum) when it does not exist yet,
    then calls insertdata() once for each of the first 200 data nodes whose
    second 'wds' entry has valuecode '2016'.  The value passed as the row's
    type is str(node index // 10).

    The transaction is committed only when every insert succeeded; the cursor
    and the connection are closed in all cases.
    """
    getdata = Patent.getstat()
    conn = sqlite3.connect('data.db')
    try:
        # Create a cursor.
        cursor = conn.cursor()
        try:
            try:
                # The try lets the run continue when the table already exists.
                cursor.execute('create table patent2016 ( \
                    type varchar(20) primary key, \
                    patentnum varchar(20))')
            except sqlite3.OperationalError:
                print('数据库已被创建!')
            else:
                print("成功创建数据库!")

            # Overall figures for the most recent eight years.  Only the
            # first 200 nodes are examined; slicing (rather than indexing
            # 0..199) avoids an IndexError when fewer nodes are returned.
            datanodes = getdata['returndata']['datanodes']
            for i, node in enumerate(datanodes[:200]):
                if node['wds'][1]['valuecode'] == '2016':
                    # the primary key column is 'type'
                    insertdata(cursor, 'patent2016', str(i // 10),
                               'patentnum', node['data']['data'],
                               type='type')
        finally:
            # Close the cursor.
            cursor.close()
        # Commit the transaction.
        conn.commit()
    finally:
        # Close the connection.
        conn.close()
def handFix(patns):
    """Patch application dates (apd) that are known to be wrong in the data.

    patns -- mapping of patent number -> patent object; objects are modified
             in place through their apd / apq attributes.

    For every patent in the correction table that is present in patns, apd is
    replaced by the corrected date and apq is recomputed with Patent.d2q.
    Patents missing from patns are skipped, so the function also works on a
    partial data set.  Patent 4469216 has a date that cannot be repaired, so
    its apd and apq attributes are removed instead.
    """
    # bad, but fixable, apds: (pno, corrected date)  # garbled year in the data
    corrections = (
        (3943504, datetime.date(1975, 2, 25)),   # not 2975
        (3964954, datetime.date(1973, 5, 31)),   # not 9173
        (3969699, datetime.date(1975, 4, 11)),   # not 9175
        (4010353, datetime.date(1974, 9, 11)),   # not 9174
        (4020425, datetime.date(1976, 3, 26)),   # not 2976
        (4032532, datetime.date(1973, 3, 1)),    # not 9173
        (4041523, datetime.date(1976, 6, 1)),    # not 9176
        (4135654, datetime.date(1977, 4, 11)),   # not 9177
        (4198308, datetime.date(1978, 7, 21)),   # not 7978
        (4255928, datetime.date(1978, 12, 11)),  # not 9178
        (4474874, datetime.date(1983, 3, 11)),   # not 9183
        (4542062, datetime.date(1982, 1, 20)),   # not 2982
        (4596904, datetime.date(1984, 5, 25)),   # not 2984
        (4709214, datetime.date(1986, 4, 28)),   # not 2986
        (4725260, datetime.date(1987, 3, 24)),   # not 2987
        (4732727, datetime.date(1986, 4, 3)),    # not 9186
        (4739365, datetime.date(1987, 5, 28)),   # not 2987
    )
    for pno, apd in corrections:
        if pno not in patns:
            continue  # patent not loaded (e.g. partial run): nothing to fix
        patn = patns[pno]
        patn.apd = apd
        patn.apq = Patent.d2q(patn.apd)

    # 4469216: datetime.date(8198, 4, 5) ???? even on patn image!
    if 4469216 in patns:
        unfixable = patns[4469216]
        if hasattr(unfixable, 'apd'):
            del unfixable.apd
            # apq may be absent even when apd was set; only delete if present
            if hasattr(unfixable, 'apq'):
                del unfixable.apq
def Activity(patn, f):
    '''Return Patent.Cumulate of the patent's weighted hits per quarter.

    patn -- the cited patent; patn.citedby lists the numbers of the patents
            citing it.
    f    -- weight function, called as f(citingPatn, patn).

    Each citing patent adds its weight to the slot given by its own isq.
    nQuarters and patns are taken from the enclosing scope.
    '''
    hits = [0] * nQuarters
    for citing in (patns[number] for number in patn.citedby):
        hits[citing.isq] += f(citing, patn)
    return Patent.Cumulate(hits)
def _create_patent_objects(self):
    """Build one Patent per entry of self.patent_ids; keep them in self.patent_list."""
    print('Creation of the patent Python objects')
    self.patent_list = []
    # list() copies the ids before iterating; objects are appended one by one
    self.patent_list.extend(Patent(one_id) for one_id in list(self.patent_ids))
def parseXMLDom(self, dom):
    """Build the patent record (a dict) for one parsed patent XML document.

    dom -- DOM tree of a single patent document; accessed through
           getElementsByTagName / childNodes / nodeType / data.

    Returns the dict stored in self.patn, with keys 'pno', 'isd', 'isq',
    'apd', 'apq', 'uspc', 'ipc', 'title', 'rawcites', 'cites', 'citedby'
    and, when the document has them, 'assignee' and 'abstract'.  Returns
    None, leaving self.patn untouched, when the publication doc-number is
    not an integer.

    NOTE(review): str(x.encode('ascii', 'replace')) yields plain text on
    Python 2; on Python 3 it would produce a "b'...'" repr -- confirm the
    target interpreter before porting.
    """
    elmPubRef = (dom.getElementsByTagName('publication-reference')[0]
                 .getElementsByTagName('document-id')[0])
    try:
        pno = int(elmPubRef.getElementsByTagName('doc-number')[0].childNodes[0].data)
    except ValueError:
        # presume that pno found is not a utility patent and ignore
        return
    self.patn = {'rawcites': [], 'cites': [], 'citedby': [], 'pno': pno}

    isd = elmPubRef.getElementsByTagName('date')[0].childNodes[0].data
    isd = datetime.datetime.strptime(isd, "%Y%m%d").date()
    # Mongo cannot accept dates, only date+time. Gotta pad isd with time = midnight.
    self.patn['isd'] = datetime.datetime.combine(isd, datetime.datetime.min.time())
    self.patn['isq'] = Patent.d2q(self.patn['isd'])

    elmAppRef = (dom.getElementsByTagName('application-reference')[0]
                 .getElementsByTagName('document-id')[0])
    apd = elmAppRef.getElementsByTagName('date')[0].childNodes[0].data
    apd = datetime.datetime.strptime(apd, "%Y%m%d").date()
    # again, have to pad the date to make it a date+time
    self.patn['apd'] = datetime.datetime.combine(apd, datetime.datetime.min.time())
    self.patn['apq'] = Patent.d2q(self.patn['apd'])  # NB: may be out of nQuarters range

    uspc = dom.getElementsByTagName('classification-national')[0]
    uspc = uspc.getElementsByTagName('main-classification')[0].childNodes[0].data
    self.patn['uspc'] = str(uspc.encode('ascii', 'replace'))

    # the IPC element is either classification-ipc or classification-ipcr;
    # take the first one found
    elmIpc = (dom.getElementsByTagName('classification-ipc') +
              dom.getElementsByTagName('classification-ipcr'))[0]
    try:
        # for classification-ipcr which breaks out each part of the IPC
        # section-class-subclass group/subgroup
        # NB: this is not the same format for these as in the DAT files
        ipc = "%s%s%s %s/%s" % tuple([x.childNodes[0].data
                                      for x in elmIpc.childNodes[5:14]
                                      if x.nodeType == 1])
    except IndexError:
        # sometimes the main-group is just <main-group/> instead of a real value
        # this is treated as '1' in the online database
        # so that's what we'll do too
        ipc1 = "%s%s%s" % tuple([x.childNodes[0].data
                                 for x in elmIpc.childNodes[5:10]
                                 if x.nodeType == 1])
        # Use the text of the last element in the slice, not the DOM node
        # itself (formatting the node would embed its repr in the IPC code).
        subgroup = elmIpc.childNodes[13]
        # NOTE(review): an empty subgroup element is rendered as '' -- confirm
        subgroupText = subgroup.childNodes[0].data if subgroup.childNodes else ''
        ipc = ipc1 + " 01/%s" % (subgroupText,)
    except TypeError:
        # for classification-ipc which just gives a single string for each IPC
        ipc = elmIpc.getElementsByTagName('main-classification')[0].childNodes[0].data
    self.patn['ipc'] = str(ipc.encode('ascii', 'replace'))

    elmsTitles = dom.getElementsByTagName('invention-title')[0].childNodes
    self.patn['title'] = ''
    for node in elmsTitles:
        # sometimes the title has subelements like italic text or what have you
        # sometimes the subelements don't have text at the bottom
        # 7632827, I'm looking at you here
        while node.nodeType != node.TEXT_NODE and node.childNodes:
            node = node.childNodes[0]
        if node.nodeType == node.TEXT_NODE:
            self.patn['title'] += str(node.data.encode('ascii', 'replace'))
        else:
            logging.warning('Skipped part of title %d in %s: %s',
                            self.patn['pno'], self.fn, node)

    elmAssig = dom.getElementsByTagName('assignees')
    if elmAssig:
        elmAssig = elmAssig[0]
        # sometimes it's an orgname, sometimes first + last, get all
        ass = (elmAssig.getElementsByTagName('orgname') +
               elmAssig.getElementsByTagName('first-name') +
               elmAssig.getElementsByTagName('last-name'))
        self.patn['assignee'] = str(' '.join(
            [x.childNodes[0].data.encode('ascii', 'replace') for x in ass]))

    # DB: loads the abstracts, modelled on the assignee block above
    elmAbstract = dom.getElementsByTagName('abstract')
    if elmAbstract:
        pgraphs = elmAbstract[0].getElementsByTagName('p')
        # The text comes from self.handleTok: a plain join of
        # p.childNodes[0].data failed with "Element instance has no
        # attribute 'data'" (see
        # https://mail.python.org/pipermail/tutor/2004-July/030397.html).
        self.patn['abstract'] = str(self.handleTok(pgraphs).encode('ascii', 'replace'))

    elmRefCit = dom.getElementsByTagName('references-cited')
    if not elmRefCit:
        # the data changes citation field-name convention between '05 and '14.
        elmRefCit = dom.getElementsByTagName('us-references-cited')
    if elmRefCit:
        for cite in elmRefCit[0].getElementsByTagName('patcit'):
            if cite.getElementsByTagName('country')[0].childNodes[0].data == 'US':
                try:
                    citedPno = int(cite.getElementsByTagName('doc-number')[0].childNodes[0].data)
                except ValueError:
                    # presume that pno cited is not a utility patent and ignore
                    continue
                else:
                    self.patn['rawcites'].append(citedPno)
    else:
        # No node is logged here: the only candidate was the leftover title
        # loop variable, which is unrelated to citations and is unbound when
        # the title element has no children.
        logging.warning('No citations for %d in %s', self.patn['pno'], self.fn)
    return self.patn
def parseXMLDom(self, dom):
    """Build the Patent object for one parsed patent XML document.

    dom -- DOM tree of a single patent document; accessed through
           getElementsByTagName / childNodes / nodeType / data.

    Returns the Patent.Patent instance stored in self.patn, with isd, isq,
    apd, apq, uspc, ipc, title and rawcites set, plus assignee when the
    document has an assignees element.  Returns None, leaving self.patn
    untouched, when the publication doc-number is not an integer.

    NOTE(review): str(x.encode('ascii', 'replace')) yields plain text on
    Python 2; on Python 3 it would produce a "b'...'" repr -- confirm the
    target interpreter before porting.
    """
    elmPubRef = (dom.getElementsByTagName('publication-reference')[0]
                 .getElementsByTagName('document-id')[0])
    try:
        pno = int(elmPubRef.getElementsByTagName('doc-number')[0].childNodes[0].data)
    except ValueError:
        # presume that pno found is not a utility patent and ignore
        return
    self.patn = Patent.Patent(pno)

    isd = elmPubRef.getElementsByTagName('date')[0].childNodes[0].data
    self.patn.isd = datetime.datetime.strptime(isd, "%Y%m%d").date()
    self.patn.isq = Patent.d2q(self.patn.isd)

    elmAppRef = (dom.getElementsByTagName('application-reference')[0]
                 .getElementsByTagName('document-id')[0])
    apd = elmAppRef.getElementsByTagName('date')[0].childNodes[0].data
    self.patn.apd = datetime.datetime.strptime(apd, "%Y%m%d").date()
    self.patn.apq = Patent.d2q(self.patn.apd)  # NB: may be out of nQuarters range

    uspc = dom.getElementsByTagName('classification-national')[0]
    uspc = uspc.getElementsByTagName('main-classification')[0].childNodes[0].data
    self.patn.uspc = str(uspc.encode('ascii', 'replace'))

    # they switched from classification-ipc to classification-ipcr at some point, search both
    elmIpc = (dom.getElementsByTagName('classification-ipc') +
              dom.getElementsByTagName('classification-ipcr'))[0]
    try:
        # for classification-ipcr which breaks out each part of the IPC
        # section-class-subclass group/subgroup
        # NB: this is not the same format for these as in the DAT files
        ipc = "%s%s%s %s/%s" % tuple([x.childNodes[0].data
                                      for x in elmIpc.childNodes[5:14]
                                      if x.nodeType == 1])
    except IndexError:
        # sometimes the main-group is just <main-group/> instead of a real value
        # this is treated as '1' in the online database
        # so that's what we'll do too
        ipc1 = "%s%s%s" % tuple([x.childNodes[0].data
                                 for x in elmIpc.childNodes[5:10]
                                 if x.nodeType == 1])
        # Use the text of the last element in the slice, not the DOM node
        # itself (formatting the node would embed its repr in the IPC code).
        subgroup = elmIpc.childNodes[13]
        # NOTE(review): an empty subgroup element is rendered as '' -- confirm
        subgroupText = subgroup.childNodes[0].data if subgroup.childNodes else ''
        ipc = ipc1 + " 01/%s" % (subgroupText,)
    except TypeError:
        # for classification-ipc which just gives a single string for each IPC
        ipc = elmIpc.getElementsByTagName('main-classification')[0].childNodes[0].data
    self.patn.ipc = str(ipc.encode('ascii', 'replace'))

    elmsTitles = dom.getElementsByTagName('invention-title')[0].childNodes
    self.patn.title = ''
    for node in elmsTitles:
        # sometimes the title has subelements like italic text or what have you
        # sometimes the subelements don't have text at the bottom
        # 7632827, I'm looking at you here
        while node.nodeType != node.TEXT_NODE and node.childNodes:
            node = node.childNodes[0]
        if node.nodeType == node.TEXT_NODE:
            self.patn.title += str(node.data.encode('ascii', 'replace'))
        else:
            logging.warning('Skipped part of title %d in %s: %s',
                            self.patn.pno, self.fn, node)

    elmAssig = dom.getElementsByTagName('assignees')
    if elmAssig:
        elmAssig = elmAssig[0]
        # sometimes it's an orgname, sometimes first + last, get all
        ass = (elmAssig.getElementsByTagName('orgname') +
               elmAssig.getElementsByTagName('first-name') +
               elmAssig.getElementsByTagName('last-name'))
        self.patn.assignee = str(' '.join(
            [x.childNodes[0].data.encode('ascii', 'replace') for x in ass]))

    self.patn.rawcites = []
    elmRefCit = dom.getElementsByTagName('references-cited')
    if not elmRefCit:
        # some documents name the citation list 'us-references-cited'
        # instead; only consulted when 'references-cited' is absent
        elmRefCit = dom.getElementsByTagName('us-references-cited')
    if elmRefCit:
        for cite in elmRefCit[0].getElementsByTagName('patcit'):
            if cite.getElementsByTagName('country')[0].childNodes[0].data == 'US':
                try:
                    citedPno = int(cite.getElementsByTagName('doc-number')[0].childNodes[0].data)
                except ValueError:
                    # presume that pno cited is not a utility patent and ignore
                    continue
                else:
                    self.patn.rawcites.append(citedPno)
    return self.patn
def parseXMLDom(self, dom):
    """Build the patent record (a dict) for one parsed patent XML document.

    dom -- DOM tree of a single patent document; accessed through
           getElementsByTagName / childNodes / nodeType / data.

    Returns the dict stored in self.patn, with keys 'pno', 'isd', 'isq',
    'apd', 'apq', 'uspc', 'ipc', 'title', 'rawcites', 'cites', 'citedby'
    and, when the document has them, 'assignee' and 'abstract'.  Returns
    None, leaving self.patn untouched, when the publication doc-number is
    not an integer.

    NOTE(review): str(x.encode('ascii', 'replace')) yields plain text on
    Python 2; on Python 3 it would produce a "b'...'" repr -- confirm the
    target interpreter before porting.
    """
    elmPubRef = (dom.getElementsByTagName('publication-reference')[0]
                 .getElementsByTagName('document-id')[0])
    try:
        pno = int(elmPubRef.getElementsByTagName('doc-number')[0].childNodes[0].data)
    except ValueError:
        # presume that pno found is not a utility patent and ignore
        return
    self.patn = {'rawcites': [], 'cites': [], 'citedby': [], 'pno': pno}

    isd = elmPubRef.getElementsByTagName('date')[0].childNodes[0].data
    isd = datetime.datetime.strptime(isd, "%Y%m%d").date()
    # Mongo cannot accept dates, only date+time. Gotta pad isd with time = midnight.
    self.patn['isd'] = datetime.datetime.combine(isd, datetime.datetime.min.time())
    self.patn['isq'] = Patent.d2q(self.patn['isd'])

    elmAppRef = (dom.getElementsByTagName('application-reference')[0]
                 .getElementsByTagName('document-id')[0])
    apd = elmAppRef.getElementsByTagName('date')[0].childNodes[0].data
    apd = datetime.datetime.strptime(apd, "%Y%m%d").date()
    # again, have to pad the date to make it a date+time
    self.patn['apd'] = datetime.datetime.combine(apd, datetime.datetime.min.time())
    self.patn['apq'] = Patent.d2q(self.patn['apd'])  # NB: may be out of nQuarters range

    uspc = dom.getElementsByTagName('classification-national')[0]
    uspc = uspc.getElementsByTagName('main-classification')[0].childNodes[0].data
    self.patn['uspc'] = str(uspc.encode('ascii', 'replace'))

    # the IPC element is either classification-ipc or classification-ipcr;
    # take the first one found
    elmIpc = (dom.getElementsByTagName('classification-ipc') +
              dom.getElementsByTagName('classification-ipcr'))[0]
    try:
        # for classification-ipcr which breaks out each part of the IPC
        # section-class-subclass group/subgroup
        # NB: this is not the same format for these as in the DAT files
        ipc = "%s%s%s %s/%s" % tuple([x.childNodes[0].data
                                      for x in elmIpc.childNodes[5:14]
                                      if x.nodeType == 1])
    except IndexError:
        # sometimes the main-group is just <main-group/> instead of a real value
        # this is treated as '1' in the online database
        # so that's what we'll do too
        ipc1 = "%s%s%s" % tuple([x.childNodes[0].data
                                 for x in elmIpc.childNodes[5:10]
                                 if x.nodeType == 1])
        # Use the text of the last element in the slice, not the DOM node
        # itself (formatting the node would embed its repr in the IPC code).
        subgroup = elmIpc.childNodes[13]
        # NOTE(review): an empty subgroup element is rendered as '' -- confirm
        subgroupText = subgroup.childNodes[0].data if subgroup.childNodes else ''
        ipc = ipc1 + " 01/%s" % (subgroupText,)
    except TypeError:
        # for classification-ipc which just gives a single string for each IPC
        ipc = elmIpc.getElementsByTagName('main-classification')[0].childNodes[0].data
    self.patn['ipc'] = str(ipc.encode('ascii', 'replace'))

    elmsTitles = dom.getElementsByTagName('invention-title')[0].childNodes
    self.patn['title'] = ''
    for node in elmsTitles:
        # sometimes the title has subelements like italic text or what have you
        # sometimes the subelements don't have text at the bottom
        # 7632827, I'm looking at you here
        while node.nodeType != node.TEXT_NODE and node.childNodes:
            node = node.childNodes[0]
        if node.nodeType == node.TEXT_NODE:
            self.patn['title'] += str(node.data.encode('ascii', 'replace'))
        else:
            logging.warning('Skipped part of title %d in %s: %s',
                            self.patn['pno'], self.fn, node)

    elmAssig = dom.getElementsByTagName('assignees')
    if elmAssig:
        elmAssig = elmAssig[0]
        # sometimes it's an orgname, sometimes first + last, get all
        ass = (elmAssig.getElementsByTagName('orgname') +
               elmAssig.getElementsByTagName('first-name') +
               elmAssig.getElementsByTagName('last-name'))
        self.patn['assignee'] = str(' '.join(
            [x.childNodes[0].data.encode('ascii', 'replace') for x in ass]))

    # DB: loads the abstracts, modelled on the assignee block above
    elmAbstract = dom.getElementsByTagName('abstract')
    if elmAbstract:
        pgraphs = elmAbstract[0].getElementsByTagName('p')
        # The text comes from self.handleTok: a plain join of
        # p.childNodes[0].data failed with "Element instance has no
        # attribute 'data'" (see
        # https://mail.python.org/pipermail/tutor/2004-July/030397.html).
        self.patn['abstract'] = str(self.handleTok(pgraphs).encode('ascii', 'replace'))

    elmRefCit = dom.getElementsByTagName('references-cited')
    if not elmRefCit:
        # the data changes citation field-name convention between '05 and '14.
        elmRefCit = dom.getElementsByTagName('us-references-cited')
    if elmRefCit:
        for cite in elmRefCit[0].getElementsByTagName('patcit'):
            if cite.getElementsByTagName('country')[0].childNodes[0].data == 'US':
                try:
                    citedPno = int(cite.getElementsByTagName('doc-number')[0].childNodes[0].data)
                except ValueError:
                    # presume that pno cited is not a utility patent and ignore
                    continue
                else:
                    self.patn['rawcites'].append(citedPno)
    else:
        # No node is logged here: the only candidate was the leftover title
        # loop variable, which is unrelated to citations and is unbound when
        # the title element has no children.
        logging.warning('No citations for %d in %s', self.patn['pno'], self.fn)
    return self.patn