Exemple #1
0
	def Stats(self, patns, nQuarters):
		'''Collect all the statistics needed'''
		for patn in patns.itervalues():
			if patn.pno % 100000 == 0:
				print '\r', patn.pno,
				
			isq = patn.isq
			self.nPatns[isq] += 1
			self.nRawCitesMade[isq] += len(patn.rawcites)
			self.nCitesMade[isq] += len(patn.cites)
			self.nCitesRecd[isq] += len(patn.citedby)
			
			# activity stuff
			self.totAct[isq] = [a+b for a,b in zip(self.totAct[isq],patn.Activity(patns))]
			self.totActApd[isq] = [a+b for a,b in zip(self.totActApd[isq],patn.ActivityApd(patns))]
		print
			
		for isq in range(nQuarters):
			nPatns = self.nPatns[isq]
			if nPatns < 1:	# mostly for testing
				continue
			self.avgRawCitesMade[isq] = 1.0 * self.nRawCitesMade[isq] / nPatns
			self.avgCitesMade[isq] = 1.0 * self.nCitesMade[isq] / nPatns
			self.avgCitesRecd[isq] = 1.0 * self.nCitesRecd[isq] / nPatns
			for i in range(nQuarters):
				self.avgAct[isq][i] = 1.0 * self.totAct[isq][i] / nPatns
				self.avgActApd[isq][i] = 1.0 * self.totActApd[isq][i] / nPatns
		
		self.cumIsd = Patent.Cumulate(self.nPatns)
		self.cumCitesMade = Patent.Cumulate(self.nCitesMade)
		self.cumCitesPerPatn = [1.0 * c / n for c,n in zip(self.cumCitesMade,self.cumIsd)]
Exemple #2
0
	def parseAPD(self, match, fPatn):
		'''Handle an APD line, then the TTL line(s) and the ISD line that follow it.

		match -- regex match whose group(1) is the application date as YYYYMMDD text
		fPatn -- line iterator over the patent file; advanced as lines are consumed

		Fills self.patn['apd'], ['apq'] and ['title'], plus ['isd'] and ['isq'] when
		the ISD line parses.  A record with no TTL line, no ISD line or an
		unparseable ISD is logged and stored in self.badPatns under its pno.
		Always leaves self.state set to 'ASSG/CLAS'.
		'''
		rawApd = match.group(1)
		if re.search(r'[12][0-9]{5}00', rawApd):	# occurs not infrequently
			# day given as 00; I trust that this will never go wrong
			apd = datetime.datetime.strptime(rawApd, "%Y%m00").date()
		else:
			try:
				apd = datetime.datetime.strptime(rawApd, "%Y%m%d").date()
			except ValueError:
				# this happens frequently, often subtly wrong: 1980-06-31 &c
				# so we'll chop the end off to preserve some date
				apd = datetime.datetime.strptime(rawApd[0:6], "%Y%m").date()
				logging.warning("Bad apd date: '%s' in %d in %s", rawApd, self.patn['pno'], self.fn)
		# stored as a datetime at midnight rather than a bare date
		self.patn['apd'] = datetime.datetime.combine(apd, datetime.datetime.min.time())
		self.patn['apq'] = Patent.d2q(self.patn['apd'])

		line = next(fPatn)

		match = re.match(self.reTTL, line)
		self.patn['title'] = ''
		if match:
			self.patn['title'] = make_unicode(str(match.group(1).rstrip()))
			line = next(fPatn) # only advance if matched
		else:
			# happens to 5001050 in 1991.dat (only)
			logging.warning("APD w/o TTL in %d in %s", self.patn['pno'], self.fn)
			self.badPatns[self.patn['pno']] = self.patn

		match = re.match(self.reMoreTTL, line)
		while match:
			# continuation lines go through make_unicode exactly like the first
			# title line, so the title is never a mix of converted and raw text
			self.patn['title'] += make_unicode(str(match.group(1).rstrip()))
			line = next(fPatn)
			if re.match(self.rePATN, line) or re.match(self.reWKU, line):
				# has never happened
				logging.warning("Found PATN/WKU looking for title in %d in %s", self.patn['pno'], self.fn)
				self.badPatns[self.patn['pno']] = self.patn
				break
			match = re.match(self.reMoreTTL, line)

		match = re.match(self.reISD, line)
		if match:
			try:
				isd = datetime.datetime.strptime(match.group(1), "%Y%m%d").date()
				self.patn['isd'] = datetime.datetime.combine(isd, datetime.datetime.min.time())
			except ValueError:
				# never happens
				logging.warning("Bad isd date: '%s' in %d in %s", match.group(1), self.patn['pno'], self.fn)
				self.badPatns[self.patn['pno']] = self.patn
			else:
				self.patn['isq'] = Patent.d2q(self.patn['isd'])
		else:
			# never happens
			logging.warning("TTL w/o ISD in %d in %s", self.patn['pno'], self.fn)
			self.badPatns[self.patn['pno']] = self.patn
		self.state = 'ASSG/CLAS'
Exemple #3
0
	def handFix(patns):
		'''Overwrite known-bad application dates in patns and drop one unfixable one.

		patns -- mapping from patent number to patent object; modified in place

		Each listed patent gets a corrected apd, then an apq recomputed from it with
		Patent.d2q.  Patent 4469216 loses its apd and apq when it has an apd.
		'''
		# bad, but fixable, apds: (pno, (year, month, day))
		fixes = (
			(3943504, (1975, 2, 25)),	# not 2975
			(3964954, (1973, 5, 31)),	# not 9173
			(3969699, (1975, 4, 11)),	# not 9175
			(4010353, (1974, 9, 11)),	# not 9174
			(4020425, (1976, 3, 26)),	# not 2976
			(4032532, (1973, 3, 1)),	# not 9173
			(4041523, (1976, 6, 1)),	# not 9176
			(4135654, (1977, 4, 11)),	# not 9177
			(4198308, (1978, 7, 21)),	# not 7978
			(4255928, (1978, 12, 11)),	# not 9178
			(4474874, (1983, 3, 11)),	# not 9183
			(4542062, (1982, 1, 20)),	# not 2982
			(4596904, (1984, 5, 25)),	# not 2984
			(4709214, (1986, 4, 28)),	# not 2986
			(4725260, (1987, 3, 24)),	# not 2987
			(4732727, (1986, 4, 3)),	# not 9186
			(4739365, (1987, 5, 28)),	# not 2987
		)
		# first pass: corrected dates
		for pno, ymd in fixes:
			patns[pno].apd = datetime.date(*ymd)
		# second pass: quarters derived from the corrected dates
		for pno, _ in fixes:
			fixed = patns[pno]
			fixed.apq = Patent.d2q(fixed.apd)

		# datetime.date(8198, 4, 5) ???? even on patn image!
		hopeless = patns[4469216]
		if hasattr(hopeless, 'apd'):
			del hopeless.apd, hopeless.apq
Exemple #4
0
def getfrompatenttype():
    """Store the 2016 patent figures from ``Patent.getstat()`` in ``data.db``.

    Creates the ``patent2016`` table when it does not exist yet, then calls
    ``insertdata`` once for every data node (among the first 200) whose
    second ``wds`` entry has ``valuecode`` ``'2016'``.  The row key is the
    node index divided by ten.  The transaction is committed after all rows
    are inserted, and the connection is closed even if an insert raises.
    """
    getdata = Patent.getstat()
    conn = sqlite3.connect('data.db')
    try:
        # Create a cursor.
        cursor = conn.cursor()
        try:  # try/except lets the code carry on when the table already exists
            cursor.execute('create table patent2016 (   \
                        type varchar(20) primary key,   \
                        patentnum varchar(20))')
        except sqlite3.OperationalError:
            print('数据库已被创建!')
        else:
            print("成功创建数据库!")

        # Overall figures for the last eight years.  Slicing (rather than
        # indexing 0..199) avoids an IndexError when fewer nodes come back.
        datanodes = getdata['returndata']['datanodes']
        for i, node in enumerate(datanodes[:200]):
            if node['wds'][1]['valuecode'] == '2016':
                insertdata(cursor,
                           'patent2016',
                           str(i // 10),
                           'patentnum',
                           node['data']['data'],
                           type='type')  # the primary key column is "type"
        # Close the cursor.
        cursor.close()
        # Commit the transaction.
        conn.commit()
    finally:
        # Close the connection, also on failure, so the database handle
        # is never leaked.
        conn.close()
    def handFix(patns):
        """Patch known-bad application dates in patns and remove one unfixable one.

        patns maps patent number to patent object and is modified in place:
        every listed patent gets a corrected apd, then an apq recomputed from
        it with Patent.d2q.  Patent 4469216 has its apd and apq deleted when
        it carries an apd.
        """
        # bad, but fixable, apds: (pno, year, month, day)
        corrections = [
            (3943504, 1975, 2, 25),  # not 2975
            (3964954, 1973, 5, 31),  # not 9173
            (3969699, 1975, 4, 11),  # not 9175
            (4010353, 1974, 9, 11),  # not 9174
            (4020425, 1976, 3, 26),  # not 2976
            (4032532, 1973, 3, 1),  # not 9173
            (4041523, 1976, 6, 1),  # not 9176
            (4135654, 1977, 4, 11),  # not 9177
            (4198308, 1978, 7, 21),  # not 7978
            (4255928, 1978, 12, 11),  # not 9178
            (4474874, 1983, 3, 11),  # not 9183
            (4542062, 1982, 1, 20),  # not 2982
            (4596904, 1984, 5, 25),  # not 2984
            (4709214, 1986, 4, 28),  # not 2986
            (4725260, 1987, 3, 24),  # not 2987
            (4732727, 1986, 4, 3),  # not 9186
            (4739365, 1987, 5, 28),  # not 2987
        ]
        # all dates are corrected before any quarter is recomputed
        for pno, year, month, day in corrections:
            patns[pno].apd = datetime.date(year, month, day)
        for entry in corrections:
            target = patns[entry[0]]
            target.apq = Patent.d2q(target.apd)

        # datetime.date(8198, 4, 5) ???? even on patn image!
        unfixable = patns[4469216]
        if hasattr(unfixable, 'apd'):
            del unfixable.apd, unfixable.apq
Exemple #6
0
def Activity(patn, f):
	'''Return Patent.Cumulate of the patent's per-quarter hits, weighted via f(citingPatn, patn).

	patn -- patent whose citedby lists the numbers of the patents citing it
	f -- weight function called as f(citingPatn, patn)

	Relies on the module-level nQuarters and patns.
	'''
	perQuarter = [0] * nQuarters

	# each citing patent adds its weight to the slot of its own quarter (isq)
	for pno in patn.citedby:
		citer = patns[pno]
		perQuarter[citer.isq] += f(citer, patn)

	return Patent.Cumulate(perQuarter)
Exemple #7
0
 def _create_patent_objects(self):
     """
     Fill self.patent_list with one Patent instance per entry of self.patent_ids
     """
     print('Creation of the patent Python objects')
     self.patent_list = []
     # build from a snapshot of the ids; objects are appended in iteration order
     self.patent_list.extend(Patent(pid) for pid in list(self.patent_ids))
Exemple #8
0
	def parseXMLDom(self, dom):
		'''Build the record dict for one parsed XML patent document.

		dom -- DOM document holding a single patent

		Returns self.patn, a dict with the keys 'pno', 'isd', 'isq', 'apd', 'apq',
		'uspc', 'ipc', 'title', 'rawcites', 'cites' and 'citedby', plus 'assignee'
		and 'abstract' when the document has those elements.  Returns None when the
		document number is not an integer.
		'''
		elmPubRef = dom.getElementsByTagName('publication-reference')[0].getElementsByTagName('document-id')[0]
		try:
			pno = int(elmPubRef.getElementsByTagName('doc-number')[0].childNodes[0].data)
		except ValueError:
			# presume that pno found is not a utility patent and ignore
			return
		self.patn = {
			'rawcites' : [],
			'cites' : [],
			'citedby': [],
			'pno' : pno
		}

		isd = elmPubRef.getElementsByTagName('date')[0].childNodes[0].data
		isd = datetime.datetime.strptime(isd, "%Y%m%d").date()
		# Mongo cannot accept dates, only date+time. Gotta pad isd with time = midnight.
		self.patn['isd'] = datetime.datetime.combine(isd, datetime.datetime.min.time())
		self.patn['isq'] = Patent.d2q(self.patn['isd'])

		elmAppRef = dom.getElementsByTagName('application-reference')[0].getElementsByTagName('document-id')[0]
		apd = elmAppRef.getElementsByTagName('date')[0].childNodes[0].data
		apd = datetime.datetime.strptime(apd, "%Y%m%d").date()
		# again, have to pad the date to make it a date+time
		self.patn['apd'] = datetime.datetime.combine(apd, datetime.datetime.min.time())
		self.patn['apq'] = Patent.d2q(self.patn['apd'])	# NB: may be out of nQuarters range

		uspc = dom.getElementsByTagName('classification-national')[0]
		uspc = uspc.getElementsByTagName('main-classification')[0].childNodes[0].data
		self.patn['uspc'] = str(uspc.encode('ascii','replace'))

		# the element is either classification-ipc or classification-ipcr, search both
		ipc = (dom.getElementsByTagName('classification-ipc') + dom.getElementsByTagName('classification-ipcr'))[0]
		try:
			# for classification-ipcr which breaks out each part of the IPC
			# section-class-subclass group/subgroup
			# NB: this is not the same format for these as in the DAT files
			ipc = "%s%s%s %s/%s" % tuple([x.childNodes[0].data for x in ipc.childNodes[5:14] if x.nodeType == 1])
		except IndexError:
			# sometimes the main-group is just <main-group/> instead of a real value
			# this is treated as '1' in the online database
			# so that's what we'll do too
			ipc1 = "%s%s%s" % tuple([x.childNodes[0].data for x in ipc.childNodes[5:10] if x.nodeType == 1])
			# use the subgroup's text, not the DOM node object itself; an empty
			# subgroup contributes nothing rather than raising again
			subgroup = ipc.childNodes[13]
			subgroupText = subgroup.childNodes[0].data if subgroup.childNodes else ''
			ipc = ipc1 + " 01/%s" % (subgroupText,)
		except TypeError:
			# for classification-ipc which just gives a single string for each IPC
			ipc = ipc.getElementsByTagName('main-classification')[0].childNodes[0].data
		self.patn['ipc'] = str(ipc.encode('ascii','replace'))

		elmsTitles = dom.getElementsByTagName('invention-title')[0].childNodes
		self.patn['title'] = ''
		for node in elmsTitles:
			# sometimes the title has subelements like italic text or what have you
			# sometimes the subelements don't have text at the bottom
			# 7632827, I'm looking at you here
			while node.nodeType != node.TEXT_NODE and node.childNodes:
				node = node.childNodes[0]
			if node.nodeType == node.TEXT_NODE:
				self.patn['title'] += str(node.data.encode('ascii','replace'))
			else:
				logging.warning('Skipped part of title %d in %s: %s', self.patn['pno'], self.fn, node)

		elmAssig = dom.getElementsByTagName('assignees')
		if elmAssig:
			elmAssig = elmAssig[0]
			# sometimes it's an orgname, sometimes first + last, get all
			ass = (elmAssig.getElementsByTagName('orgname') + elmAssig.getElementsByTagName('first-name') + elmAssig.getElementsByTagName('last-name'))
			self.patn['assignee'] = str(' '.join([x.childNodes[0].data.encode('ascii','replace') for x in ass]))

		# DB: I wrote this part to load the abstracts, copying the elmAssig codeblock above
		elmAbstract = dom.getElementsByTagName('abstract')
		if elmAbstract:
			pgraphs = elmAbstract[0].getElementsByTagName('p')
			# the paragraphs are flattened by self.handleTok rather than by reading
			# p.childNodes[0].data, which failed with
			# "Element instance has no attribute 'data'"
			# see https://mail.python.org/pipermail/tutor/2004-July/030397.html
			self.patn['abstract'] = str(self.handleTok(pgraphs).encode('ascii','replace'))

		elmRefCit = dom.getElementsByTagName('references-cited')
		if not elmRefCit:
			# the data changes citation field-name convention between '05 and '14.
			elmRefCit = dom.getElementsByTagName('us-references-cited')

		if elmRefCit:
			for cite in elmRefCit[0].getElementsByTagName('patcit'):
				if cite.getElementsByTagName('country')[0].childNodes[0].data != 'US':
					continue
				try:
					cited = int(cite.getElementsByTagName('doc-number')[0].childNodes[0].data)
				except ValueError:
					# presume that pno cited is not a utility patent and ignore
					continue
				self.patn['rawcites'].append(cited)
		else:
			# only the patent number and file are logged; there is no related node here
			logging.warning('No citations for %d in %s', self.patn['pno'], self.fn)

		return self.patn
Exemple #9
0
	def parseXMLDom(self, dom):
		'''Build a Patent.Patent from one parsed XML patent document.

		dom -- DOM document holding a single patent

		Returns self.patn with isd, isq, apd, apq, uspc, ipc, title and rawcites set,
		plus assignee when the document has an assignees element.  Returns None when
		the document number is not an integer.
		'''
		elmPubRef = dom.getElementsByTagName('publication-reference')[0].getElementsByTagName('document-id')[0]
		try:
			pno = int(elmPubRef.getElementsByTagName('doc-number')[0].childNodes[0].data)
		except ValueError:
			# presume that pno found is not a utility patent and ignore
			return

		self.patn = Patent.Patent(pno)
		isd = elmPubRef.getElementsByTagName('date')[0].childNodes[0].data
		self.patn.isd = datetime.datetime.strptime(isd, "%Y%m%d").date()
		self.patn.isq = Patent.d2q(self.patn.isd)

		elmAppRef = dom.getElementsByTagName('application-reference')[0].getElementsByTagName('document-id')[0]
		apd = elmAppRef.getElementsByTagName('date')[0].childNodes[0].data
		self.patn.apd = datetime.datetime.strptime(apd, "%Y%m%d").date()
		self.patn.apq = Patent.d2q(self.patn.apd)	# NB: may be out of nQuarters range

		uspc = dom.getElementsByTagName('classification-national')[0]
		uspc = uspc.getElementsByTagName('main-classification')[0].childNodes[0].data
		self.patn.uspc = str(uspc.encode('ascii','replace'))

		# they switched from classification-ipc to classification-ipcr at some point, search both
		ipc = (dom.getElementsByTagName('classification-ipc') + dom.getElementsByTagName('classification-ipcr'))[0]
		try:
			# for classification-ipcr which breaks out each part of the IPC
			# section-class-subclass group/subgroup
			# NB: this is not the same format for these as in the DAT files
			ipc = "%s%s%s %s/%s" % tuple([x.childNodes[0].data for x in ipc.childNodes[5:14] if x.nodeType == 1])
		except IndexError:
			# sometimes the main-group is just <main-group/> instead of a real value
			# this is treated as '1' in the online database
			# so that's what we'll do too
			ipc1 = "%s%s%s" % tuple([x.childNodes[0].data for x in ipc.childNodes[5:10] if x.nodeType == 1])
			# use the subgroup's text, not the DOM node object itself; an empty
			# subgroup contributes nothing rather than raising again
			subgroup = ipc.childNodes[13]
			subgroupText = subgroup.childNodes[0].data if subgroup.childNodes else ''
			ipc = ipc1 + " 01/%s" % (subgroupText,)
		except TypeError:
			# for classification-ipc which just gives a single string for each IPC
			ipc = ipc.getElementsByTagName('main-classification')[0].childNodes[0].data
		self.patn.ipc = str(ipc.encode('ascii','replace'))

		elmsTitles = dom.getElementsByTagName('invention-title')[0].childNodes
		self.patn.title = ''
		for node in elmsTitles:
			# sometimes the title has subelements like italic text or what have you
			# sometimes the subelements don't have text at the bottom
			# 7632827, I'm looking at you here
			while node.nodeType != node.TEXT_NODE and node.childNodes:
				node = node.childNodes[0]
			if node.nodeType == node.TEXT_NODE:
				self.patn.title += str(node.data.encode('ascii','replace'))
			else:
				logging.warning('Skipped part of title %d in %s: %s', self.patn.pno, self.fn, node)

		elmAssig = dom.getElementsByTagName('assignees')
		if elmAssig:
			elmAssig = elmAssig[0]
			# sometimes it's an orgname, sometimes first + last, get all
			ass = (elmAssig.getElementsByTagName('orgname') + elmAssig.getElementsByTagName('first-name') + elmAssig.getElementsByTagName('last-name'))
			self.patn.assignee = str(' '.join([x.childNodes[0].data.encode('ascii','replace') for x in ass]))

		# also look for 'us-references-cited' when 'references-cited' is absent,
		# so documents using that element name do not end up with no citations
		elmRefCit = dom.getElementsByTagName('references-cited') or dom.getElementsByTagName('us-references-cited')
		self.patn.rawcites = []
		if elmRefCit:
			for cite in elmRefCit[0].getElementsByTagName('patcit'):
				if cite.getElementsByTagName('country')[0].childNodes[0].data != 'US':
					continue
				try:
					cited = int(cite.getElementsByTagName('doc-number')[0].childNodes[0].data)
				except ValueError:
					# presume that pno cited is not a utility patent and ignore
					continue
				self.patn.rawcites.append(cited)
		return self.patn
Exemple #10
0
    def parseXMLDom(self, dom):
        """Build the record dict for one parsed XML patent document.

        dom -- DOM document holding a single patent

        Returns self.patn, a dict with the keys 'pno', 'isd', 'isq', 'apd',
        'apq', 'uspc', 'ipc', 'title', 'rawcites', 'cites' and 'citedby',
        plus 'assignee' and 'abstract' when the document has those elements.
        Returns None when the document number is not an integer.
        """
        elmPubRef = dom.getElementsByTagName(
            'publication-reference')[0].getElementsByTagName('document-id')[0]
        try:
            pno = int(
                elmPubRef.getElementsByTagName('doc-number')
                [0].childNodes[0].data)
        except ValueError:
            # presume that pno found is not a utility patent and ignore
            return
        self.patn = {
            'rawcites': [],
            'cites': [],
            'citedby': [],
            'pno': pno
        }

        isd = elmPubRef.getElementsByTagName('date')[0].childNodes[0].data
        isd = datetime.datetime.strptime(isd, "%Y%m%d").date()
        # Mongo cannot accept dates, only date+time. Gotta pad isd with time = midnight.
        self.patn['isd'] = datetime.datetime.combine(
            isd, datetime.datetime.min.time())
        self.patn['isq'] = Patent.d2q(self.patn['isd'])

        elmAppRef = dom.getElementsByTagName(
            'application-reference')[0].getElementsByTagName('document-id')[0]
        apd = elmAppRef.getElementsByTagName('date')[0].childNodes[0].data
        apd = datetime.datetime.strptime(apd, "%Y%m%d").date()
        # again, have to pad the date to make it a date+time
        self.patn['apd'] = datetime.datetime.combine(
            apd, datetime.datetime.min.time())
        self.patn['apq'] = Patent.d2q(
            self.patn['apd'])  # NB: may be out of nQuarters range

        uspc = dom.getElementsByTagName('classification-national')[0]
        uspc = uspc.getElementsByTagName(
            'main-classification')[0].childNodes[0].data
        self.patn['uspc'] = str(uspc.encode('ascii', 'replace'))

        # the element is either classification-ipc or classification-ipcr, search both
        ipc = (dom.getElementsByTagName('classification-ipc') +
               dom.getElementsByTagName('classification-ipcr'))[0]
        try:
            # for classification-ipcr which breaks out each part of the IPC
            # section-class-subclass group/subgroup
            # NB: this is not the same format for these as in the DAT files
            ipc = "%s%s%s %s/%s" % tuple([
                x.childNodes[0].data
                for x in ipc.childNodes[5:14] if x.nodeType == 1
            ])
        except IndexError:
            # sometimes the main-group is just <main-group/> instead of a real value
            # this is treated as '1' in the online database
            # so that's what we'll do too
            ipc1 = "%s%s%s" % tuple([
                x.childNodes[0].data
                for x in ipc.childNodes[5:10] if x.nodeType == 1
            ])
            # use the subgroup's text, not the DOM node object itself; an
            # empty subgroup contributes nothing rather than raising again
            subgroup = ipc.childNodes[13]
            subgroupText = subgroup.childNodes[0].data if subgroup.childNodes else ''
            ipc = ipc1 + " 01/%s" % (subgroupText, )
        except TypeError:
            # for classification-ipc which just gives a single string for each IPC
            ipc = ipc.getElementsByTagName(
                'main-classification')[0].childNodes[0].data
        self.patn['ipc'] = str(ipc.encode('ascii', 'replace'))

        elmsTitles = dom.getElementsByTagName('invention-title')[0].childNodes
        self.patn['title'] = ''
        for node in elmsTitles:
            # sometimes the title has subelements like italic text or what have you
            # sometimes the subelements don't have text at the bottom
            # 7632827, I'm looking at you here
            while node.nodeType != node.TEXT_NODE and node.childNodes:
                node = node.childNodes[0]
            if node.nodeType == node.TEXT_NODE:
                self.patn['title'] += str(node.data.encode('ascii', 'replace'))
            else:
                logging.warning('Skipped part of title %d in %s: %s',
                                self.patn['pno'], self.fn, node)

        elmAssig = dom.getElementsByTagName('assignees')
        if elmAssig:
            elmAssig = elmAssig[0]
            # sometimes it's an orgname, sometimes first + last, get all
            ass = (elmAssig.getElementsByTagName('orgname') +
                   elmAssig.getElementsByTagName('first-name') +
                   elmAssig.getElementsByTagName('last-name'))
            self.patn['assignee'] = str(' '.join([
                x.childNodes[0].data.encode('ascii', 'replace') for x in ass
            ]))

        # DB: I wrote this part to load the abstracts, copying the elmAssig codeblock above
        elmAbstract = dom.getElementsByTagName('abstract')
        if elmAbstract:
            pgraphs = elmAbstract[0].getElementsByTagName('p')
            # the paragraphs are flattened by self.handleTok rather than by
            # reading p.childNodes[0].data, which failed with
            # "Element instance has no attribute 'data'"
            # see https://mail.python.org/pipermail/tutor/2004-July/030397.html
            self.patn['abstract'] = str(
                self.handleTok(pgraphs).encode('ascii', 'replace'))

        elmRefCit = dom.getElementsByTagName('references-cited')
        if not elmRefCit:
            # the data changes citation field-name convention between '05 and '14.
            elmRefCit = dom.getElementsByTagName('us-references-cited')

        if elmRefCit:
            for cite in elmRefCit[0].getElementsByTagName('patcit'):
                if cite.getElementsByTagName(
                        'country')[0].childNodes[0].data != 'US':
                    continue
                try:
                    cited = int(
                        cite.getElementsByTagName('doc-number')
                        [0].childNodes[0].data)
                except ValueError:
                    # presume that pno cited is not a utility patent and ignore
                    continue
                self.patn['rawcites'].append(cited)
        else:
            # only the patent number and file are logged; there is no
            # related node here
            logging.warning('No citations for %d in %s', self.patn['pno'],
                            self.fn)

        return self.patn