def split_xml(fulldoc, max_iter=-1):
    # Split a bulk USPTO file into individual patent documents, appending
    # each document (as a list of lines) to the module-level xmldocs list.
    xml = []
    lnum = 1
    n_iter = 1
    print 'Splitting xml, please wait...'
    found = False
    enclosing = tags.getEnclosing()
    for line in fulldoc:
        if line.find(formatTag(enclosing)[:-1]) >= 0:
            found = True
        if found:
            # Sometimes there is data outside the ipa_enclosing tags which
            # messes up the parser, so only collect lines once the opening
            # tag has been seen.
            xml.append(line)
        if line.strip().find(formatTag(enclosing, True)) >= 0:
            found = False
            # Clone the list and append it to xmldocs
            xmldocs.append(list(xml))
            # Write to file (commented out; for debugging purposes)
            #f = open(getwd() + '/output.csv', 'a')
            #f.write(''.join(xml))
            patutil.print_over("\rSplit %d on line %d ..." % (n_iter, lnum))
            n_iter += 1
            xml = []
            if max_iter >= 0 and n_iter > max_iter:
                break
        lnum += 1
    print 'Done with length %d.' % len(xmldocs)
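# Usage sketch for split_xml(). Illustrative only: the file name below is
# hypothetical, and it assumes callers are expected to manage the
# module-level xmldocs list between runs.
def _demo_split_xml(path='ipa140102.xml'):
    # Reset the global accumulator before splitting (assumption).
    del xmldocs[:]
    with open(path) as fulldoc:
        # Cap at 5 documents so a debug run finishes quickly.
        split_xml(fulldoc, max_iter=5)
    print 'Collected %d documents.' % len(xmldocs)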
def parse_xml(soup, tag):
    global tags
    finaltag = None  # The tag object formatted or returned at the end of the scrape
    result = 'None'
    patutil.print_over('\rScraping tag %s.' % (tag))
    # (Re)set subsoup to the top of the xml tree
    enclosing = tags.getEnclosing()
    subsoup = soup.find(enclosing)
    tagtree = tag.split('/')
    for i in xrange(0, len(tagtree)):
        if subsoup is None:
            # A parent element in the tag tree is missing; give up on this tag.
            result = 'None'
            break
        elif i < len(tagtree) - 1:
            # Not yet at the end of the tree, so descend one level.
            subsoup = subsoup.find(tagtree[i])
        else:
            # At the end of the tree (or the tree only has one element).
            finaltag = subsoup.find(tagtree[i])
            result = tagString(finaltag)
    # The branches below assume that the ipa and ipg variables being worked
    # with are the same.
    # Special formatting for the inventors tag
    if tag == tags.ipa_inventors:
        templist = []
        if finaltag is not None:
            for name in finaltag.find_all('addressbook'):
                templist.append('[')
                # Only append the name parts that are present (first-name,
                # middle-name, last-name).
                fname = name.find('first-name')
                mname = name.find('middle-name')
                lname = name.find('last-name')
                if fname is not None:
                    templist.append(fname.string)
                if mname is not None:
                    templist.append(' ' + mname.string)
                templist.append(' ')
                if lname is not None:
                    templist.append(lname.string)
                templist.append(']')
        result = ''.join(templist)
    elif tag == tags.ipa_assignee:
        templist = []
        if finaltag is not None:
            for name in finaltag.find_all('addressbook'):
                orgname = name.find('orgname')
                if orgname is not None:
                    templist.append('[' + orgname.string + ']')
        result = ''.join(templist)
    return unicode(result)
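# Usage sketch for parse_xml(). The slash-separated tag path below is
# hypothetical; real paths come from tags.getTags(year). Assumes one split
# document is already available as a list of lines in xmldocs[0].
def _demo_parse_xml():
    soup = BeautifulSoup(''.join(xmldocs[0]), ["lxml", "xml"])
    # Slash-separated paths descend the tree one find() at a time.
    title = parse_xml(soup, 'us-bibliographic-data-application/invention-title')
    print 'Title: %s' % title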
def reporthook(count, block_size, total_size):
    # Progress callback matching the urllib.urlretrieve() reporthook signature.
    global start_time
    if count == 0:
        start_time = time.time()
        return
    # Guard against a zero duration on a very fast first callback.
    duration = max(time.time() - start_time, 1e-6)
    progress_size = count * block_size
    speed = int(progress_size / (1024 * duration))
    percent = int(count * block_size * 100 / total_size)
    patutil.print_over("\rProgress: %d%%, %.2f of %.2f MB, %d KB/s, %d seconds passed"
                       % (percent, float(progress_size) / (1024 * 1024),
                          float(total_size) / (1024 * 1024), speed, duration))
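# Usage sketch for reporthook(): urllib.urlretrieve() calls it with
# (block count, block size, total size) as the download progresses. The
# URL and destination file name below are hypothetical.
def _demo_reporthook(url='http://example.com/ipa140102.zip'):
    import urllib
    urllib.urlretrieve(url, 'ipa140102.zip', reporthook)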
def scrape(xmllist, nonsf_flag=False):
    patutil.print_over("\rScraping %s of %s." % (xmliteration + 1, len(xmldocs)))
    # Get the government interest field and look for NSF or the National
    # Science Foundation.
    if get_govt_interest(xmllist):
        print 'Found NSF reference, adding to CSV. <!!!!!!!!!!!'
    elif not nonsf_flag:
        return
    # Create a single string from the xml list produced by split_xml().
    xml = ''.join(xmllist)
    if patutil.cmd_args['dump_flag']:
        patutil.dump_xml(xml, str(xmliteration) + '.xml')
    # Begin the parse
    soup = BeautifulSoup(xml, ["lxml", "xml"])
    datalist = []
    global tags
    for tag in tags.getTags(year):
        # Non-bs4 parsing for tags given as literal '<?...>' markup
        if tag[0:2] == '<?':
            # Split into start and end tags
            split = tag.find('>') + 1
            tagpair = (tag[0:split], tag[split:])
            strfind_result = strfind_tag(tagpair[0], tagpair[1], xmllist)
            # Hack: alternate way to get the cross reference in patent grants
            # from the description element (not sure if the ruby parser's
            # xpath is being interpreted correctly).
            if tag != tags.ipg_crossref or strfind_result is not None:
                datalist.append([tag, strfind_result])
            else:
                desc = soup.find('description')
                if re.sub('[^a-z]', '', desc.find('heading').string.lower()).find('crossref') >= 0:
                    datalist.append([tag, desc.string])
                else:
                    datalist.append([tag, 'None'])
        else:
            datalist.append([tag, parse_xml(soup, tag)])
    return datalist
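# Usage sketch for scrape(): a driver loop over the documents produced by
# split_xml(). Assumes xmliteration is the module-level counter used in
# scrape()'s progress message; actual CSV writing is elided here.
def _demo_scrape_all(nonsf_flag=False):
    global xmliteration
    for xmliteration in xrange(len(xmldocs)):
        datalist = scrape(xmldocs[xmliteration], nonsf_flag)
        # scrape() returns None for skipped non-NSF documents.
        if datalist is not None:
            pass  # e.g. hand datalist to a csv.writer here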