Example #1
def split_xml(fulldoc, max_iter=(-1)):
    xml = []
    lnum = 1
    n_iter = 1
    print max_iter  # debug: echo the iteration cap
    print 'Splitting xml, please wait...'
    
    found = False
    for line in fulldoc:
        enclosing = tags.getEnclosing()
        if line.find(formatTag(enclosing)[:-1]) >= 0:
            found = True

        if found: # Sometimes there is data outside the ipa_enclosing tags which messes up the parser.
            xml.append(line)

        if line.strip().find(formatTag(enclosing, True)) >= 0:
            found = False
            # Clone the list and append it to the module-level xmldocs list
            xmldocs.append(list(xml))
            # Write to file (should be commented out; for debugging purposes only)
            #f = open(getwd() + '/output.csv', 'a') 
            #f.write(''.join(xml))
            patutil.print_over("\rSplit %d on line %d ..." % (n_iter, lnum))
            n_iter += 1  # increment after printing so the reported count is accurate
            xml = []
            if max_iter >= 0 and n_iter > max_iter:
                break

        lnum += 1
            
    print 'Done with length %d.' % len(xmldocs)
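The function above leans on module-level state from the surrounding project (tags, xmldocs, patutil). For reference, here is a minimal self-contained sketch of the same splitting technique, with the enclosing tag passed in explicitly; the name split_xml_sketch and the default tag name are illustrative assumptions, not taken from the project:

def split_xml_sketch(lines, enclosing='us-patent-application', max_docs=-1):
    """Split a concatenated XML dump into one list of lines per document."""
    docs = []
    current = []
    inside = False
    for line in lines:
        if line.find('<' + enclosing) >= 0:  # an opening tag starts a new document
            inside = True
        if inside:  # skip any junk that sits between documents
            current.append(line)
        if line.strip().find('</' + enclosing + '>') >= 0:
            inside = False
            docs.append(current)  # keep the finished document
            current = []
            if max_docs >= 0 and len(docs) >= max_docs:
                break
    return docs

Passing an open file object as lines streams the input line by line, so memory stays bounded at roughly one document at a time.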
Example #2
def parse_xml(soup, tag):
    global tags
    finaltag = None  # The tag object that will be printed or returned at the end of the scrape
    result = 'None'
    patutil.print_over('\rScraping tag %s.' % (tag))
    # (Re)sets subsoup to the top of the xml tree
    enclosing = tags.getEnclosing() 
    subsoup = soup.find(enclosing)
    tagtree = tag.split('/')
    #print 'tagtree length:', len(tagtree)
    for i in xrange(0, len(tagtree)):
        if subsoup is None:
            #print 'WARNING: \'' + tagtree[i - 1] + '\' is none in tag tree:', ', '.join(tagtree)
            result = 'None'
            break

        elif i < len(tagtree) - 1: # If not at the end of the tree
            subsoup = subsoup.find(tagtree[i])

        else: # If at the end of the tree (or if the tree only has one element)
            finaltag = subsoup.find(tagtree[i])
            result = tagString(finaltag)

            # The blocks below assume the ipa_* and ipg_* tag variables being worked with are the same
            # Add special formatting for inventors tag
            if tag == tags.ipa_inventors:
                templist = []
                if finaltag is not None:
                    for name in finaltag.find_all('addressbook'):
                        #print name
                        templist.append('[')
                        # Only append if tag contains name (first-name), (last-name), etc.
                        fname = name.find('first-name')
                        mname = name.find('middle-name')
                        lname = name.find('last-name')
                        if fname is not None:
                            templist.append(fname.string)
                            if mname is not None:
                                templist.append(' ' + mname.string)
                            templist.append(' ')
                        if lname is not None:
                            templist.append(lname.string)
                        templist.append(']')
                
                    result = ''.join(templist)
            elif tag == tags.ipa_assignee:
                templist = []
                if finaltag is not None:
                    for name in finaltag.find_all('addressbook'):
                        orgname = name.find('orgname')
                        if orgname is not None:
                            templist.append('[' + orgname.string + ']')
                
                    result = ''.join(templist)
    #print type(result), result
    return unicode(result)
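parse_xml descends a '/'-separated tag path by chaining BeautifulSoup find() calls. Stripped of the project globals and the inventor/assignee special cases, the core path walk looks roughly like this sketch, where find_by_path and the sample XML are invented for illustration:

from bs4 import BeautifulSoup

def find_by_path(soup, root, path):
    """Follow a '/'-separated tag path from the root element and return
    the final tag's string, or 'None' if any step is missing."""
    node = soup.find(root)
    for name in path.split('/'):
        if node is None:
            return 'None'
        node = node.find(name)
    return node.string if node is not None else 'None'

xml = '<doc><biblio><title>Widget</title></biblio></doc>'
soup = BeautifulSoup(xml, 'xml')
print(find_by_path(soup, 'doc', 'biblio/title'))  # Widget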
Example #3
def reporthook(count, block_size, total_size):
    global start_time
    if count == 0:
        start_time = time.time()
        return
    duration = max(time.time() - start_time, 1e-6)  # avoid division by zero on a fast first block
    progress_size = count * block_size
    speed = int(progress_size / (1024 * duration))
    percent = int(count * block_size * 100 / total_size)
    patutil.print_over("\rProgress: %d%%, %.2f / %.2f MB, %d KB/s, %d seconds passed" % (percent, float(progress_size) / (1024 * 1024), float(total_size) / (1024 * 1024), speed, duration))
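The (count, block_size, total_size) signature matches the reporthook callback of urllib.urlretrieve (urllib.request.urlretrieve on Python 3), which also explains the count == 0 branch: urlretrieve fires the hook once, with a block count of zero, when the connection is established and before any data arrives. Presumed wiring, with a placeholder URL and filename:

import urllib  # Python 2; on Python 3 use urllib.request instead

urllib.urlretrieve('http://example.com/bulk-patents.zip',
                   'bulk-patents.zip', reporthook=reporthook)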
Example #4
def scrape(xmllist, nonsf_flag=False):
    patutil.print_over("\rScraping %s of %s." % (xmliteration + 1, len(xmldocs)))
    
    # Get the government interest field and look for NSF or "national science foundation"
    if get_govt_interest(xmllist):
        print 'Found NSF reference, adding to CSV. <!!!!!!!!!!!'
    elif not nonsf_flag:
        return
    # Create a string from the single xml list produced by split_xml()
    xml = ''.join(xmllist)

    if patutil.cmd_args['dump_flag']:
        patutil.dump_xml(xml, str(xmliteration) + '.xml')

    # Begin the parse
    soup = BeautifulSoup(xml, ["lxml", "xml"])
    datalist = []

    global tags
    for tag in tags.getTags(year):
        # Non-bs4 parsing: this tag is stored as a literal string starting with '<?'
        if tag[0:2] == '<?':
            # Split start and end tags
            split = tag.find('>') + 1
            tagpair = (tag[0:split], tag[split:])
            strfind_result = strfind_tag(tagpair[0], tagpair[1], xmllist)

            # Hack: alternate way to get the cross-reference in patent grants from the
            # description element (not sure the Ruby parser's XPath is being interpreted correctly)
            if tag != tags.ipg_crossref or strfind_result is not None:
                datalist.append([tag, strfind_result])
            else:
                desc = soup.find('description')
                if re.sub('[^a-z]', '', desc.find('heading').string.lower()).find('crossref') >= 0:
                    text = desc.string
                    print text  # debug
                    datalist.append([tag, text])
                else:
                    datalist.append([tag, 'None'])
        else:
            datalist.append([tag, parse_xml(soup, tag)])

    return datalist
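strfind_tag is not defined in these examples; judging from the call site, it takes a literal start tag, a literal end tag, and the raw line list, and returns None when the tag is absent. A plausible sketch under those assumptions (not the project's actual helper):

def strfind_tag(start_tag, end_tag, lines):
    """Return the text between the first literal start/end tag pair,
    or None if the pair is not found."""
    text = ''.join(lines)
    start = text.find(start_tag)
    if start < 0:
        return None
    start += len(start_tag)
    end = text.find(end_tag, start)
    if end < 0:
        return None
    return text[start:end]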