def __init__(self, path):
    """
    Build this chapter from the Excel-exported HTML file at ``path``.

    self.unit - the unit to which this Chapter belongs (e.g., 'Pathways &
        Advance Engineering'); taken from the name of the directory
        containing the file
    self.num, self.chapter - whatever getChapterInfo returns for the file name
    self.data - TabData instances for each topic (the 'Cover sheet' tab
        is left out)

    Raises Exception when no x:ExcelWorkbook element is found in the file.
    """
    self.data = []
    html = utils.getHtml(path)
    parentDir, filename = os.path.split(path)
    self.unit = os.path.basename(parentDir)
    self.num, self.chapter = self.getChapterInfo(filename)

    match = RegExUtils.getTagPattern('x:ExcelWorkbook').search(html)
    if match is None:
        raise Exception("could not get TABS data from file (%s)" % path)
    print('found data')

    # strip x prefix from all elements
    workbookXml = match.group(0).replace('x:', '')
    rec = XmlRecord(xml=workbookXml)
    rec.xpath_delimiter = '/'
    tabNodes = rec.selectNodes(
        rec.dom, "ExcelWorkbook/ExcelWorksheets/ExcelWorksheet")

    print('creating %d tabs' % len(tabNodes))
    for tabElement in tabNodes:
        tabData = TabData(tabElement, self.unit)
        # we ignore the 'Cover sheet'
        if tabData.name.lower() == 'cover sheet':
            continue
        # tabs are numbered from 1 in the order they are kept
        tabData.num = len(self) + 1
        self.append(tabData)
def initializeFromBaseMappings(self):
    """
    Load drNumber -> recordID pairs from output/dr_2_recId_mappings.xml
    into self, then report how many entries self holds.
    """
    mappingsDoc = XmlRecord(path="output/dr_2_recId_mappings.xml")
    for el in mappingsDoc.selectNodes(mappingsDoc.dom,
                                      'dr_2_recId_mappings:mapping'):
        self[el.getAttribute('drNumber')] = el.getAttribute('recordID')
    print('%d base mappings found' % len(self))
def initializeFromBaseMappingsBOG(self):
    """
    Load drNumber -> queryString pairs from input/accessionNumberMappings.xml
    into self, then report how many entries self holds.
    """
    mappingsDoc = XmlRecord(path="input/accessionNumberMappings.xml")
    for el in mappingsDoc.selectNodes(mappingsDoc.dom,
                                      'accessionNumberMappings:mapping'):
        self[el.getAttribute('drNumber')] = el.getAttribute('queryString')
    print('%d base mappings found' % len(self))
def __init__(self):
    """
    Populate self with drNumber -> queryString entries read from
    output/FINAL-accessionNumberMappings.xml.
    """
    UserDict.__init__(self)
    doc = XmlRecord('output/FINAL-accessionNumberMappings.xml')
    mappingEls = doc.selectNodes(doc.dom, 'accessionNumberMappings:mapping')
    print('%d mappings found' % len(mappingEls))
    for el in mappingEls:
        self[el.getAttribute("drNumber")] = el.getAttribute("queryString")
def __init__(self, path="output/MetadataModifySpecs.xml"):
    """
    Read the change specs stored at ``path`` and apply each one.

    Every 'changeSpecs:pubNameSpec' element is wrapped in a ChangeSpec,
    printed, and handed to self.updateMetadata.

    Raises IOError when ``path`` does not exist.
    """
    UserList.__init__(self)
    if not os.path.exists(path):
        raise IOError("output does not exist at %s" % path)

    specsDoc = XmlRecord(path=path)
    specNodes = specsDoc.selectNodes(specsDoc.dom, "changeSpecs:pubNameSpec")
    print("%d specs found" % len(specNodes))
    for specNode in specNodes:
        spec = ChangeSpec(specNode)
        print(spec)
        self.updateMetadata(spec)
def __init__(self):
    """
    Fill self with url -> id entries from the id cache.

    The cache is read from idCacheFile when that file exists; otherwise
    the record returned by self.getBlankRec() is used instead.
    Also switches NsdlSearcher.verbose off.
    """
    self.data = {}
    NsdlSearcher.verbose = False

    if not os.path.exists(idCacheFile):
        rec = self.getBlankRec()
    else:
        rec = XmlRecord(path=idCacheFile)

    for entry in rec.selectNodes(rec.dom, 'idCache:entry'):
        self[entry.getAttribute('url')] = entry.getAttribute('id')
def getResourceIds(path):
    """
    Return the ids of the saved resources in the play list file at ``path``.

    Only 'playList:items:item' elements whose type attribute is
    'ccs_saved_resource' are considered, and ids starting with 'CCS'
    are dropped from the result.
    """
    rec = XmlRecord(path=path)
    collected = []
    for item in rec.selectNodes(rec.dom, 'playList:items:item'):
        if item.getAttribute('type') != 'ccs_saved_resource':
            continue
        collected.append(XmlUtils.getText(XmlUtils.getChild('id', item)))
    return [resId for resId in collected if not resId.startswith('CCS')]
def getResults(self, params):
    """
    Run a query against the web service and wrap each result element.

    params - passed through to self.getData as the request parameters

    Returns a list of OSWSResult instances, one for each
    'OpenSkyWebService:Search:results:result' element of the response.
    Returns None, after printing the error, when self.getData fails.

    Raises Exception carrying the text of the response's
    'OpenSkyWebService:error' element when the response has one.
    """
    try:
        data = self.getData(params=params)
    except Exception:
        # Best effort: report the failure and give up on this query.
        # Only Exception is caught so that KeyboardInterrupt and
        # SystemExit are no longer swallowed here.
        print('ERROR: %s' % sys.exc_info()[1])
        return None

    response = XmlRecord(xml=data)

    # the service reports failures inside an otherwise well-formed response
    if response.selectNodes(response.dom, 'OpenSkyWebService:error'):
        raise Exception(response.getTextAtPath('OpenSkyWebService:error'))

    results_path = 'OpenSkyWebService:Search:results:result'
    results_els = response.selectNodes(response.dom, results_path)
    print('%d result elements found' % len(results_els))
    return [OSWSResult(el) for el in results_els]
def __init__(self, path):
    """
    Read the gathered-ids file at ``path`` and group its standard ids
    by document.

    path - location of the XML file holding 'GatheredIds/id' elements

    After construction self maps each docId attribute value to the list of
    stdId attribute values found on the elements carrying that docId, in
    the order selectNodes returned them. An AsnResolutionClient is also
    created and stored as self.asnResolutionClient.
    """
    self.data = {}
    # The original passed an undefined name ('data') here and never used
    # the path argument; the record has to be read from path.
    rec = XmlRecord(path=path)
    rec.xpath_delimiter = "/"
    nodes = rec.selectNodes(rec.dom, 'GatheredIds/id')
    self.asnResolutionClient = AsnResolutionClient()
    print("%d nodes found" % len(nodes))
    for node in nodes:
        stdId = node.getAttribute("stdId")
        docId = node.getAttribute("docId")
        # extend the list already stored for this document, if any
        stdIds = self[docId] if docId in self else []
        stdIds.append(stdId)
        self[docId] = stdIds
class ComparisonManager(UserDict):
    """
    reads cached comparison info from disk

    Maps each DupGroup's key to the DupGroup built from a 'dupGroups:group'
    element of <grouping_data_dir>/<grouping>Map.xml.

    grouping_data_dir - directory holding the cached grouping files
    max_dups - at most this many group elements are ingested
    """
    grouping_data_dir = 'grouping_data'
    max_dups = 5000

    def __init__(self, grouping):
        """
        grouping - name of the grouping; selects the file
            <grouping_data_dir>/<grouping>Map.xml
        """
        self.data = {}
        path = os.path.join(self.grouping_data_dir, grouping + 'Map.xml')
        self.rec = XmlRecord(path=path)
        # Set here as well as in writeListingHtml, so writeComparisonPages
        # no longer fails with AttributeError when it is called first.
        self.name = os.path.splitext(os.path.basename(path))[0]
        groupNodes = self.rec.selectNodes(self.rec.dom, 'dupGroups:group')
        print('%d dup nodes found' % len(groupNodes))
        for groupNode in groupNodes[:self.max_dups]:
            dupGroup = DupGroup(groupNode)
            self[dupGroup.key] = dupGroup
        print('comparisonManager ingested %d dupGroups' % len(self.keys()))

    def writeListingHtml(self):
        """
        create an html document that shows the groups and provides access
        to side-by-side display

        self.name is (re)derived from the data file's base name.
        """
        datapath = self.rec.path
        root, ext = os.path.splitext(os.path.basename(datapath))
        self.name = root
        htmlDoc = DuplicateGroupListingHTML(self, self.name)
        htmlDoc.write()

    def writeComparisonPages(self):
        """
        write one comparison page per group to html/<name>_data/<groupNum>.html,
        each linking back to the listing page for its group
        """
        baseDir = os.path.join('html', self.name + '_data')
        if not os.path.exists(baseDir):
            # makedirs also creates 'html' itself when it is missing
            os.makedirs(baseDir)
        for key in self.keys():
            dupGroup = self[key]
            returnUrl = '../%s.html?groupNum=%s' % (self.name, dupGroup.groupNum)
            compareHtml = RecordCompareHtml(dupGroup, self.name, returnUrl)
            compareHtml.write(os.path.join(baseDir, dupGroup.groupNum + '.html'))

    def keys(self):
        """
        return the keys as a sorted list
        """
        return sorted(self.data.keys())
def __init__(self):
    """
    Index the cached jurisdiction files by topic.

    Every .xml file in <cacheBase>/jurisdictions is read; for each
    'AsnDocuments:asnDocument' element an AsnInfo is built and a deep
    clone of its element is added to the list stored under its topic.
    """
    self.data = {}
    self.jurisCache = os.path.join(self.cacheBase, 'jurisdictions')
    self.topicCache = os.path.join(self.cacheBase, 'topics')

    xmlNames = [name for name in os.listdir(self.jurisCache)
                if name.endswith('.xml')]
    for xmlName in xmlNames:
        rec = XmlRecord(path=os.path.join(self.jurisCache, xmlName))
        docNodes = rec.selectNodes(rec.dom, 'AsnDocuments:asnDocument')
        asnInfos = [AsnInfo(docNode) for docNode in docNodes]
        for asnInfo in asnInfos:
            topic = asnInfo.topic
            if self.has_key(topic):
                clones = self[topic]
            else:
                clones = []
            clones.append(asnInfo.element.cloneNode(True))
            self[topic] = clones
class RecordDataReader(SortedDict):
    """
    Reads the data in the collection-data file (data_path) and stores a
    RecordInfo under its recId for every record that acceptFn accepts
    """
    data_path = 'not-fy10-records.xml'

    def __init__(self, acceptFn=None):
        """
        acceptFn - optional predicate called with each RecordInfo; when it
            is not given, every record is accepted
        """
        self.acceptFn = self.acceptAll if acceptFn is None else acceptFn
        self.read()

    def read(self):
        """
        (re)load self from data_path, printing progress every 500 records
        """
        self.data = {}
        self.data_rec = XmlRecord(path=self.data_path)
        self.data_rec.xpath_delimiter = "/"
        recNodes = self.data_rec.selectNodes(self.data_rec.dom,
                                             'not-fy10-records/record')
        total = len(recNodes)
        print('%d records read' % total)
        for count, recNode in enumerate(recNodes, 1):
            recInfo = RecordInfo(recNode)
            if count % 500 == 0:
                print("%d/%d" % (count, total))
            if self.acceptFn(recInfo):
                self.addRecord(recInfo)

    def acceptAll(self, recInfo):
        """
        default predicate: accept every record
        """
        return 1

    def acceptFy0809OFF(self, recInfo):
        """
        accept only records with fiscalYear of 2008 or 2009
        """
        return recInfo.fiscalYear in ('2008', '2009')

    def addRecord(self, recInfo):
        """
        store recInfo under its recId
        """
        self[recInfo.recId] = recInfo
class CollectionInfo(UserList):
    """
    List of RecordInfo instances, one per 'collectionInfo:rec' element of
    the meta-metadata file <baseDir>/<collection>.xml
    """
    # baseDir = "meta-metadata"
    baseDir = '/home/ostwald/python-lib/ncar_lib/dups/data/meta-metadata'

    def __init__(self, collection):
        """
        collection - name of the collection; selects the file to read
        """
        UserList.__init__(self)
        self.collection = collection
        self.dataPath = os.path.join(self.baseDir, collection + '.xml')
        print("%s %s" % ("DATA_PATH: ", self.dataPath))
        self.rec = XmlRecord(path=self.dataPath)
        nodes = self.rec.selectNodes(self.rec.dom, "collectionInfo:rec")
        print("%d recs read from meta-metadata" % len(nodes))
        # build every RecordInfo before adding any of them to self
        recInfos = [RecordInfo(node) for node in nodes]
        for recInfo in recInfos:
            self.append(recInfo)

    def selectByUnionDate(self, unionDate):
        """
        takes union date (e.g., 2011, 2011-02, 2011-02-25) and returns
        the recs whose timeStamp is at or after that date
        (as converted by unionDateToSecs)
        """
        threshold = unionDateToSecs(unionDate)

        def touchedSince(recInfo):
            return recInfo.timeStamp >= threshold

        return self.select(touchedSince)

    def select(self, predicate):
        """
        applies predicate to each item and returns a list of only those
        for which predicate is true
        """
        return [item for item in self.data if predicate(item)]

    def write(self, path=None):
        """
        hands path to self.rec.write unchanged (per the original note,
        self.rec writes to self.rec.path by default)
        """
        self.rec.write(path)
def getDrNumbers(self):
    """
    Return the drNumber attribute of every element that
    self.mappingsXpath selects in the file at self.path.
    """
    rec = XmlRecord(path=self.path)
    mappingEls = rec.selectNodes(rec.dom, self.mappingsXpath)
    print('%d mappings found' % len(mappingEls))
    return [el.getAttribute("drNumber") for el in mappingEls]