Exemple #1
0
    def run(self, oxtFile):
        """Check the extracted OXT tree, zip it and add the toolbar XBA files.

        Each name in ``expectedFiles`` is looked up below ``oxtExtract`` and a
        message is printed for every one that is missing.  ``oxtExtract`` is
        then zipped into ``oxtFile``.  Finally, for every
        ``toolbarTextFiles`` entry, the text file is read, placed in the root
        node of an ``xbaXmlWrapper`` document and the serialized document is
        added to the zip under the mapped XBA file name.

        NOTE(review): ``expectedFiles``, ``oxtExtract``, ``toolbarTextFiles``
        and ``xbaXmlWrapper`` are not defined in this method; presumably they
        are module-level names -- confirm.

        oxtFile -- path of the OXT archive that is written/extended.

        Returns None.  Processing stops (after printing a message) at the
        first toolbar text file that cannot be read.
        """
        for name in expectedFiles:
            path = oxtExtract + "/" + name
            if not self.__fs.exists(path):
                print("Error: %s was not found." % path)

        if not self.__fs.exists(oxtFile):
            print("Warning: %s not found. created instead." % oxtFile)
        self.__fs.zip(oxtFile, oxtExtract)

        for toolbarTextFile, toolbarXbaFile in toolbarTextFiles.items():
            toolbarText = self.__fs.readFile(toolbarTextFile)
            if toolbarText is None:
                print("File '%s' not found!"
                      % self.__fs.absolutePath(toolbarTextFile))
                return
            # Characters that cannot be encoded would break the code in the
            # toolbar installer, so when encoding fails every character above
            # 127 is replaced by a numeric character reference instead.
            try:
                toolbarText = toolbarText.encode("utf-8")
            except Exception as e:
                print("fileName :  %s" % (toolbarTextFile,))
                print("error in string:  %s" % str(e))
                escaped = []
                for c in toolbarText:
                    if ord(c) > 127:
                        print("error char :  %s" % (c,))
                        escaped.append("&#%s;" % ord(c))
                    else:
                        escaped.append(c)
                # Join once instead of growing a string per character.
                toolbarText = "".join(escaped)
            xml = xml_util.xml(xbaXmlWrapper)
            xml.getRootNode().setContent(toolbarText)
            xmlStr = str(xml)
            xml.close()
            self.__fs.addToZipFile(oxtFile, toolbarXbaFile, xmlStr)
def test_no_language():
    """Check the 008 tag createTag builds for test archive item 1's marc.xml.

    The expected value is today's test date (getTestDate) followed by a fixed
    suffix ending in the language code "eng".
    """
    marcDoc = xml_util.xml(
        "create_marc_controlfield_tag_008_test_archive/1/marc.xml")
    actual = createTag(marcDoc)
    expected = getTestDate() + "t2002|||||||                 eng|d"
    assert actual == expected
def obtain_files(dspace_archive, filename):
    """Download the files referenced by each archive item's metadata file.

    For every item of the DSpace archive at ``dspace_archive`` the XML file
    ``filename`` in the item's directory is searched for text nodes that
    contain 'http'.  Each such text is split on ";" and its second field is
    used as the URL.  The URL is cleaned with cleanUrl, fetched with getFile
    and, when content comes back, stored on the item as a "bundle:ORIGINAL"
    stream named after the last "/"-separated segment of the URL.

    dspace_archive -- path of the archive; nothing happens if it is missing.
    filename -- name of the XML file inside each item directory.

    Returns None.
    """
    if not os.path.exists(dspace_archive):
        return
    archive = dspaceArchive(dspace_archive)
    # loop through items
    for item in archive.items:
        print("Processing Item " + item.name)
        fullpath = os.path.join(item.dir, filename)
        xml = xml_util.xml(fullpath)
        try:
            # match all urls
            nodes = xml.getNodes("//text()[contains(.,'http')]")
            # qualify urls: the url is the second ";"-separated field
            fileList = []
            for node in nodes:
                fields = node.content.split(";")
                if len(fields) < 2:
                    # No ";" in this text, so there is no url field to take;
                    # skip it instead of failing the whole harvest.
                    print("Skipping text without ';' separator: "
                          + node.content)
                    continue
                fileList.append(fields[1])
            # get urls
            for urlItem in fileList:
                harvestUrl = cleanUrl(urlItem)
                print(urlItem)
                content = getFile(harvestUrl)
                if content is None:
                    print("no content")
                    continue
                harvestedFile = urlItem.rsplit("/", 1)[-1]
                item.newStream(harvestedFile, "bundle:ORIGINAL", content)
        finally:
            # Release the document even when a download or store step fails.
            xml.close()
def fetchXml(htmlFile):
    """Build a collection document from the <b> elements of ``htmlFile``.

    ``htmlFile`` is loaded with xml_util.xml.  For every node matched by
    "//b" a new element is created from the repr() of the node's content and
    the repr() of its serialized next sibling (the two arguments handed to
    createElement -- presumably element name and content, confirm against
    xml_util) and appended to a fresh document.  The serialized result is
    also written to "temp.xml" in the current working directory.

    htmlFile -- whatever xml_util.xml accepts as a document source.

    Returns the new xml_util.xml document; it is left open for the caller.
    """
    # NOTE(review): the <collection> element is never closed in this seed
    # string; confirm xml_util.xml accepts it as intended.
    createdXml = """<?xml version="1.0"?><collection>"""
    parsedXml = xml_util.xml(createdXml)

    webPageXml = xml_util.xml(htmlFile)
    try:
        nodes = webPageXml.getNodes("//b")
        print("********************************")
        for node in nodes:
            newElement = parsedXml.createElement(
                repr(node.getContent()),
                repr(node.getNextSibling().serialize()))
            parsedXml.addChild(newElement)
    finally:
        # Only strings were copied out of the source document, so it can be
        # released as soon as the loop is done.
        webPageXml.close()

    # Close the output file deterministically so the data is flushed.
    with open("temp.xml", "w") as out:
        out.write(parsedXml.serialize())
    return parsedXml
Exemple #5
0
def test_htmlToXML():
    """htmlToXML of thompson-index.html must equal meta_lowercase.xml.

    Equality is decided by diff_util.sameXml.
    """
    with open("xml_data/meta_lowercase.xml", 'r') as targetFile:
        target = targetFile.read()
    with open("html_data/thompson-index.html", 'r') as htmlFile:
        html = htmlFile.read()
    # Kept from the original: the expected document is loaded once before the
    # comparison (presumably a fixture sanity check); it is now released.
    xml_util.xml(target).close()
    result = htmlToXML(html)
    assert diff_util.sameXml(target, result)
Exemple #6
0
def test_bodyHtmlToXML():
    """bodyHtmlToXML of thompson-index.html must equal body_item.xml.

    Equality is decided by diff_util.sameXml.
    """
    with open("xml_data/body_item.xml", 'r') as targetBodyFile:
        targetBody = targetBodyFile.read()
    with open("html_data/thompson-index.html", 'r') as htmlFile2:
        bodyHtml = htmlFile2.read()
    # Kept from the original: the expected document is loaded once before the
    # comparison (presumably a fixture sanity check); it is now released.
    xml_util.xml(targetBody).close()
    resultBody = bodyHtmlToXML(bodyHtml)
    assert diff_util.sameXml(targetBody, resultBody)
def test_little_language():
    """getLanguage on a one-letter 041$a value, space-padded to 3, is "f  "."""
    datafield = xml_util.xml(
        '<datafield tag="041" ind1=" " ind2=" "><subfield code="a">f</subfield></datafield>'
    )
    # make it 3 characters long
    padded = getLanguage(datafield).ljust(3)
    assert padded == "f  "
def getMissingDatastreams(file, datastreamId):
    """Return the datastream nodes of ``file`` whose ID is ``datastreamId``.

    The xpath that is used and every matching node are printed.  Note that,
    despite the name, the nodes returned are the ones that match.

    file -- document source handed to xml_util.xml.
    datastreamId -- value compared with the @ID attribute.
    """
    query = "//*[local-name()='datastream'][@ID='%s']" % datastreamId
    print(query)
    # The document object is kept in a local (and deliberately not closed)
    # because the returned nodes belong to it.
    doc = xml_util.xml(file)
    matches = doc.getNodes(query)
    for match in matches:
        print(match)
    return matches
def test_getPublicationDate5():
    """Check the three date extractors against archive item 2's marc4.xml."""
    marcDoc = xml_util.xml(
        "create_marc_controlfield_tag_008_test_archive/2/marc4.xml")
    published = getPublicationDate(marcDoc)
    alternate = getAlternateDate(marcDoc)
    published046 = getPublicationDate046(marcDoc)
    assert published == "1260"
    assert alternate == "1963"
    assert published046 == "1952"
def cleanTitle(input):
    """Strip trailing ":" characters from every 245$a subfield of ``input``.

    Each title is printed before it is cleaned, and the document is saved
    back to the same path.

    input -- path of the MARC XML file (modified in place).
    """
    marcDoc = xml_util.xml(input)
    titleNodes = marcDoc.getNodes(
        "//*[local-name()='datafield'][@tag='245']/*[local-name()='subfield'][@code='a']"
    )
    for titleNode in titleNodes:
        print(titleNode.content)
        titleNode.setContent(titleNode.content.rstrip(":"))
    marcDoc.saveFile(input)
    marcDoc.close()
def cleanKeyword(input):
    """Remove every "]" and "." from the 650$x subfields of ``input``.

    Each keyword is printed before it is cleaned, and the document is saved
    back to the same path.

    input -- path of the MARC XML file (modified in place).
    """
    marcDoc = xml_util.xml(input)
    keywordNodes = marcDoc.getNodes(
        "//*[local-name()='datafield'][@tag='650']/*[local-name()='subfield'][@code='x']"
    )
    for keywordNode in keywordNodes:
        print(keywordNode.content)
        cleaned = keywordNode.content.replace("]", "").replace(".", "")
        keywordNode.setContent(cleaned)
    marcDoc.saveFile(input)
    marcDoc.close()
Exemple #12
0
def getIdentifier(dcString):
    """Return the content of the dc:identifier element of ``dcString``.

    The document is loaded with the "dc" and "xsi" namespace prefixes
    registered and the node returned by getNode("//dc:identifier") is read.

    dcString -- Dublin Core document source handed to xml_util.xml.

    Returns the identifier text, or None (after printing the error and
    "Unable to find contents") when the lookup fails for any reason.
    """
    dcxml = xml_util.xml(
        dcString,
        [("dc", "http://purl.org/dc/elements/1.1/"),
         ("xsi", "http://www.w3.org/2001/XMLSchema-instance")])
    try:
        return dcxml.getNode("//dc:identifier").content
    except Exception as errorInfo:
        print(errorInfo)
        print("Unable to find contents")
        return None
    finally:
        # The document was previously never released on either path.
        dcxml.close()
def iterate(archiveName):
    """Rewrite the 008 controlfield of every item's marc.xml in an archive.

    For each item of the DSpace archive ``archiveName`` the marc.xml stream is
    loaded, createTag computes the new tag, the content of the controlfield
    with tag "008" is replaced by it and the file is saved in place.
    Progress messages are printed along the way.

    archiveName -- path of the archive; also the base of each stream path.
    """
    archive = dspaceArchive(archiveName)
    for item in archive.items:
        print(item.name + " is being processed")
        relPath = item.getRelPathToStream('marc.xml')
        marcPath = os.path.join(archiveName, relPath)
        print(marcPath)
        marcDoc = xml_util.xml(marcPath)
        print(marcDoc)
        newTag = createTag(marcDoc)
        controlfield = marcDoc.getNode(
            "//*[local-name()='controlfield'][@tag='008']")
        controlfield.setContent(newTag)
        marcDoc.saveFile(marcPath)
        print(item.name + " controlfield tag 008, update complete")
        marcDoc.close()
    print("Creating of marc controlfield tag[s] is complete")
def addMarcTag(input, recordType):
    """Insert record-type specific datafields before the first ``</record>``.

    The MARC XML file at ``input`` is read as bytes, the markup selected by
    ``recordType`` is inserted immediately before the first ``</record>`` end
    tag and the file is rewritten in place.

    input -- path of the MARC XML file (modified in place).
    recordType -- selects what is added:
        "B"  a 655 "Brunner digitised document" field and a 540 "PART III"
             copyright statement;
        "E"  a 655 "Australasian Digital Thesis" field, unless a 655$a
             subfield already contains that text;
        "W"  a 540 publisher-permission statement;
        "L"  a 540 field whose content is "*".
        Any other value leaves the file untouched.

    Returns None.  A file without ``</record>`` is left unchanged and a
    warning is printed.
    """
    # Read everything up front and release the handle before the same path is
    # reopened for writing.
    source = open(input, 'rb')
    try:
        marc = source.read()
    finally:
        source.close()

    # The file is handled in binary mode, so the inserted markup is kept as
    # byte strings (plain ``str`` on Python 2).
    marcString = None
    if recordType == "B":
        marcString = b"""<datafield tag="655" ind1=" " ind2="7">
        <subfield code="a">Brunner digitised document</subfield>
        <subfield code="2">local</subfield>
        </datafield>
        <datafield tag="540" ind1=" " ind2=" ">
        <subfield code="a">PART III. After reasonable investigation, this material has been reproduced in reliance on Part III of the Australian Copyright Act 1968. The electronic form of this material is Copyright Macquarie University, Sydney. Please contact the Macquarie University Copyright Unit with inquiries www.copyright.mq.edu.au</subfield>
        </datafield>"""
    elif recordType == "E":
        tagPresent = False
        xml = xml_util.xml(input)
        try:
            nodes = xml.getNodes(
                "//*[local-name()='datafield'][@tag='655']/*[local-name()='subfield'][@code='a']"
            )
            for node in nodes:
                # The previous check compared the bound method itself with the
                # string, which made every 655$a count as "already tagged".
                if "Australasian Digital Thesis" in node.getContent():
                    tagPresent = True
                    break
        finally:
            xml.close()
        if not tagPresent:
            marcString = b"""<datafield tag="655" ind1=" " ind2=" ">
            <subfield code="a">Australasian Digital Thesis</subfield>
            </datafield>"""
    elif recordType == "W":
        marcString = b"""<datafield tag="540" ind1=" " ind2=" ">
        <subfield code="a">Permission for use provided to the Macquarie University Digital Repository by the publisher.</subfield>
        </datafield>"""
    elif recordType == "L":
        marcString = b"""<datafield tag="540" ind1=" " ind2=" ">
        <subfield code="a">*</subfield>
        </datafield>"""

    if marcString is None:
        return

    # Split at the first end tag only, so anything after it (including
    # further records) is written back instead of being dropped.
    pieces = marc.split(b"</record>", 1)
    if len(pieces) != 2:
        print("Warning: no </record> found in %s; file left unchanged."
              % (input,))
        return

    target = open(input, 'wb')
    try:
        target.write(pieces[0] + marcString + b"</record>" + pieces[1])
    finally:
        target.close()
def obtain_files(dspace_archive,
                 filename,
                 fileType,
                 protocol="false",
                 username="******",
                 password="******"):
    """Download linked files of one type into each item of a DSpace archive.

    For every item the XML file ``filename`` in the item's directory is
    searched for text nodes starting with 'http'; those ending in
    ``fileType`` are cleaned with cleanUrl and fetched.  Fetched content is
    stored on the item as a "bundle:ORIGINAL" stream named by
    get_harvestedFileName; "no content" is printed when nothing comes back.

    dspace_archive -- path of the archive; nothing happens if it is missing.
    filename -- name of the XML file inside each item directory.
    fileType -- suffix a url must end with to be downloaded.
    protocol -- the string "false" selects getFileNoAuth; any other value is
        passed to getFile together with ``username`` and ``password``.
    """
    if not os.path.exists(dspace_archive):
        return

    archive = dspaceArchive(dspace_archive)
    # loop through items
    for item in archive.items:
        print("Processing Item " + item.name)
        metadata = xml_util.xml(os.path.join(item.dir, filename))

        # match all urls, then keep the ones of the requested type
        textNodes = metadata.getNodes("//text()[starts-with(.,'http')]")
        wanted = [n.content for n in textNodes
                  if n.content.endswith(fileType)]

        # get urls
        for urlItem in wanted:
            harvestUrl = cleanUrl(urlItem)
            if protocol == "false":
                payload = getFileNoAuth(harvestUrl)
            else:
                payload = getFile(harvestUrl, protocol, username, password)
            if payload != None:
                streamName = get_harvestedFileName(urlItem)
                item.newStream(streamName, "bundle:ORIGINAL", payload)
            else:
                print("no content")

        metadata.close()
Exemple #16
0
def removeXmlNode(dspaceArchiveName, filename, xpath):
    """Delete matching nodes that point at "/public/" or "ethesis.php".

    For every item of the archive the XML file ``filename`` is loaded and the
    nodes selected by ``xpath`` are inspected; a node is deleted when its
    content contains "/public/" or "ethesis.php", and the document is saved
    after each deletion.  Errors raised while deleting or saving are printed
    and processing continues.

    dspaceArchiveName -- path of the archive; nothing further happens if it
        does not exist (the name itself is always printed).
    filename -- name of the XML file inside each item directory.
    xpath -- expression selecting the candidate nodes.
    """
    print(dspaceArchiveName)
    if not os.path.exists(dspaceArchiveName):
        return

    arc = dspaceArchive(dspaceArchiveName)
    for item in arc.items:
        print("Processing " + item.name + ".")
        filePath = os.path.join(item.dir, filename)
        print(filePath)
        doc = xml_util.xml(filePath)
        for candidate in doc.getNodes(xpath):
            text = candidate.getContent()
            if text.find("/public/") == -1 and text.find("ethesis.php") == -1:
                continue
            try:
                candidate.delete()
                print("Successfully removed node")
                # NOTE(review): saveFile is called without a path here, unlike
                # elsewhere in this code base -- confirm it defaults to the
                # loaded file.
                doc.saveFile()
            except Exception as errorInfo:
                print(errorInfo)
                print("The xpath " + xpath + " did not match a node in " + filePath + ".")
        print("Processing complete.")
        doc.close()
def test_getPublicationDate6():
    """getAlternateDate on archive item 2's marc5.xml must be "1963"."""
    marcDoc = xml_util.xml(
        "create_marc_controlfield_tag_008_test_archive/2/marc5.xml")
    alternate = getAlternateDate(marcDoc)
    assert alternate == "1963"
def test_createTag2():
    """Check the 008 tag createTag builds for archive item 2's marc4.xml."""
    marcDoc = xml_util.xml(
        "create_marc_controlfield_tag_008_test_archive/2/marc4.xml")
    actual = createTag(marcDoc)
    expected = getTestDate() + "m12601963|||                 eng|d"
    assert actual == expected
def test_createTag3():
    """Check the 008 tag createTag builds for archive item 2's marc2.xml."""
    marcDoc = xml_util.xml(
        "create_marc_controlfield_tag_008_test_archive/2/marc2.xml")
    actual = createTag(marcDoc)
    expected = getTestDate() + "n|||||||||||                 eng|d"
    assert actual == expected
def test_createTag():
    """Check the 008 tag createTag builds for archive item 0's marc.xml."""
    marcDoc = xml_util.xml(
        "create_marc_controlfield_tag_008_test_archive/0/marc.xml")
    actual = createTag(marcDoc)
    expected = getTestDate() + "t2002|||||||                 fr |d"
    assert actual == expected
Exemple #21
0
def obtainFiles(dspace_archive, filename, pathToDeadLinksFile,  protocol="false", username="******", password="******"):
    """Harvest every linked file of an archive and report the dead links.

    For each item the XML file ``filename`` is searched for text nodes that
    contain 'http'.  Texts starting with "http://" or "https://" are
    downloaded; all others are recorded as dead links.  A successful download
    is stored on the item as a "bundle:ORIGINAL" stream (under a shortened
    file name) and registered through addAttachmentDataToValetXml.  Failed
    downloads are recorded as dead links too.  After each item the dead-link
    report is written with the item's //session content and then reset.  The
    total number of downloads is printed at the end.

    dspace_archive -- path of the archive; items are only processed if it
        exists.
    filename -- name of the XML file inside each item directory.
    pathToDeadLinksFile -- handed to DeadLinksFile for the report.
    protocol -- the string "false" means no authentication; any other value
        is passed to getFile together with ``username`` and ``password``.
    """
    downloadCounter = 0
    # create new file for dead links
    deadLinkReport = DeadLinksFile(pathToDeadLinksFile)
    if os.path.exists(dspace_archive):
        archive = dspaceArchive(dspace_archive)
        # loop through items
        for item in archive.items:
            print("Processing Item " + item.name)
            fullPath = os.path.join(item.dir, filename)
            xml = xml_util.xml(fullPath)

            # match all urls
            nodes = xml.getNodes("//text()[contains(.,'http')]")
            sessionNodeContent = xml.getNode("//session").content

            # qualify urls and store in list
            fileList = []
            for node in nodes:
                if node.content.startswith(("http://", "https://")):
                    fileList.append(node.content)
                    continue
                deadLinkReport.reportToScreen(node.content)
                deadLinkReport.addDeadLinkT(node.content)
                print("\n\n")

            # iterate through list of urls
            for urlItem in fileList:
                harvestUrl = cleanUrl(urlItem)
                # determine if link is a downloadable non xml datastream or
                # just a html page
                isHtml = determineMimeType(harvestUrl)
                if isHtml != "false":
                    content = getRedirectedFile(harvestUrl)
                elif protocol != "false":
                    # the user supplied credentials for basic auth
                    content = getFile(harvestUrl, protocol, username, password)
                elif harvestUrl.startswith("https://"):
                    content = getFileHttps(harvestUrl)
                else:
                    content = getFileNoAuth(harvestUrl)

                if content != None:
                    harvestedFileName = getHarvestedFileName(harvestUrl)
                    # shorten filename so that file system does not complain
                    correctlySizedFileName = shortenFileName(harvestedFileName)
                    # add datastream to dspace archive
                    item.newStream(correctlySizedFileName, "bundle:ORIGINAL", content)
                    # append valet xml so that it knows about the new datastream
                    addAttachmentDataToValetXml(xml, correctlySizedFileName, fullPath)
                    downloadCounter += 1
                else:
                    deadLinkReport.reportToScreen(harvestUrl)
                    deadLinkReport.addDeadLinkT(harvestUrl)
                    print("\n\n")

            xml.close()
            deadLinkReport.report(sessionNodeContent, item.name)
            deadLinkReport.reset()
        print("\n\n")
    deadLinkReport.closeFile()
    print("total number of downloads = ")
    print(downloadCounter)
def test_getPublicationDate4():
    """getPublicationDate on archive item 2's marc3.xml must be "1234"."""
    marcDoc = xml_util.xml(
        "create_marc_controlfield_tag_008_test_archive/2/marc3.xml")
    published = getPublicationDate(marcDoc)
    assert published == "1234"
xsl/marc_dc.xsl dublin_core.xml dspaceArchive False
"""

import libxml2, urllib2, urlparse, sys, os, os.path, subprocess, re, unicodedata

sys.path.append("utils")
sys.path.append("dspace_archive")
import diff_util, xml_util, xslt_util
from dspace_archive import *

# Load an existing DSpace archive and transform one metadata stream of every
# item with an XSLT stylesheet.
#
# Positional command-line arguments:
#   1  inputFileName                 - stream read from each item
#   2  XslFilePath                   - stylesheet applied to that stream
#   3  outputFileName                - passed on to setDublinCoreStream
#   4  TargetArchiveName             - archive opened with dspaceArchive
#   5  removeInputFileAfterTransform - passed on to setDublinCoreStream
#
# NOTE(review): argument 5 is forwarded as the raw command-line string, so a
# value such as "False" is truthy if setDublinCoreStream tests it as a
# boolean -- confirm how it is interpreted there.
inputFileName = sys.argv[1]
XslFilePath = sys.argv[2]
outputFileName = sys.argv[3]
TargetArchiveName = sys.argv[4]
removeInputFileAfterTransform = sys.argv[5]
archive = dspaceArchive(TargetArchiveName)
for item in archive.items:
    print "Processing item: " + item.name
    # Parse the item's metadata and load the stylesheet (reloaded per item).
    metaString = item.readFile(inputFileName)
    meta = xml_util.xml(metaString)
    xslt = xslt_util.xslt(XslFilePath)
    temp = meta.applyXslt(xslt)
    # Serialize the result before the documents are closed.
    dublinCore = str(temp)
    temp.close()
    meta.close()
    xslt.close()
    item.setDublinCoreStream(dublinCore, inputFileName, outputFileName,
                             removeInputFileAfterTransform)
print "Transformation complete"