Example 1
0
def treeToAscii_Elsevier(tree):
    """Try to convert an Elsevier XML tree to plain ascii text.

    Returns a (text, mimeType) tuple. Prefers the pre-extracted raw-text
    element under document-properties (returned as "text/plain"); otherwise
    falls back to converting the main article element ("text/xml").
    Returns (None, None) when no main article tag can be found.
    """
    logging.debug("Converting elsevier tree to ascii text")
    dp = tree.find("document-properties")
    if dp is not None:
        rawTextEl = dp.find("raw-text")
        if rawTextEl is not None:
            rawText = rawTextEl.text
            if rawText is not None:
                try:
                    # raw-text is typically utf8 that was mis-decoded as
                    # latin1 upstream; round-trip to recover it
                    asciiText = rawText.encode('latin1').decode('utf8')
                except (UnicodeEncodeError, UnicodeDecodeError):
                    # round-trip failed: fall back to best-effort decoding
                    asciiText = pubGeneric.forceToUnicode(rawText)
                return asciiText, "text/plain"

    articleEl, articleType = findMainArticleTag(tree)
    if articleEl is None:
        return None, None

    asciiText = pubXml.treeToAsciiText(articleEl, addNewlineTags=elsNewlineTags)
    return asciiText, "text/xml"
Example 2
0
def treeToAscii_Elsevier(tree):
    """Convert an Elsevier XML tree to plain ascii text.

    Yields (text, mimeType): the raw-text field under document-properties
    when available, else the converted main article element.
    """
    logging.debug("Converting elsevier tree to ascii text")

    # look for pre-extracted raw text under document-properties
    rawText = None
    propsEl = tree.find("document-properties")
    if propsEl is not None:
        rawEl = propsEl.find("raw-text")
        if rawEl is not None:
            rawText = rawEl.text

    if rawText is not None:
        try:
            plainText = rawText.encode('latin1').decode('utf8')
        except UnicodeEncodeError:
            plainText = pubGeneric.forceToUnicode(rawText)
        except UnicodeDecodeError:
            plainText = pubGeneric.forceToUnicode(rawText)
        return plainText, "text/plain"

    # no raw text: convert the main article element instead
    articleEl, articleType = findMainArticleTag(tree)
    if articleEl is None:
        return None, None

    xmlText = pubXml.treeToAsciiText(articleEl, addNewlineTags=elsNewlineTags)
    return xmlText, "text/xml"
Example 3
0
def convertHtmlToDicts(url, content):
    r"""Given a url and its html content, create file and article dictionaries.

    content has to include normal newlines, no \a or #N# replacers.

    Returns (artDict, fileDict), or (None, None) on error (no <html> tag
    or unparseable markup).
    """
    # lxml does not like unicode if the document has an explicit encoding
    if " encoding=" not in content:
        content = pubGeneric.forceToUnicode(content)
    logging.debug("Converting to text: %s " % (repr(url)))
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url)

    if "<html" not in content:
        return None, None

    try:
        logging.debug("Parsing html with lxml, html size %d" % len(content))
        tree = lxml.html.document_fromstring(content)
        logging.debug("end parse html")
    except lxml.etree.XMLSyntaxError:
        return None, None

    # pull the page title from <head>, if there is one
    titleEl = tree.find("head/title")
    if titleEl is not None:
        title = titleEl.text
    else:
        logging.debug("No title found?")
        title = ""

    metaTags = tree.findall("head/meta")
    artDict = parseMetaData(metaTags, artDict)

    # strip scripts, styles, meta tags, embedded objects and page structure
    logging.debug("Cleaning html tree")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.meta = True
    cleaner.embedded = True
    cleaner.page_structure = True
    cleanTree = cleaner.clean_html(tree)

    logging.debug("Cleaning done, now converting to ASCII")
    newlineTags = ["p", "br"]
    asciiText = pubXml.treeToAsciiText(cleanTree, newlineTags)
    logging.debug("ASCII conversion done")
    logging.debug("title: %s" % title)

    # fall back to the <title> tag if the meta data gave no title
    if "title" not in artDict or artDict["title"] == "":
        artDict["title"] = title

    # fall back to the first 1500 chars of the text as the abstract
    if artDict["abstract"] == "":
        abstract = unidecode.unidecode(asciiText[0:1500]).strip()
        artDict["abstract"] = abstract

    logging.debug("abstract: %s" % artDict["abstract"])
    fileDict = pubStore.createEmptyFileDict(url=url, content=asciiText, mimeType="text/html")
    logging.debug("meta data extract success: %s" % artDict)
    return artDict, fileDict
Example 4
0
def convertHtmlToDicts(url, content):
    r"""Build (articleDict, fileDict) for a url and its raw html content.

    content must use normal newlines, no \a or #N# replacers.

    Returns (None, None) on error.
    """
    # lxml does not like unicode when the document declares an encoding
    if " encoding=" not in content:
        content = pubGeneric.forceToUnicode(content)
    logging.debug("Converting to text: %s " % (repr(url)))
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url)

    if not "<html" in content:
        return None, None

    try:
        logging.debug("Parsing html with lxml, html size %d" % len(content))
        docTree = lxml.html.document_fromstring(content)
        logging.debug("end parse html")
    except lxml.etree.XMLSyntaxError:
        return None, None

    titleNode = docTree.find("head/title")
    if titleNode is None:
        logging.debug("No title found?")
        title = ""
    else:
        title = titleNode.text

    artDict = parseMetaData(docTree.findall("head/meta"), artDict)

    logging.debug("Cleaning html tree")
    htmlCleaner = Cleaner()
    htmlCleaner.javascript = True
    htmlCleaner.style = True
    htmlCleaner.meta = True
    htmlCleaner.embedded = True
    htmlCleaner.page_structure = True
    strippedTree = htmlCleaner.clean_html(docTree)

    logging.debug("Cleaning done, now converting to ASCII")
    asciiText = pubXml.treeToAsciiText(strippedTree, ["p", "br"])
    logging.debug("ASCII conversion done")
    logging.debug("title: %s" % title)

    # the <head> title is a fallback for a missing meta-data title
    if "title" not in artDict or artDict["title"] == "":
        artDict["title"] = title

    # the text prefix is a fallback for a missing abstract
    if artDict["abstract"] == "":
        artDict["abstract"] = unidecode.unidecode(asciiText[0:1500]).strip()

    logging.debug("abstract: %s" % artDict["abstract"])
    fileDict = pubStore.createEmptyFileDict(url=url, content=asciiText, mimeType="text/html")
    logging.debug("meta data extract success: %s" % artDict)
    return artDict, fileDict
Example 5
0
def parseXml(tree, data):
    """
    Use elementTree to parse Springer A++ XML, filling the dict `data`.

    Returns the filled dict, or None when the ArticleHeader element is
    missing (not a successful parse).
    """
    logging.debug("Parsing Springer fields from tree")

    data["source"] = "springer"
    journalEl = tree.find("Journal")
    jiEl = journalEl.find("JournalInfo")
    data["printIssn"] = findText(jiEl, "JournalPrintISSN")
    data["eIssn"] = findText(jiEl, "JournalElectronicISSN")
    data["journal"] = findText(jiEl, "JournalTitle")
    subjGroupEl = jiEl.find("JournalSubjectGroup")

    # journal-level subjects go into the keyword list, too
    keywords = []
    if subjGroupEl is not None:
        for kwEl in subjGroupEl.findall("JournalSubject"):
            keywords.append(kwEl.text)

    volEl = journalEl.find("Volume/VolumeInfo")
    data["vol"] = findText(volEl, "VolumeIDStart")

    issEl = journalEl.find("Volume/Issue/IssueInfo")
    data["issue"] = findText(issEl, "IssueIDStart")
    # prefer the online date, then print date, then cover date
    data["year"] = findText(issEl, "IssueHistory/OnlineDate/Year")
    if data["year"] == "":
        data["year"] = findText(issEl, "IssueHistory/PrintDate/Year")
    if data["year"] == "":
        data["year"] = findText(issEl, "IssueHistory/CoverDate/Year")

    artEl = journalEl.find("Volume/Issue/Article/ArticleInfo")
    doi = findText(artEl, "ArticleDOI")
    data["doi"] = doi
    titleEl = artEl.find("ArticleTitle")
    if titleEl is not None:
        data["title"] = pubXml.treeToAsciiText(titleEl)
    else:
        data["title"] = ""

    data["articleType"] = findText(artEl, "ArticleCategory")
    if data["articleType"] is None:
        data["articleType"] = "unknown"

    data["page"] = findText(artEl, "ArticleFirstPage")

    springerBaseUrl = "http://link.springer.com/article/"
    data["fulltextUrl"] = springerBaseUrl + doi

    headEl = journalEl.find("Volume/Issue/Article/ArticleHeader")
    if headEl is None:
        logging.error("No ArticleHeader element")
        return None

    auGroupEl = headEl.find("AuthorGroup")
    names = []
    emails = []
    for authEl in auGroupEl.findall("Author"):
        givenNames = [g.text for g in authEl.findall("AuthorName/GivenName")]
        givenNames = [x for x in givenNames if x is not None]
        givenName = " ".join(givenNames)
        famName = findText(authEl, "AuthorName/FamilyName")
        if famName is None:
            famName = ""
        names.append(famName + ", " + givenName)

        emailEl = authEl.find("Contact/Email")
        if emailEl is not None:
            emails.append(emailEl.text)
    data["authors"] = "; ".join(names)

    emails = [e for e in emails if e is not None]
    data["authorEmails"] = "; ".join(emails)

    abEl = headEl.find("Abstract")
    if abEl is None:
        data["abstract"] = ""
    else:
        abParts = []
        headCount = 0
        for childEl in abEl.iter():
            if childEl.tag == "Heading":
                # skip first header, is always "Abstract"
                if headCount == 0:
                    headCount += 1
                    continue
                # we are now in some sort of named section
                if childEl.text is not None:
                    abParts.append("<b>" + childEl.text + ": <b>")
            elif childEl.tag == "Para":
                abParts.append(pubXml.treeToAsciiText(childEl))
        abstract = "".join(abParts)
        # remove a trailing "<p>" tag: rstrip("<p>") was wrong here, as it
        # strips any trailing '<', 'p' or '>' CHARACTERS from the text itself
        while abstract.endswith("<p>"):
            abstract = abstract[:-len("<p>")]
        data["abstract"] = abstract

    kwGroupEl = headEl.find("KeywordGroup")
    if kwGroupEl is not None:
        for kwEl in kwGroupEl.findall("Keyword"):
            keywords.append(kwEl.text)
    # keywords must not contain the "; " list separator themselves
    keywords = [k.replace(";", ",") for k in keywords if k is not None]
    data["keywords"] = "; ".join(keywords)

    data["publisher"] = "springer"
    data["externalId"] = data["doi"]
    return data
Example 6
0
def parseXml(tree, data):
    """
    Use elementTree to parse Springer A++, fill dict data with results.

    Returns the filled dict, or None if not successful (missing
    ArticleHeader element).
    """
    logging.debug("Parsing Springer fields from tree")

    data["source"] = "springer"
    journalEl = tree.find("Journal")
    jiEl = journalEl.find("JournalInfo")
    data["printIssn"] = findText(jiEl, "JournalPrintISSN")
    data["eIssn"] = findText(jiEl, "JournalElectronicISSN")
    data["journal"] = findText(jiEl, "JournalTitle")
    subjGroupEl = jiEl.find("JournalSubjectGroup")

    # collect journal subjects; article keywords are appended further down
    keywords = []
    if subjGroupEl is not None:
        for kwEl in subjGroupEl.findall("JournalSubject"):
            keywords.append(kwEl.text)

    volEl = journalEl.find("Volume/VolumeInfo")
    data["vol"] = findText(volEl, "VolumeIDStart")

    issEl = journalEl.find("Volume/Issue/IssueInfo")
    data["issue"] = findText(issEl, "IssueIDStart")
    # year: online date first, then print date, then cover date
    data["year"] = findText(issEl, "IssueHistory/OnlineDate/Year")
    if data["year"] == "":
        data["year"] = findText(issEl, "IssueHistory/PrintDate/Year")
    if data["year"] == "":
        data["year"] = findText(issEl, "IssueHistory/CoverDate/Year")

    artEl = journalEl.find("Volume/Issue/Article/ArticleInfo")
    doi = findText(artEl, "ArticleDOI")
    data["doi"] = doi
    titleEl = artEl.find("ArticleTitle")
    if titleEl is not None:
        data["title"] = pubXml.treeToAsciiText(titleEl)
    else:
        data["title"] = ""

    data["articleType"] = findText(artEl, "ArticleCategory")
    if data["articleType"] is None:
        data["articleType"] = "unknown"

    data["page"] = findText(artEl, "ArticleFirstPage")

    springerBaseUrl = "http://link.springer.com/article/"
    data["fulltextUrl"] = springerBaseUrl + doi

    headEl = journalEl.find("Volume/Issue/Article/ArticleHeader")
    if headEl is None:
        logging.error("No ArticleHeader element")
        return None

    auGroupEl = headEl.find("AuthorGroup")
    names = []
    emails = []
    for authEl in auGroupEl.findall("Author"):
        givenNames = []
        for givenEl in authEl.findall("AuthorName/GivenName"):
            givenNames.append(givenEl.text)
        givenNames = [x for x in givenNames if x is not None]
        givenName = " ".join(givenNames)
        famName = findText(authEl, "AuthorName/FamilyName")
        if famName is None:
            famName = ""
        name = famName + ", " + givenName
        names.append(name)

        emailEl = authEl.find("Contact/Email")
        if emailEl is not None:
            emails.append(emailEl.text)
    data["authors"] = "; ".join(names)

    emails = [e for e in emails if e is not None]
    data["authorEmails"] = "; ".join(emails)

    abEl = headEl.find("Abstract")
    if abEl is None:
        data["abstract"] = ""
    else:
        abParts = []
        headCount = 0
        for childEl in abEl.iter():
            if childEl.tag == "Heading":
                # skip first header, is always "Abstract"
                if headCount == 0:
                    headCount += 1
                    continue
                else:
                    # we are now in some sort of named section
                    if childEl.text is not None:
                        abParts.append("<b>" + childEl.text + ": <b>")
            elif childEl.tag == "Para":
                abParts.append(pubXml.treeToAsciiText(childEl))
        abstract = "".join(abParts)
        # drop a trailing literal "<p>" tag; the old rstrip("<p>") stripped
        # any run of '<', 'p', '>' characters, corrupting abstracts that
        # legitimately end in those characters
        while abstract.endswith("<p>"):
            abstract = abstract[:-3]
        data["abstract"] = abstract

    kwGroupEl = headEl.find("KeywordGroup")
    if kwGroupEl is not None:
        for kwEl in kwGroupEl.findall("Keyword"):
            keywords.append(kwEl.text)
    # semicolons would clash with the "; " separator used below
    keywords = [k.replace(";", ",") for k in keywords if k is not None]
    data["keywords"] = "; ".join(keywords)

    data["publisher"] = "springer"
    data["externalId"] = data["doi"]
    return data