def treeToAscii_Elsevier(tree):
    """ try to convert an elsevier XML file to normal ascii text """
    logging.debug("Converting elsevier tree to ascii text")
    asciiText = ""
    dp = tree.find("document-properties")
    if dp is not None:
        rawTextEl = dp.find("raw-text")
        if rawTextEl is not None:
            rawText = rawTextEl.text
            if rawText is not None:
                try:
                    asciiText = rawText.encode('latin1').decode('utf8')
                except (UnicodeEncodeError, UnicodeDecodeError):
                    asciiText = pubGeneric.forceToUnicode(rawText)
                #logging.debug("ascii is %s" % repr(rawText))
                return asciiText, "text/plain"

    articleEl, articleType = findMainArticleTag(tree)
    if articleEl is None:
        return None, None
    asciiText = pubXml.treeToAsciiText(articleEl, addNewlineTags=elsNewlineTags)
    return asciiText, "text/xml"
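
# Minimal usage sketch, not part of the original module: the file name below is a
# placeholder, and the Elsevier XML is assumed to match what the function expects
# (a document-properties/raw-text element, or a main article tag found by
# findMainArticleTag).
def _exampleElsevierToAscii(xmlFname="elsevierArticle.xml"):
    from lxml import etree
    tree = etree.parse(xmlFname).getroot()
    asciiText, mimeType = treeToAscii_Elsevier(tree)
    return asciiText, mimeType
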
def convertHtmlToDicts(url, content):
    """ given a url and content, create file and article dictionaries
    content has to include normal newlines, no \a or #N# replacers

    returns None, None on error
    """
    # lxml does not like unicode if the document has an explicit encoding
    if " encoding=" not in content:
        content = pubGeneric.forceToUnicode(content)
    logging.debug("Converting to text: %s " % (repr(url)))
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url)

    if "<html" not in content:
        return None, None

    try:
        logging.debug("Parsing html with lxml, html size %d" % len(content))
        tree = lxml.html.document_fromstring(content)
        logging.debug("end parse html")
    except lxml.etree.XMLSyntaxError:
        return None, None

    titleEl = tree.find("head/title")
    if titleEl is not None:
        title = titleEl.text
    else:
        logging.debug("No title found?")
        title = ""

    metaTags = tree.findall("head/meta")
    artDict = parseMetaData(metaTags, artDict)

    logging.debug("Cleaning html tree")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.meta = True
    cleaner.embedded = True
    cleaner.page_structure = True
    #cleaner.remove_tags = ["a", "li", "td"]
    cleanTree = cleaner.clean_html(tree)
    logging.debug("Cleaning done, now converting to ASCII")

    #text = cleanTree.text_content()
    newlineTags = ["p", "br"]
    asciiText = pubXml.treeToAsciiText(cleanTree, newlineTags)
    logging.debug("ASCII conversion done")

    logging.debug("title: %s" % title)
    if "title" not in artDict or artDict["title"] == "":
        artDict["title"] = title

    if artDict["abstract"] == "":
        abstract = unidecode.unidecode(asciiText[0:1500]).strip()
        artDict["abstract"] = abstract
    logging.debug("abstract: %s" % artDict["abstract"])

    fileDict = pubStore.createEmptyFileDict(url=url, content=asciiText, mimeType="text/html")
    logging.debug("meta data extract success: %s" % artDict)
    return artDict, fileDict
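
# Minimal usage sketch, not part of the original module: the file name and URL are
# placeholders; any way of obtaining the raw HTML string works. The page must
# contain an "<html" tag, otherwise convertHtmlToDicts() returns None, None.
def _exampleHtmlToDicts(htmlFname="page.html", url="http://example.com/article"):
    content = open(htmlFname).read()
    artDict, fileDict = convertHtmlToDicts(url, content)
    return artDict, fileDict
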
def parseXml(tree, data):
    """ use elementTree to parse Springer A++, fill dict data with results or None if not successful """
    logging.debug("Parsing Springer fields from tree")
    hasFulltext = False
    data["source"] = "springer"

    journalEl = tree.find("Journal")
    jiEl = journalEl.find("JournalInfo")
    data["printIssn"] = findText(jiEl, "JournalPrintISSN")
    data["eIssn"] = findText(jiEl, "JournalElectronicISSN")
    data["journal"] = findText(jiEl, "JournalTitle")

    subjGroupEl = jiEl.find("JournalSubjectGroup")
    keywords = []
    if subjGroupEl is not None:
        for kwEl in subjGroupEl.findall("JournalSubject"):
            keywords.append(kwEl.text)

    volEl = journalEl.find("Volume/VolumeInfo")
    data["vol"] = findText(volEl, "VolumeIDStart")

    issEl = journalEl.find("Volume/Issue/IssueInfo")
    data["issue"] = findText(issEl, "IssueIDStart")
    data["year"] = findText(issEl, "IssueHistory/OnlineDate/Year")
    if data["year"] == "":
        data["year"] = findText(issEl, "IssueHistory/PrintDate/Year")
    if data["year"] == "":
        data["year"] = findText(issEl, "IssueHistory/CoverDate/Year")

    artEl = journalEl.find("Volume/Issue/Article/ArticleInfo")
    doi = findText(artEl, "ArticleDOI")
    data["doi"] = doi
    titleEl = artEl.find("ArticleTitle")
    if titleEl is not None:
        data["title"] = pubXml.treeToAsciiText(titleEl)
    else:
        data["title"] = ""

    data["articleType"] = findText(artEl, "ArticleCategory")
    if data["articleType"] is None:
        data["articleType"] = "unknown"
    data["page"] = findText(artEl, "ArticleFirstPage")

    springerBaseUrl = "http://link.springer.com/article/"
    data["fulltextUrl"] = springerBaseUrl + doi

    headEl = journalEl.find("Volume/Issue/Article/ArticleHeader")
    if headEl is None:
        logging.error("No ArticleHeader element")
        return None

    auGroupEl = headEl.find("AuthorGroup")
    names = []
    emails = []
    for authEl in auGroupEl.findall("Author"):
        givenNames = []
        for givenEl in authEl.findall("AuthorName/GivenName"):
            givenNames.append(givenEl.text)
        givenNames = [x for x in givenNames if x is not None]
        givenName = " ".join(givenNames)
        famName = findText(authEl, "AuthorName/FamilyName")
        if famName is None:
            famName = ""
        name = famName + ", " + givenName
        names.append(name)

        emailEl = authEl.find("Contact/Email")
        if emailEl is not None:
            emails.append(emailEl.text)

    data["authors"] = "; ".join(names)
    emails = [e for e in emails if e is not None]
    data["authorEmails"] = "; ".join(emails)

    abEl = headEl.find("Abstract")
    if abEl is None:
        #logging.error("No abstract?")
        #return None
        data["abstract"] = ""
    else:
        abParts = []
        headCount = 0
        for childEl in abEl.iter():
            if childEl.tag == "Heading":
                # skip the first heading, it is always "Abstract"
                if headCount == 0:
                    headCount += 1
                    continue
                else:
                    # we are now in some sort of named section
                    if childEl.text is not None:
                        abParts.append("<b>" + childEl.text + ": <b>")
            elif childEl.tag == "Para":
                abParts.append(pubXml.treeToAsciiText(childEl))
                #abParts.append(childEl.text+"<p>")
        data["abstract"] = "".join(abParts).rstrip("<p>")

    kwGroupEl = headEl.find("KeywordGroup")
    #keywords = []
    if kwGroupEl is not None:
        for kwEl in kwGroupEl.findall("Keyword"):
            keywords.append(kwEl.text)
    keywords = [k.replace(";", ",") for k in keywords if k is not None]
    data["keywords"] = "; ".join(keywords)

    data["publisher"] = "springer"
    data["externalId"] = data["doi"]
    return data
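
# Minimal usage sketch, not part of the original module: the file name is a
# placeholder. In the real pipeline the caller passes a pubStore article
# dictionary as "data"; a plain dict is enough here to illustrate the fields
# that parseXml() fills in.
def _exampleSpringerParse(xmlFname="springerArticle.xml"):
    from lxml import etree
    tree = etree.parse(xmlFname).getroot()
    data = {}
    return parseXml(tree, data)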