Example no. 1
0
def convertHtmlToDicts(url, content):
    """ given a url and content, create file and article dictionaries
    content has to include normal newlines, no \a or #N# replacers

    returns None, None on error

    """
    # lxml does not like unicode if the document has an explicit encoding
    if " encoding=" not in content:
        content = pubGeneric.forceToUnicode(content)
    logging.debug("Converting to text: %s " % (repr(url)))
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url)

    if not "<html" in content:
        return None, None

    try:
        logging.debug("Parsing html with lxml, html size %d" % len(content))
        tree = lxml.html.document_fromstring(content)
        logging.debug("end parse html")
    except lxml.etree.XMLSyntaxError:
        return None, None

    titleEl = tree.find("head/title")
    if titleEl!=None:
        title = titleEl.text
    else:
        logging.debug("No title found?")
        title = ""

    metaTags = tree.findall("head/meta")
    artDict = parseMetaData(metaTags, artDict)
    logging.debug("Cleaning html tree")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.meta = True
    cleaner.embedded = True
    cleaner.page_structure=True
    #cleaner.remove_tags = ["a", "li", "td"]
    cleanTree = cleaner.clean_html(tree)
    logging.debug("Cleaning done, now converting to ASCII")
    #text = cleanTree.text_content()
    newlineTags = ["p", "br"]
    asciiText = pubXml.treeToAsciiText(cleanTree, newlineTags)
    logging.debug("ASCII conversion done")
    logging.debug("title: %s" % title)

    if "title" not in artDict or artDict["title"]=="":
        artDict["title"] = title

    if artDict["abstract"]=="":
        abstract = unidecode.unidecode(asciiText[0:1500]).strip()
        artDict["abstract"] = abstract

    logging.debug("abstract: %s" % artDict["abstract"])
    fileDict = pubStore.createEmptyFileDict(url=url, content=asciiText, mimeType="text/html")
    logging.debug("meta data extract success: %s" % artDict)
    return artDict, fileDict
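
A note on the technique: the core of convertHtmlToDicts is standard lxml usage — parse the document, strip scripts, styles, meta tags and embedded content with Cleaner, then flatten the tree to text. Below is a minimal, self-contained sketch of that pipeline, using text_content() in place of pubXml.treeToAsciiText (which is not shown here) and assuming an lxml version that still ships lxml.html.clean (newer releases split it into the separate lxml_html_clean package).

# Minimal sketch of the parse-clean-extract pipeline used above.
# text_content() stands in for pubXml.treeToAsciiText, which is not shown here.
import lxml.html
from lxml.html.clean import Cleaner

def htmlToText(content):
    " parse an HTML string, drop scripts/styles/meta/embedded content, return plain text "
    tree = lxml.html.document_fromstring(content)
    cleaner = Cleaner(javascript=True, style=True, meta=True,
                      embedded=True, page_structure=True)
    cleanTree = cleaner.clean_html(tree)
    return cleanTree.text_content()

print(htmlToText("<html><head><title>t</title></head>"
                 "<body><p>Hello</p><script>var x=1;</script></body></html>"))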
Example no. 2
0
def convertHtmlToDicts(url, content):
    """ given a url and content, create file and article dictionaries 
    content has to include normal newlines, no \a or #N# replacers

    returns None, None on error
    
    """
    # lxml does not like unicode if the document has an explicit encoding
    if " encoding=" not in content:
        content = pubGeneric.forceToUnicode(content)
    logging.debug("Converting to text: %s " % (repr(url)))
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url)

    if not "<html" in content:
        return None, None

    try:
        logging.debug("Parsing html with lxml, html size %d" % len(content))
        tree = lxml.html.document_fromstring(content)
        logging.debug("end parse html")
    except lxml.etree.XMLSyntaxError:
        return None, None

    titleEl = tree.find("head/title")
    if titleEl!=None:
        title = titleEl.text
    else:
        logging.debug("No title found?")
        title = ""
        
    metaTags = tree.findall("head/meta")
    artDict = parseMetaData(metaTags, artDict)
    logging.debug("Cleaning html tree")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.meta = True
    cleaner.embedded = True
    cleaner.page_structure=True 
    #cleaner.remove_tags = ["a", "li", "td"]
    cleanTree = cleaner.clean_html(tree)
    logging.debug("Cleaning done, now converting to ASCII")
    #text = cleanTree.text_content()
    newlineTags = ["p", "br"]
    asciiText = pubXml.treeToAsciiText(cleanTree, newlineTags)
    logging.debug("ASCII conversion done")
    logging.debug("title: %s" % title)

    if "title" not in artDict or artDict["title"]=="":
        artDict["title"] = title

    if artDict["abstract"]=="":
        abstract = unidecode.unidecode(asciiText[0:1500]).strip()
        artDict["abstract"] = abstract

    logging.debug("abstract: %s" % artDict["abstract"])
    fileDict = pubStore.createEmptyFileDict(url=url, content=asciiText, mimeType="text/html")
    logging.debug("meta data extract success: %s" % artDict)
    return artDict, fileDict
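
parseMetaData is called above but not defined in these examples. A hypothetical sketch of what such a helper might look like, assuming it copies common Highwire-style citation_* meta tags into the article dict (the real field names and mapping may differ):

# Hypothetical sketch of a <meta> tag reader; the real parseMetaData may differ.
def parseMetaData(metaTags, artDict):
    # assumed mapping of <meta name="..."> values onto article dict fields
    nameToField = {
        "citation_title": "title",
        "citation_doi": "doi",
        "citation_journal_title": "journal",
        "description": "abstract",
    }
    for metaTag in metaTags:
        name = metaTag.attrib.get("name", "").lower()
        content = metaTag.attrib.get("content", "")
        field = nameToField.get(name)
        if field and content:
            artDict[field] = content
    return artDict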
Example no. 3
0
def convertOneChunk(inIndexFile, outFile):
    """ 
    get files from inIndexFile, parse Xml, 
    write everything to outfile in ascii format
    """ 
    store = pubStore.PubWriterFile(outFile)

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    doi2pmid = None
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    for row in inRows:
        # read line
        i+=1
        articleId, baseDir = row.articleId, row.baseDir
        zipFilename, filename = row.zipFilename, row.filename
        articleId=int(articleId)

        # open file from zipfile
        fullZipPath = join(baseDir, zipFilename)
        zipFile = zipfile.ZipFile(fullZipPath)
        logging.debug("Parsing %s, file %s, %d files left" % (fullZipPath, filename, len(inRows)-i))
        if doi2pmid==None:
            doi2pmid = parseDoi2Pmid(baseDir)
        xmlString = zipFile.open(filename).read()
        xmlTree   = pubXml.etreeFromXml(xmlString)

        # parse xml
        articleData = pubStore.createEmptyArticleDict(publisher="elsevier")
        articleData = parseElsevier(xmlTree, articleData)
        if articleData==None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        articleData["origFile"]="consyn://"+zipFilename+"/"+filename
        if articleData["doi"] in doi2pmid:
           articleData["pmid"] = doi2pmid[articleData["doi"]]

        pii = splitext(basename(filename))[0]
        articleData["externalId"]="PII"+pii
        articleData["fulltextUrl"]="http://www.sciencedirect.com/science/svapps/pii/"+pii

        # convert to ascii
        asciiString, mimeType = treeToAscii_Elsevier(xmlTree)
        if asciiString==None:
            logging.warn("No ASCII for %s / %s" % (zipFilename, filename))
            continue
        store.writeArticle(articleId, articleData)

        # write to output
        fileData = createFileData(articleData, mimeType, asciiString)
        store.writeFile(articleId, (1000*(articleId))+1, fileData, externalId=articleData["externalId"])
        convCount += 1
    logging.info("Converted %d files" % convCount)
    store.close()
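
The per-row work above reduces to: open the zip, read one member, parse it as XML. A self-contained sketch of that step, using lxml.etree.fromstring in place of pubXml.etreeFromXml (not shown) and with the same error handling; the zip path and member name are placeholders.

# Sketch of the zip-member-to-XML-tree step; lxml.etree.fromstring stands in
# for pubXml.etreeFromXml, and the file names are placeholders.
import logging
import zipfile
import lxml.etree

def readXmlFromZip(fullZipPath, filename):
    " return the parsed XML tree for one zip member, or None on a parse error "
    zipFile = zipfile.ZipFile(fullZipPath)
    xmlString = zipFile.open(filename).read()
    try:
        return lxml.etree.fromstring(xmlString)
    except lxml.etree.XMLSyntaxError:
        logging.error("XML parse error in %s, %s" % (fullZipPath, filename))
        return None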
Example no. 4
0
def minimalHtmlToDicts(url, content):
    " a minimalistic article dict filler, does not try to parse the html "
    logging.debug("Falling back to minimal html to text")
    fileDict = pubStore.createEmptyFileDict(url=url, content=content, mimeType="text/html")
    fileDict = pubGeneric.toAsciiEscape(fileDict, mimeType="text/html")
    if fileDict==None or not "content" in fileDict:
        return None, None
    text = fileDict["content"]
    title = unidecode.unidecode(content[:100])
    abstract = unidecode.unidecode(content[100:1000])
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url, \
        title=title, abstract=abstract, externalId=url)
    #if fileDict==None: #continue
    return artDict, fileDict
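
Note that title and abstract here are fixed-width slices of the raw HTML run through unidecode, not of the extracted text. A tiny illustration of that slicing, with a made-up content string:

# Illustration of the fixed-width title/abstract slicing used above (made-up input).
import unidecode

content = u"R\u00e9sum\u00e9: " + u"x" * 2000
title = unidecode.unidecode(content[:100])         # first 100 characters, ASCII-folded
abstract = unidecode.unidecode(content[100:1000])  # next 900 characters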
Example no. 5
0
def minimalHtmlToDicts(url, content):
    " a minimalistic article dict filler, does not try to parse the html "
    logging.debug("Falling back to minimal html to text")
    fileDict = pubStore.createEmptyFileDict(url=url, content=content, mimeType="text/html")
    fileDict = pubGeneric.toAsciiEscape(fileDict, mimeType="text/html")
    if fileDict==None or not "content" in fileDict:
        return None, None
    text = fileDict["content"]
    title = unidecode.unidecode(content[:100])
    abstract = unidecode.unidecode(content[100:1000])
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url, \
        title=title, abstract=abstract, externalId=url)
    #if fileDict==None: #continue
    return artDict, fileDict
Example no. 6
0
def parseMedline(xmlParser):
    """
    Fill the article data dict with PubMed XML data.

    >>> xml = PubmedTestDoc()
    >>> data = parseMedline(maxXml.XmlParser(string=xml))
    >>> del data["time"]
    >>> repr(data)
     "OrderedDict([('articleId', ''), ('externalId', 'PMID20430833'), ('source', ''), ('origFile', ''), ('journal', 'Brain : a journal of neurology'), ('printIssn', '0006-8950'), ('eIssn', '0006-8950'), ('journalUniqueId', '0372537'), ('year', '2010'), ('articleType', 'research-article'), ('articleSection', ''), ('authors', u'Willemsen, Mich\\\\xe9l A; Verbeek, Marcel M'), ('authorEmails', ''), ('authorAffiliations', 'Radboud University Nijmegen Medical Centre, Donders Institute for Brain, Cognition and Behaviour, Department of Paediatric Neurology (820 IKNC), PO Box 9101, 6500 HB Nijmegen, The Netherlands. [email protected]'), ('keywords', 'Age of Onset/Useless Research'), ('title', 'Tyrosine hydroxylase deficiency: a treatable disorder of brain catecholamine biosynthesis.'), ('abstract', 'An infantile onset, progressive, hypokinetic-rigid syndrome with dystonia (type A), and a complex encephalopathy with neonatal onset (type B). Decreased cerebrospinal fluid concentrations of homovanillic acid and c.698G>A and c.707T>C mutations. Carriership of at least one promotor mutation, however, apparently predicts type A tyrosine hydroxylase deficiency. Most patients with tyrosine hydroxylase deficiency can be successfully treated with l-dopa.'), ('vol', '133'), ('issue', 'Pt 6'), ('page', '1810-22'), ('pmid', '20430833'), ('pmcId', ''), ('doi', ''), ('fulltextUrl', 'https://www.ncbi.nlm.nih.gov/pubmed/20430833')])"

    """
    data = pubStore.createEmptyArticleDict()
    #medlineData           = xmlParser.getXmlFirst("MedlineCitation")
    medlineData           = xmlParser
    data["pmid"]          = medlineData.getTextFirst("PMID")
    el = medlineData.getElFirst("PMID", None)
    data["pmidVersion"]   = el.attrib.get("Version", "") if el is not None else ""
    data["externalId"]    = "PMID"+data["pmid"]
    data["fulltextUrl"]   = "https://www.ncbi.nlm.nih.gov/pubmed/%s" % data["pmid"]
    logging.log(5, "PMID %s" % data["pmid"])
    data["medlineCreatedDate"] = getMedlineDate(medlineData, "DateCreated")
    data["medlineCompletedDate"] = getMedlineDate(medlineData, "DateCompleted")
    data["medlineRevisedDate"] = getMedlineDate(medlineData, "DateRevised")
    otherIds         = medlineData.getTextAll("OtherID", reqAttrDict={"Source":"NLM"})
    pmcIds = [i for i in otherIds if i.startswith("PMC")]
    if len(pmcIds) > 0:
        data["pmcId"] = pmcIds[0].split()[0].replace("PMC","")

    artTree               = medlineData.getXmlFirst("Article")

    data["title"]         = getMedlineText(artTree.getXmlAll("ArticleTitle"))

    # handle structured abstracts
    data["abstract"]      = getMedlineText(artTree.getXmlAll("Abstract/AbstractText"))

    if data["abstract"]=="":
        data["abstract"]      = getMedlineText(artTree.getXmlAll("OtherAbstract/AbstractText"))

    data["authorAffiliations"]   = artTree.getTextFirst("Affiliation", default="")
    data["doi"]           = artTree.getTextFirst("ELocationID", default="", reqAttrDict={"EIdType":"doi"})
    data["lang"]   = artTree.getTextFirst("Language", default="")

    data["journalUniqueId"] = medlineData.getTextFirst("MedlineJournalInfo/NlmUniqueID")
    linkingIssn = medlineData.getTextFirst("MedlineJournalInfo/ISSNLinking", default="")

    journalTree = artTree.getXmlFirst("Journal")
    data["eIssn"]       = journalTree.getTextFirst("ISSN", reqAttrDict={"IssnType": 'Electronic'}, default="")
    data["printIssn"]   = journalTree.getTextFirst("ISSN", reqAttrDict={"IssnType": 'Print'}, default="")
    # keep the link ISSN when we have space, e.g. PNAS is not storing the print ISSN anymore, only as link Issn
    if data["printIssn"]=="" and linkingIssn!="":
        data["printIssn"]   = linkingIssn
    if data["eIssn"]=="" and linkingIssn!="":
        data["eIssn"]   = linkingIssn

    data["vol"]         = journalTree.getTextFirst("JournalIssue/Volume", default="")
    data["issue"]       = journalTree.getTextFirst("JournalIssue/Issue", default="")
    data["year"]        = journalTree.getTextFirst("JournalIssue/PubDate/Year", default="")
    if data["year"]=="":
        year = journalTree.getTextFirst("JournalIssue/PubDate/MedlineDate", default="").split()[0]
        if not year.isdigit():
            year = ""
        data["year"] = year
    data["journal"]     = journalTree.getTextFirst("Title", default="")
    data["page"]        = artTree.getTextFirst("Pagination/MedlinePgn", default="")

    authorList  = artTree.getXmlFirst("AuthorList")
    lastNames   = []
    initialList = []
    if authorList!=None:
        authorTrees = authorList.getXmlAll("Author")
        for authorTree in authorTrees:
            lastName = authorTree.getTextFirst("LastName", default="")
            if lastName=="":
                lastName = authorTree.getTextFirst("CollectiveName", default="")
            lastNames.append(lastName)

            initials = authorTree.getTextFirst("ForeName", default="")
            if initials=="":
                initials = authorTree.getTextFirst("Initials", default="")
            initialList.append(initials)

    authors = [lastNames[i]+", "+initialList[i] for i in range(0, min(len(lastNames), len(initialList)))]
    data["authors"]="; ".join(authors)

    articleTypeList = set(artTree.getTextAll("PublicationTypeList/PublicationType"))
    articleTypesString  = ",".join(articleTypeList)

    articleType="research-article"

    noResearchArticleTags = ["Bibliography", "Biography",
        "Case Reports", "Webcasts",
        "Dictionary", "Directory",
        "Editorial", "Festschrift",
        "Patient Education Handout", "Periodical Index",
        "Portraits", "Published Erratum", "Scientific Integrity Review"
        "Congresses"]

    if "Review" in articleTypeList:
       articleType = "review"
    elif "Letter" in articleTypeList:
       articleType = "research-article"
    else:
        for noResearchArticleTag in noResearchArticleTags:
            if noResearchArticleTag in articleTypeList:
                articleType = "other"
                break

    data["articleType"]        = articleType
    #data["pubmedArticleTypes"] = articleTypesString

    logging.log(5, "pubmedArticleTypes %s, articleType %s" % (articleTypesString, articleType))

    meshDescriptors = []
    meshHeadingList       = medlineData.getXmlFirst("MeshHeadingList", default="")
    if meshHeadingList:
        #for meshHeadingDescriptor in meshHeadingList.getTextAll("MeshHeading/DescriptorName", reqAttrDict={"MajorTopicYN":"Y"}):
        for meshHeadingDescriptor in meshHeadingList.getTextAll("MeshHeading/DescriptorName"):
            meshDescriptors.append(meshHeadingDescriptor.strip())

    data["keywords"] = "/".join(meshDescriptors)

    # remove these annoying linebreaks!
    filtData = {}
    for key, val in data.iteritems():
        filtData[key] = val.replace(u'\u2028', ' ')
    return filtData
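
The maxXml.XmlParser wrapper used above is not shown. The same lookups can be sketched with plain lxml on a stripped-down Medline record; the example values come from the doctest above.

# Self-contained sketch of a few of the Medline lookups with plain lxml,
# standing in for the maxXml.XmlParser calls used above.
import lxml.etree

xml = """<MedlineCitation>
  <PMID Version="1">20430833</PMID>
  <Article>
    <Journal><Title>Brain : a journal of neurology</Title></Journal>
    <ArticleTitle>Tyrosine hydroxylase deficiency</ArticleTitle>
  </Article>
</MedlineCitation>"""

tree = lxml.etree.fromstring(xml)
pmid = tree.findtext("PMID", default="")                      # '20430833'
title = tree.findtext("Article/ArticleTitle", default="")     # 'Tyrosine hydroxylase deficiency'
journal = tree.findtext("Article/Journal/Title", default="")  # 'Brain : a journal of neurology'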
Example no. 7
0
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ 
    get files from inIndexFile, parse Xml, 
    write everything to outfile in ascii format
    """ 
    store = pubStore.PubWriterFile(outFile)
    # read all already done IDs
    donePiis = pubGeneric.parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile), basename(outFile).split(".")[0]+".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\texternalId\tpmid\n")

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    #doi2pmid = None
    convCount = 0
    skipCount = 0
    pmidFinder = pubCompare.PmidFinder()
    logging.info("Converting %d files" % len(inRows))
    for row in inRows:
        # read line
        i+=1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename
        articleId=int(articleId)

        pii = splitext(basename(filename))[0]
        if pii in donePiis:
            logging.debug("PII %s has already been converted, skipping" % pii)
            skipCount += 1
            continue
        donePiis.add(pii)

        # open file from zipfile
        fullZipPath = join(zipDir, zipFilename)
        zipFile = zipfile.ZipFile(fullZipPath)
        logging.debug("Parsing %s, file %s, %d files left" % (fullZipPath, filename, len(inRows)-i))
        #if doi2pmid==None:
            #doi2pmid = parseDoi2Pmid(baseDir)
        xmlString = zipFile.open(filename).read()
        try:
            xmlTree   = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
            continue

        # parse xml
        articleData = pubStore.createEmptyArticleDict(publisher="elsevier")
        articleData = parseElsevier(xmlTree, articleData)
        if articleData==None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        articleData["origFile"]=zipFilename+":"+filename
        #if articleData["doi"] in doi2pmid:
           #articleData["pmid"] = doi2pmid[articleData["doi"]]

        articleData["externalId"]=pii
        articleData["fulltextUrl"]="http://www.sciencedirect.com/science/svapps/pii/"+pii
        #articleData["pmid"]  = pmidFinder.lookupPmid(articleData)

        # convert to ascii
        asciiString, mimeType = treeToAscii_Elsevier(xmlTree)
        if asciiString==None:
            logging.warn("No ASCII for %s / %s" % (zipFilename, filename))
            continue
        store.writeArticle(articleId, articleData)

        # write IDs to separate file 
        idRow = [str(articleData["articleId"]), articleData["doi"], articleData["externalId"], str(articleData["pmid"])]
        idFh.write("\t".join(idRow))
        idFh.write("\n")

        # write to output
        fileData = createFileData(articleData, mimeType, asciiString)
        store.writeFile(articleId, (1000*(articleId))+1, fileData, externalId=articleData["externalId"])
        convCount += 1
    logging.info("Converted %d files, skipped %d" % (convCount, skipCount))
    store.close()
    idFh.close()
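
The skip logic depends on the .ids.tab sidecar file: IDs written by earlier runs are loaded first (via pubGeneric.parseDoneIds, not shown) and any PII already present is skipped. A small stand-in sketch of that bookkeeping, assuming a tab-separated file whose third column is the externalId, as in the header written above:

# Stand-in sketch of the done-ID bookkeeping; pubGeneric.parseDoneIds is not shown.
def readDoneIds(idFname, column=2):
    " read already-written external IDs from a tab-separated .ids.tab file "
    doneIds = set()
    try:
        for line in open(idFname):
            if line.startswith("#"):
                continue
            fields = line.rstrip("\n").split("\t")
            if len(fields) > column:
                doneIds.add(fields[column])
    except IOError:
        pass  # no ids file yet: nothing has been converted so far
    return doneIds

# donePiis = readDoneIds("0_00000.ids.tab")   # file name is a placeholder
# for each input row: skip if pii in donePiis, else convert and append a new id row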
Example no. 8
0
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ 
    get files from inIndexFile, parse Xml, 
    write everything to outfile in ascii format
    """
    store = pubStore.PubWriterFile(outFile)
    # read all already done IDs
    donePiis = pubGeneric.parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile),
                   basename(outFile).split(".")[0] + ".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\texternalId\tpmid\n")

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    #doi2pmid = None
    convCount = 0
    skipCount = 0
    pmidFinder = pubCompare.PmidFinder()
    logging.info("Converting %d files" % len(inRows))
    for row in inRows:
        # read line
        i += 1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename
        articleId = int(articleId)

        pii = splitext(basename(filename))[0]
        if pii in donePiis:
            logging.debug("PII %s has already been converted, skipping" % pii)
            skipCount += 1
            continue
        donePiis.add(pii)

        # open file from zipfile
        fullZipPath = join(zipDir, zipFilename)
        zipFile = zipfile.ZipFile(fullZipPath)
        logging.debug("Parsing %s, file %s, %d files left" %
                      (fullZipPath, filename, len(inRows) - i))
        #if doi2pmid==None:
        #doi2pmid = parseDoi2Pmid(baseDir)
        xmlString = zipFile.open(filename).read()
        try:
            xmlTree = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" %
                          (zipFilename, filename))
            continue

        # parse xml
        articleData = pubStore.createEmptyArticleDict(publisher="elsevier")
        articleData = parseElsevier(xmlTree, articleData)
        if articleData == None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        articleData["origFile"] = zipFilename + ":" + filename
        #if articleData["doi"] in doi2pmid:
        #articleData["pmid"] = doi2pmid[articleData["doi"]]

        articleData["externalId"] = pii
        articleData["fulltextUrl"] = "http://www.sciencedirect.com/science/svapps/pii/" + pii
        #articleData["pmid"]  = pmidFinder.lookupPmid(articleData)

        # convert to ascii
        asciiString, mimeType = treeToAscii_Elsevier(xmlTree)
        if asciiString == None:
            logging.warn("No ASCII for %s / %s" % (zipFilename, filename))
            continue
        store.writeArticle(articleId, articleData)

        # write IDs to separate file
        idRow = [
            str(articleData["articleId"]), articleData["doi"],
            articleData["externalId"],
            str(articleData["pmid"])
        ]
        idFh.write("\t".join(idRow))
        idFh.write("\n")

        # write to output
        fileData = createFileData(articleData, mimeType, asciiString)
        store.writeFile(articleId, (1000 * (articleId)) + 1,
                        fileData,
                        externalId=articleData["externalId"])
        convCount += 1
    logging.info("Converted %d files, skipped %d" % (convCount, skipCount))
    store.close()
    idFh.close()
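
The second argument to store.writeFile is a file identifier derived from the article identifier: (1000 * (articleId)) + 1 is the first file slot for that article, presumably leaving room for further files per article. A short illustration with a made-up article ID:

# Illustration of the file-ID expression passed to store.writeFile above.
articleId = 5002000123                 # made-up article ID
firstFileId = (1000 * articleId) + 1   # 5002000123001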
Example no. 9
0
def parseMedline(xmlParser):
    """
    Fill the article data dict with PubMed XML data.

    >>> xml = PubmedTestDoc()
    >>> data = parseMedline(maxXml.XmlParser(string=xml))
    >>> del data["time"]
    >>> repr(data)
     "OrderedDict([('articleId', ''), ('externalId', 'PMID20430833'), ('source', ''), ('origFile', ''), ('journal', 'Brain : a journal of neurology'), ('printIssn', '0006-8950'), ('eIssn', '0006-8950'), ('journalUniqueId', '0372537'), ('year', '2010'), ('articleType', 'research-article'), ('articleSection', ''), ('authors', u'Willemsen, Mich\\\\xe9l A; Verbeek, Marcel M'), ('authorEmails', ''), ('authorAffiliations', 'Radboud University Nijmegen Medical Centre, Donders Institute for Brain, Cognition and Behaviour, Department of Paediatric Neurology (820 IKNC), PO Box 9101, 6500 HB Nijmegen, The Netherlands. [email protected]'), ('keywords', 'Age of Onset/Useless Research'), ('title', 'Tyrosine hydroxylase deficiency: a treatable disorder of brain catecholamine biosynthesis.'), ('abstract', 'An infantile onset, progressive, hypokinetic-rigid syndrome with dystonia (type A), and a complex encephalopathy with neonatal onset (type B). Decreased cerebrospinal fluid concentrations of homovanillic acid and c.698G>A and c.707T>C mutations. Carriership of at least one promotor mutation, however, apparently predicts type A tyrosine hydroxylase deficiency. Most patients with tyrosine hydroxylase deficiency can be successfully treated with l-dopa.'), ('vol', '133'), ('issue', 'Pt 6'), ('page', '1810-22'), ('pmid', '20430833'), ('pmcId', ''), ('doi', ''), ('fulltextUrl', 'http://www.ncbi.nlm.nih.gov/pubmed/20430833')])"

    """
    data = pubStore.createEmptyArticleDict()
    # medlineData           = xmlParser.getXmlFirst("MedlineCitation")
    medlineData = xmlParser
    data["pmid"] = medlineData.getTextFirst("PMID")
    data["externalId"] = "PMID" + data["pmid"]
    data["fulltextUrl"] = "http://www.ncbi.nlm.nih.gov/pubmed/%s" % data["pmid"]
    logging.log(5, "PMID %s" % data["pmid"])
    # data["year-pubmed"]   = medlineData.getTextFirst("DateCreated/Year")
    # data["month-pubmed"]  = medlineData.getTextFirst("DateCreated/Month")
    # data["day-pubmed"]    = medlineData.getTextFirst("DateCreated/Day")
    otherIds = medlineData.getTextAll("OtherID", reqAttrDict={"Source": "NLM"})
    pmcIds = [i for i in otherIds if i.startswith("PMC")]
    if len(pmcIds) > 0:
        data["pmcId"] = pmcIds[0].split()[0].replace("PMC", "")

    artTree = medlineData.getXmlFirst("Article")
    data["title"] = artTree.getTextFirst("ArticleTitle", default="")

    # handle structured abstracts
    abstractParts = []
    abstractTrees = artTree.getXmlAll("Abstract/AbstractText")
    for aEl in abstractTrees:
        label = aEl.getAttr("NlmCategory")
        abstract = ""
        if label != None:
            abstract = "<p>%s</p> " % label
        abstract += aEl.getText()
        abstractParts.append(abstract)
    data["abstract"] = "".join(abstractParts)

    if data["abstract"] == "":
        data["abstract"] = artTree.getTextFirst("OtherAbstract/AbstractText", default="")

    data["authorAffiliations"] = artTree.getTextFirst("Affiliation", default="")
    data["doi"] = artTree.getTextFirst("ELocationID", default="", reqAttrDict={"EIdType": "doi"})

    data["journalUniqueId"] = medlineData.getTextFirst("MedlineJournalInfo/NlmUniqueID")
    linkingIssn = medlineData.getTextFirst("MedlineJournalInfo/ISSNLinking")

    journalTree = artTree.getXmlFirst("Journal")
    data["eIssn"] = journalTree.getTextFirst("ISSN", reqAttrDict={"IssnType": "Electronic"}, default="")
    data["printIssn"] = journalTree.getTextFirst("ISSN", reqAttrDict={"IssnType": "Print"}, default="")
    if linkingIssn != None:
        data["eIssn"] = linkingIssn
        data["printIssn"] = linkingIssn

    data["vol"] = journalTree.getTextFirst("JournalIssue/Volume", default="")
    data["issue"] = journalTree.getTextFirst("JournalIssue/Issue", default="")
    data["year"] = journalTree.getTextFirst("JournalIssue/PubDate/Year", default="")
    if data["year"] == "":
        year = journalTree.getTextFirst("JournalIssue/PubDate/MedlineDate", default="").split()[0]
        if not year.isdigit():
            year = ""
        data["year"] = year
    data["journal"] = journalTree.getTextFirst("Title", default="")
    data["page"] = artTree.getTextFirst("Pagination/MedlinePgn", default="")

    authorList = artTree.getXmlFirst("AuthorList")
    lastNames = []
    initialList = []
    if authorList != None:
        authorTrees = authorList.getXmlAll("Author")
        for authorTree in authorTrees:
            lastName = authorTree.getTextFirst("LastName", default="")
            if lastName == "":
                lastName = authorTree.getTextFirst("CollectiveName", default="")
            lastNames.append(lastName)

            initials = authorTree.getTextFirst("ForeName", default="")
            if initials == "":
                initials = authorTree.getTextFirst("Initials", default="")
            initialList.append(initials)

    authors = [lastNames[i] + ", " + initialList[i] for i in range(0, min(len(lastNames), len(initialList)))]
    data["authors"] = "; ".join(authors)

    articleTypeList = artTree.getTextAll("PublicationTypeList/PublicationType")
    articleTypesString = ",".join(articleTypeList)

    articleType = "research-article"

    if "Review" in articleTypeList:
        articleType = "review"
    if "letter" in articleTypeList:
        articleType = "research-article"

    noResearchArticleTags = [
        "Bibliography",
        "Biography",
        "Case Reports",
        "Webcasts",
        "Dictionary",
        "Directory",
        "Editorial",
        "Festschrift",
        "Patient Education Handout",
        "Periodical Index",
        "Portraits",
        "Published Erratum",
        "Scientific Integrity Review" "Congresses",
    ]

    for noResearchArticleTag in noResearchArticleTags:
        if noResearchArticleTag in articleTypeList:
            articleType = "other"

    data["articleType"] = articleType
    # data["pubmedArticleTypes"] = articleTypesString

    logging.log(5, "pubmedArticleTypes %s, articleType %s" % (articleTypesString, articleType))

    meshDescriptors = []
    meshHeadingList = medlineData.getXmlFirst("MeshHeadingList", default="")
    if meshHeadingList:
        # for meshHeadingDescriptor in meshHeadingList.getTextAll("MeshHeading/DescriptorName", reqAttrDict={"MajorTopicYN":"Y"}):
        for meshHeadingDescriptor in meshHeadingList.getTextAll("MeshHeading/DescriptorName"):
            meshDescriptors.append(meshHeadingDescriptor.strip())

    data["keywords"] = "/".join(meshDescriptors)

    # remove these annoying linebreaks!
    filtData = {}
    for key, val in data.iteritems():
        filtData[key] = val.replace(u"\u2028", " ")
    return filtData
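
Structured abstracts arrive as several AbstractText elements, each tagged with an NlmCategory; the loop above prefixes every part with its label before joining. A stand-alone sketch of that step, again with plain lxml instead of maxXml:

# Sketch of joining a structured abstract, mirroring the loop above.
import lxml.etree

xml = """<Abstract>
  <AbstractText NlmCategory="BACKGROUND">Some background.</AbstractText>
  <AbstractText NlmCategory="RESULTS">Some results.</AbstractText>
</Abstract>"""

abstractParts = []
for aEl in lxml.etree.fromstring(xml).findall("AbstractText"):
    label = aEl.get("NlmCategory")
    part = ""
    if label is not None:
        part = "<p>%s</p> " % label
    part += aEl.text or ""
    abstractParts.append(part)
abstract = "".join(abstractParts)
# '<p>BACKGROUND</p> Some background.<p>RESULTS</p> Some results.'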
Example no. 10
0
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ 
    get files from inIndexFile, parse Xml, 
    write everything to outfile in ascii format
    """
    diskDir = abspath(join(zipDir, "..", "disk"))

    store = pubStore.PubWriterFile(outFile)

    # read all already done IDs
    doneIds = parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile), basename(outFile).split(".")[0] + ".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\tpmid\n")

    pmidFinder = pubCompare.PmidFinder()

    unzipTmp = pubGeneric.makeTempDir(prefix="pubConvSpringerUnzip", tmpDir=pubConf.getFastTempDir())
    maxCommon.delOnExit(unzipTmp)

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    pdfNotFound = 0
    for row in inRows:
        # read line
        i += 1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename

        if u"\xbf" in filename:
            logging.info("Found weird character, skipping file")
            continue

        articleData = pubStore.createEmptyArticleDict(publisher="springer")
        if zipFilename == "":
            xmlString, pdfString = getDiskData(diskDir, filename)
            articleData["origFile"] = filename
        else:
            xmlString, pdfString = getUpdateData(unzipTmp, zipDir, zipFilename, filename)
            articleData["origFile"] = zipFilename + ":" + filename

        if pdfString == None:
            pdfNotFound += 1
            logging.error("Could not open pdf or xml file")
            continue

        articleId = int(articleId)

        # parse xml
        logging.debug("Parsing XML")
        try:
            xmlTree = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
            continue

        articleData = parseXml(xmlTree, articleData)

        if articleData == None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        if articleData["doi"] in doneIds:
            logging.error("article %s has already been converted, skipping" % articleData["doi"])
            continue

        articleData["pmid"] = pmidFinder.lookupPmid(articleData)
        articleData["origFile"] = zipFilename + "/" + filename
        articleData["externalId"] = articleData["doi"]

        # convert pdf to ascii
        fileData = createFileData(articleData, "application/pdf", pdfString)
        logging.debug("converting pdf to ascii")
        pubGeneric.toAscii(fileData, "application/pdf")

        # write to output
        store.writeArticle(articleId, articleData)
        store.writeFile(articleId, (1000 * (articleId)) + 1, fileData, externalId=articleData["externalId"])

        # write IDs to separate file
        idRow = [str(articleData["articleId"]), articleData["doi"], str(articleData["pmid"])]
        idFh.write("\t".join(idRow))
        idFh.write("\n")

        doneIds.add(articleData["doi"])

        convCount += 1
    logging.info("Converted %d files, pdfNotFound=%d" % (convCount, pdfNotFound))
    store.close()
    idFh.close()
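
pubGeneric.makeTempDir and maxCommon.delOnExit are not shown in these examples. Their effect — a scratch directory for the unzip step that is removed when the process exits — can be approximated with the standard library:

# Standard-library approximation of makeTempDir + delOnExit (the real helpers are not shown).
import atexit
import shutil
import tempfile

def makeSelfCleaningTempDir(prefix="pubConvSpringerUnzip"):
    " create a temporary directory and register it for removal at interpreter exit "
    tmpDir = tempfile.mkdtemp(prefix=prefix)
    atexit.register(shutil.rmtree, tmpDir, True)  # True = ignore_errors
    return tmpDir

unzipTmp = makeSelfCleaningTempDir()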
Example no. 11
0
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ 
    get files from inIndexFile, parse Xml, 
    write everything to outfile in ascii format
    """ 
    diskDir = abspath(join(zipDir, "..", "disk"))

    store = pubStore.PubWriterFile(outFile)

    # read all already done IDs
    doneIds = parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile), basename(outFile).split(".")[0]+".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\tpmid\n")

    pmidFinder = pubCompare.PmidFinder()

    unzipTmp = pubGeneric.makeTempDir(prefix="pubConvSpringerUnzip", tmpDir=pubConf.getFastTempDir())
    maxCommon.delOnExit(unzipTmp)

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    pdfNotFound = 0
    for row in inRows:
        # read line
        i+=1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename

        if u'\xbf' in filename:
            logging.info("Found weird character, skipping file")
            continue
        
        articleData = pubStore.createEmptyArticleDict(publisher="springer")
        if zipFilename=="":
            xmlString, pdfString = getDiskData(diskDir, filename)
            articleData["origFile"] = filename
        else:
            xmlString, pdfString = getUpdateData(unzipTmp, zipDir, zipFilename, filename)
            articleData["origFile"] = zipFilename+":"+filename

        if pdfString==None:
            pdfNotFound+=1
            logging.error("Could not open pdf or xml file")
            continue

        articleId=int(articleId)

        # parse xml
        logging.debug("Parsing XML")
        try:
            xmlTree   = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
            continue

        articleData = parseXml(xmlTree, articleData)

        if articleData==None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        if articleData["doi"] in doneIds:
            logging.error("article %s has already been converted, skipping" % articleData["doi"])
            continue

        articleData["pmid"] = pmidFinder.lookupPmid(articleData)
        articleData["origFile"]=zipFilename+"/"+filename
        articleData["externalId"]=articleData["doi"]

        # convert pdf to ascii
        fileData = createFileData(articleData, "application/pdf", pdfString)
        logging.debug("converting pdf to ascii")
        pubGeneric.toAscii(fileData, "application/pdf")

        # write to output
        store.writeArticle(articleId, articleData)
        store.writeFile(articleId, (1000*(articleId))+1, fileData, externalId=articleData["externalId"])

        # write IDs to separate file 
        idRow = [str(articleData["articleId"]), articleData["doi"], str(articleData["pmid"])]
        idFh.write("\t".join(idRow))
        idFh.write("\n")

        doneIds.add(articleData["doi"])

        convCount += 1
    logging.info("Converted %d files, pdfNotFound=%d" % (convCount, pdfNotFound))
    store.close()
    idFh.close()