class PaperXMLReader(BaseSciDocXMLReader):
    """
    Reader class for Paper/NLM XML.

    The main entry point is read().
    """
    def __init__(self, parscit_api_url="http://127.0.0.1:5123/parscit/"):
        self.parscit_client = ParsCitClient(parscit_api_url)

    def cleanUpPaperXML(self, xml):
        """
        Cleans up some messy stuff in PaperXML.
        """
        # remove badly processed end-of-line hyphens (soft hyphen, U+00AD)
        xml = xml.replace(u"\xad", "")
        xml = normalizeUnicode(xml)
        return xml

    def cleanUpReferencesText(self, reftext):
        """
        Same as cleanUpPaperXML, but called only on the references text,
        so it takes less time.
        """
        reftext = re.sub(r"Proceedings", "Proceedings", reftext)
        reftext = re.sub(r"[Ii]n\s?\s?[Pp]roceedings\s+?of\s+?", "In Proceedings of ", reftext)
        reftext = re.sub(r"([Ii]n)([A-Z\d])", r"\1 \2", reftext)

        # Break reference lines after "In proceedings of [x]. Name, Name, date..."
        ## reftext=re.sub(r"((ceedings\s+of.{4,40}|pages.{4,12})\.\s*)([A-Z][a-z]{1,14}\s+[A-Z].{1,14})",r"\1 \n\n \3",reftext)

        # add space after commas; seems to affect conversion quality a lot
        reftext = re.sub(r"([,;\"])([^\s])", r"\1 \2", reftext)
        # make sure there's a space after a full stop
        reftext = re.sub(r"([a-z\?\!]\.)([A-Z])", r"\1 \2", reftext)
        # make sure there's a space after a date's dot, parenthesis, or lack of space
        reftext = re.sub(r"((?:19|20)[0-9][0-9][a-g]?|in\spress|to\sappear|forthcoming|submitted)(\)?\.?)([\w\&;\"\'])",
                         r"\1\2 \3", reftext)
        # Normalize line breaks
        reftext = re.sub(r"\n\r?\s?\n\r?\s?\n\r?\s?\n\r?\s?", r"\n\n", reftext)

        # Break apart several references on the same line,
        # using apa_author and apa_year_num
        reftext = re.sub(
            r"(\w{2,150}\.) ((?:(?:(?:de |von |van )?[A-Z][A-Za-z'`-]+, [A-Z]\.) (?:and )?)+\((?:(?:19|20)[0-9][0-9][a-g]?|in\spress|to\sappear|forthcoming|submitted)\))",
            r"\1 \n\n \2", reftext)
        # similar to the above, different format
        reftext = re.sub(
            r"(\w{2,150}\.) ?((?:(?:(?:de |von |van )?[A-Z][A-Za-z'`-]+[ \.])+(?:and )?)+\s?\(?(?:(?:19|20)?[0-9][0-9][a-g]?|in\spress|to\sappear|forthcoming|submitted)\)?)",
            r"\1 \n\n \2", reftext)

        ## ref_lines=re.split(r"\n\r?\n\r?",reftext)
        ## print(reftext)
        return reftext
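    # For example (illustrative input, not from a real file), a run-together
    # reference string like
    #     u"J. Smith.In Proceedings of ACL,pages 1-10."
    # should come back from cleanUpReferencesText() as
    #     u"J. Smith. In Proceedings of ACL, pages 1-10."
    # with the missing spaces after the full stop and the comma restored.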
    def loadPaperMainAuthorXML(self, author_node):
        """
        Returns BibJSON-compatible author info.

        Args:
            author_node: XML node
        Returns:
            dict with author data
        """
        res = {"family": author_node.get("surname", ""),
               "given": author_node.get("givenname", "")}
        orgs = author_node.findAll("org")
        if orgs:
            res["affiliation"] = []
            for org in orgs:
                location = ",".join([org.get("city", ""), org.get("country", "")]).strip(",")
                res["affiliation"].append({"name": org.get("name", ""), "location": location})
        res = normalizeAuthor(res)
        return res

    def loadPaperMetadata(self, newDocument, soup, filename):
        """
        Tries to recover metadata from the Paper file.
        """
        header = soup.find("firstpageheader")
        if header:
            title = header.find("title")
            if title:
                newDocument.metadata["title"] = title.text

        # the BibTeXML metadata lives next to the paper: X-paper.xml -> X.xml
        path, fname = os.path.split(filename)
        metafilename = re.sub(r"(.*)-paper\.xml", r"\1.xml", fname, flags=re.IGNORECASE)
        metafilename = os.path.join(path, metafilename)

        self.bibtex_parser = BibTeXMLParser()
        ## print("trying to load BibTeXML from ", metafilename)
        bib_data = None
        try:
            bib_data = self.bibtex_parser.parse_file(metafilename)
        except BibliographyDataError as e:
            print(e)
        except Exception:
            print("COULDN'T LOAD BIBTEXML FOR ", metafilename)

        if bib_data:
            entry = bib_data.entries[list(bib_data.entries.keys())[0]]
            for field in entry.fields:
                # replace en-dashes with plain hyphens
                newDocument.metadata[field] = entry.fields[field].replace(u"\u2013", u"-")

        authors = []
        if header:
            for a in header.findChildren("author"):
                authors.append(self.loadPaperMainAuthorXML(a))

        newDocument["metadata"]["authors"] = authors
        newDocument["metadata"]["surnames"] = [a["family"] for a in authors]
        newDocument["metadata"]["norm_title"] = normalizeTitle(newDocument["metadata"].get("title", ""))
        ## print (json.dumps(newDocument.metadata),"\n\n")

    def loadPaperAbstract(self, soup, newDocument):
        """
        Loads the abstract, including sections.
        """
        abstract_node = soup.find("abstract")
        if not abstract_node:
            debugAddMessage(newDocument, "error",
                            "CANNOT LOAD ABSTRACT! file: %s\n" % newDocument.metadata.get("filename", "None"))
            # !TODO: LOAD first paragraph as abstract if no abstract available?
        else:
            abstract = newDocument.addSection("root", "Abstract")
            paras = abstract_node.findAll("p")
            if len(paras) == 0:
                # no <p> children: treat the abstract node itself as one paragraph
                paras.append(abstract_node)
            for p in paras:
                self.loadPaperParagraph(p, newDocument, abstract["id"])
            newDocument.abstract = abstract
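    # Illustrative shape of the dicts loadPaperMainAuthorXML() builds (the
    # values are invented; they come from the <author> node's attributes and
    # its <org> children):
    #     {"family": "Smith", "given": "Jane",
    #      "affiliation": [{"name": "Some University", "location": "Sometown,UK"}]}
    # Note that the location string is city and country joined by a bare comma.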
    def loadPaperSection(self, sec, newDocument, parent):
        """
        Gets called for each section.

        Args:
            sec: XML node
            newDocument: SciDoc
            parent: id of this section's parent in newDocument
        """
        header_id = 0  # CHANGE
        header_text = sec.get("title", "")
        # make sure the first letter is capitalized
        if len(header_text) > 0:
            header_text = header_text[0].upper() + header_text[1:]

        newSection = newDocument.addSection(parent, header_text, header_id)
        contents = sec.findChildren(["subsection", "p", "figure"], recursive=False)
        if contents:
            for element in contents:
                if element.name == "subsection":
                    self.loadPaperSection(element, newDocument, newSection["id"])
                elif element.name == "p":
                    newPar = self.loadPaperParagraph(element, newDocument, newSection["id"])
                elif element.name == "figure":
                    # store figure captions as single-sentence paragraphs
                    newPar = newDocument.addParagraph(newSection["id"])
                    newSent = newDocument.addSentence(newPar["id"], "")
                    newSent["text"] = element.get("caption", "")
                    newSent["type"] = "fig-caption"
                    # TODO improve figure loading

    def loadPaperSentence(self, s, newDocument, parent):
        """
        Given a string, adds the sentence to the SciDoc, parses the citations
        and matches them with the references.

        Args:
            s: string
            newDocument: SciDoc
            parent: id of the element this sentence will hang from (p)
        """
        newSent = newDocument.addSentence(parent, s)
        annotatePlainTextCitationsInSentence(newSent, newDocument)

    def loadPaperParagraph(self, p, newDocument, parent_id):
        """
        Creates a paragraph in newDocument, splits the text into sentences
        and creates a sentence object for each.
        """
        if p.parent.name == "td":
            # not a content paragraph, but the content of a table cell
            return None

        par_text = p.renderContents(encoding=None)
        if re.match(r"(<i>)?proceedings\s+of\s+the\s+.*", par_text, flags=re.IGNORECASE):
            # not a content paragraph; throw it away
            return None

        newPar = newDocument.addParagraph(parent_id)
        try:
            sentences = sentenceSplit(par_text)
        except Exception:
            print("UNICODE ERROR!", par_text)
            sentences = [par_text]

        for s in sentences:
            self.loadPaperSentence(s, newDocument, newPar["id"])
        return newPar

    def loadPaperReferences(self, ref_section, doc):
        """
        Loads the reference section.

        Args:
            ref_section: XML node for the references element
            doc: SciDoc instance we're loading this for
        """
        all_elements = ref_section.findAll(["p", "doubt"])
        for index, element in enumerate(all_elements):
            if element.name == "doubt":
                # merge stray <doubt> fragments into the following element,
                # or into the previous one if this is the last element
                if len(all_elements) > index + 1:
                    all_elements[index + 1].string = element.text + " " + all_elements[index + 1].text
                elif index > 0:
                    all_elements[index - 1].string = all_elements[index - 1].text + " " + element.text

        plain_text = []
        for element in ref_section.findAll(["p"]):
            plain_text.append(re.sub(r"</?i>", " ", element.text))

        reftext = "\n\n".join(plain_text)
        # clean up the terrible references text: normalize spaces, commas, etc.
        reftext = self.cleanUpReferencesText(reftext)

        if plain_text == []:
            print("WARNING: NO REFERENCES! in ", doc.metadata.get("filename", ""))
        else:
            parsed_refs = self.parscit_client.extractReferenceList(reftext)
            if parsed_refs:
                for ref in parsed_refs:
                    doc.addExistingReference(ref)
            else:
                raise ValueError("Couldn't parse references! in %s" % doc.metadata.get("filename", ""))
            # TODO integrate FreeCite/others
        ## print (json.dumps(doc.references))
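    # Each item returned by the ParsCit service and passed to
    # doc.addExistingReference() above is a parsed-reference dict. The exact
    # keys depend on the service, but it is along these (hypothetical) lines:
    #     {"authors": [{"family": "Smith", "given": "J."}],
    #      "title": "Some Title", "year": "1999", "text": "<the raw string>"}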
    def read(self, xml, identifier):
        """
        Loads a PaperXML into a SciDoc.

        Args:
            xml: full XML string
            identifier: an identifier for this document, e.g. the file name.
                Important: supply an actual path so that we can check for the
                metadata in BibTeXML. If it is a full path, the path will be
                removed from it when stored.
        Returns:
            SciDoc instance
        """
        ## # this solves a "bug" in BeautifulStoneSoup with "sec" tags
        ## BeautifulStoneSoup.NESTABLE_TAGS["sec"]=[]

        xml = self.cleanUpPaperXML(xml)
        soup = BeautifulSoup(xml, "xml")

        # Create a new SciDoc to store the paper
        newDocument = SciDoc()
        metadata = newDocument["metadata"]
        metadata["filename"] = os.path.basename(identifier)

        ## if not citation_style:
        ##     raise ValueError("Cannot determine citation style")
        # default citation style if not otherwise detected
        ## citation_style="APA"

        body = soup.find("body")
        if not body:
            # TODO: Make the error handling less terrible
            debugAddMessage(newDocument, "error", "NO <BODY> IN THIS PAPER! file: " + identifier)
            ## newDocument["metadata"]["guid"]=cp.Corpus.getFileUID(metadata["filename"])
            return newDocument

        # Load metadata, either from corpus or from file
        self.loadPaperMetadata(newDocument, soup, identifier)
        if not metadata.get("surnames"):
            debugAddMessage(newDocument, "error", "NO SURNAMES OF AUTHORS file: " + identifier)
            return newDocument
        if not metadata.get("title"):
            debugAddMessage(newDocument, "error", "NO TITLE file: " + identifier)
            return newDocument

        metadata["guid"] = cp.Corpus.generateGUID(metadata)

        # Load all references from the XML
        references = body.find("references")
        if references:
            self.loadPaperReferences(references, newDocument)
        newDocument.updateReferences()
        ## print (newDocument.references)
        ## print("\n\n")

        sections = body.findChildren("section", recursive=False)
        detect_style_text = "".join([sec.renderContents() for sec in sections[:3]])
        ## citation_style = detectCitationStyle(detect_style_text, default="APA")
        # turns out I don't have a good detection algorithm for AFI
        citation_style = "APA"
        metadata["original_citation_style"] = citation_style

        # Load the abstract
        self.loadPaperAbstract(soup, newDocument)

        for sec in sections:
            self.loadPaperSection(sec, newDocument, "root")

        newDocument.updateReferences()
        newDocument.updateAuthorsAffiliations()
        return newDocument
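
# Minimal usage sketch (hypothetical paths and URL; assumes a ParsCit service
# is running at the given endpoint and that the BibTeXML metadata file
# "1234.xml" sits next to "1234-paper.xml"):
if __name__ == "__main__":
    import codecs

    reader = PaperXMLReader(parscit_api_url="http://127.0.0.1:5123/parscit/")
    with codecs.open("/path/to/1234-paper.xml", "r", encoding="utf-8") as f:
        doc = reader.read(f.read(), "/path/to/1234-paper.xml")

    print(doc.metadata.get("title"))
    print(len(doc["references"]), "references loaded")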