Example #1
0
    def wrapInSciDoc(self, contexts, doc_from_id, doc_to_id):
        """
            Wrap a list of citation contexts in a SciDoc ready to be passed
            to the standard context_extract functions, where each context
            becomes one paragraph under a single "root" section.

            Args:
                contexts: list of context dicts; each has a "lines" list of
                    dicts with at least "text" and "sentiment" keys
                doc_from_id: corpus_id of this SciDoc (the citing document)
                doc_to_id: corpus_id of the target (cited) document
            Returns:
                SciDoc with one reference, and a citation added to every
                sentence whose text matches CIT_MARKER
            Raises:
                ValueError: if doc_to_id is not found in the corpus
        """
        newDocument=SciDoc()
        metadata=cp.Corpus.getMetadataByField("metadata.corpus_id",doc_from_id)
        if metadata:
            newDocument.loadExistingMetadata(metadata)
        else:
            # Not in the corpus: fall back to using the corpus_id as the guid
            newDocument.metadata["guid"]=doc_from_id
        # Hoisted out of both branches: either path must leave a non-empty guid
        assert newDocument.metadata["guid"] != ""

        newDocument.metadata["corpus_id"]=doc_from_id

        newSection_id=newDocument.addSection("root", "", 0)

        metadata=cp.Corpus.getMetadataByField("metadata.corpus_id",doc_to_id)
        if not metadata:
            # (removed unreachable `return` that followed this raise)
            raise ValueError("Target document %s is not in corpus!" % doc_to_id)

        ref=newDocument.addExistingReference(metadata)
        ref["corpus_id"]=doc_to_id

        for context in contexts:
            newPar_id=newDocument.addParagraph(newSection_id)
            for line in context["lines"]:
                newSent_id=newDocument.addSentence(newPar_id)
                text=line["text"]
                citations=[]
                if re.search(CIT_MARKER,text):
                    # Replace the raw citation marker with the in-document
                    # citation form pointing at the newly created citation
                    newCit=newDocument.addCitation(newSent_id, ref["id"])
                    text=re.sub(CIT_MARKER, CITATION_FORM % newCit["id"], text)
                    citations.append(newCit["id"])

                sent=newDocument.element_by_id[newSent_id]
                sent["sentiment"]=line["sentiment"]
                sent["text"]=text
                if citations:
                    sent["citations"]=citations

        return newDocument
Example #2
0
def loadAZSciXML(filename):
    """
        Load a Cambridge-style SciXML file into a new SciDoc.

        Args:
            filename: path to the SciXML file to load
        Returns:
            SciDoc populated with metadata, references, abstract (if any),
            body sections from <div> elements, and citations matched to
            references; on a missing <PAPER> element, a SciDoc containing
            only an error debug message
    """
    text=loadFileText(filename)
    soup=BeautifulStoneSoup(text)

    fileno=soup.find("docno")
    fileno=fileno.text if fileno else ""

    # Create a new SciDoc to store the paper
    newDocument=SciDoc()
    newDocument["metadata"]["filename"]=os.path.basename(filename)
    newDocument["metadata"]["filepath"]=filename

    paper=soup.find("paper")
    if not paper:
        debugAddMessage(newDocument,"error","NO <PAPER> IN THIS PAPER! file: "+filename)
        return newDocument

    # Load metadata from the file.
    # NOTE: a previous corpus-index lookup path was disabled (metadata was
    # hard-set to None), so the corpus branch was dead code and was removed.
    loadMetadata(newDocument, paper, fileno, soup)

    newDocument["metadata"]["guid"]=cp.Corpus.generateGUID(newDocument["metadata"])

    # Load all references (at the end of the document) from the XML
    for ref in soup.findAll("reference"):
        processReferenceXML(ref, newDocument)

    # Load Abstract
    abstract=soup.find("abstract")
    if not abstract:
        debugAddMessage(newDocument,"error","CANNOT LOAD ABSTRACT! file: "+ filename+"\n")
        # TODO: LOAD first paragraph as abstract
    else:
        newSection_id=newDocument.addSection("root","Abstract")
        newPar_id=newDocument.addParagraph(newSection_id)

        for s in abstract.findChildren("a-s"):
            # deals with all of the adding of a sentence
            addNewSentenceAndProcessRefs(s, newDocument, newPar_id, newSection_id)

        newDocument.abstract=newDocument.element_by_id[newSection_id]

    # Load the document body structure from <div> elements
    for div in soup.findAll("div"):
        loadStructureProcessDiv(div, newDocument)

    # try to match each citation with its reference
    matchCitationsWithReferences(newDocument)

# "in press", "forthcoming", "submitted", "to appear" = dates to fix & match
# Does not work for: unicode
##    for ref in newDocument["references"]:
##        k=ref.get("AZ",["NO AZ"])
##        print k, most_common(k)

    return newDocument