Ejemplo n.º 1
0
def generateDocBOW_ILC_Annotated(doc_incoming, parameters, doctext=None, full_paragraph=True):
    """
        Create a BOW from all the inlink contexts of a given document.

        For every document listed in the "inlinks" metadata of `doc_incoming`,
        the sentences selected around each citation of `doc_incoming` are
        concatenated into per-label text fields: "ilc_AZ_<zone>" for the
        sentence's "az" value and "ilc_CSC_<zone>" for its "csc_type" value.

        Args:
            doc_incoming: for compatibility, SciDoc or dict with .metadata["guid"]
            parameters: iterable of hashable values. Each one is passed as the
                third argument to selectSentencesToAdd() and used as a key in
                the result. (The original note suggested something like
                {"full_paragraph":True,"sent_left":1, "sent_right":1} --
                TODO confirm the expected shape with the callers.)
            doctext: ignored on input; the local name is overwritten for every
                citing document.
            full_paragraph: not used by this function.

        Returns:
            A defaultdict(list) mapping each value in `parameters` to a list of
            context dicts, one per matching citation. Each context dict has
            one "ilc_AZ_*" key per entry of AZ_ZONES_LIST, one "ilc_CSC_*" key
            per entry of CORESC_LIST, plus "guid_from" and "year_from" copied
            from the citing document's metadata.

        Note:
            Sentences without a "csc_type" key get "csc_type" set to "Bac" on
            the sentence object itself, i.e. the loaded document is modified.
    """
    doc_incoming_guid=doc_incoming.metadata["guid"]
    all_contexts=defaultdict(list)
    # make sure every parameter has an entry, even if there are no inlinks
    for param in parameters:
        all_contexts[param]=[]

    doc_metadata=cp.Corpus.getMetadataByGUID(doc_incoming_guid)
    print("Building VSM representations for ", doc_metadata["guid"], ":", len(doc_metadata["inlinks"]), "incoming links")

    # Empty template for one context; copied for every citation so that the
    # key order (AZ zones first, then CoreSC zones) stays the same.
    empty_context={"ilc_AZ_"+zone:"" for zone in AZ_ZONES_LIST}
    for zone in CORESC_LIST:
        empty_context["ilc_CSC_"+zone]=""

    for inlink_guid in doc_metadata["inlinks"]:
        # loads from cache if exists, XML otherwise
        docfrom=cp.Corpus.loadSciDoc(inlink_guid)
##        cp.Corpus.annotateDoc(docfrom,["AZ"])

        # important! the doctext here has to be that of the docfrom, NOT doc_incoming
        # NOTE(review): the value is not used further down in this function;
        # the call is kept in case getFullDocumentText() has side effects.
        doctext=docfrom.getFullDocumentText()
        ref_id=identifyReferenceLinkIndex(docfrom, doc_incoming_guid)

        print("Document with incoming citation links loaded:", docfrom["metadata"]["filename"])

        # The matching citations do not depend on the parameter, so they are
        # collected once per citing document instead of once per parameter.
        citations=[cit for cit in docfrom["citations"] if cit["ref_id"] == ref_id]

        for param in parameters:
            for cit in citations:
                to_add=selectSentencesToAdd(docfrom,cit,param)

                context=dict(empty_context)

                for sent_id in to_add:
                    sent=docfrom.element_by_id[sent_id]
                    text=formatSentenceForIndexing(sent)

                    az_label=sent.get("az","")
                    if az_label != "":
                        context["ilc_AZ_"+az_label]+=" "+text

                    # sentences without a CoreSC label are filed under "Bac";
                    # the default is written back to the sentence, as before
                    if "csc_type" not in sent:
                        sent["csc_type"]="Bac"
                    context["ilc_CSC_"+sent["csc_type"]]+=" "+text

                context["guid_from"]=docfrom["metadata"]["guid"]
                context["year_from"]=docfrom["metadata"]["year"]
                all_contexts[param].append(context)

    #   this bit of code makes every entry a list for multiple representations from each document
##    for c in all_contexts:
##        all_contexts[c]=[all_contexts[c]]
    return all_contexts
Ejemplo n.º 2
0
def getDocBOWrandomZoning(doc, parameters=None, doctext=None, keys=["az","csc_type"]):
    """
        Get BOW for document with randomized AZ/CSC

        Every sentence in doc.allsentences is formatted for indexing and its
        text is appended (preceded by a single space) to two fields: one
        picked at random from AZ_ZONES_LIST and one picked at random from
        CORESC_LIST.

        Args:
            doc: object exposing .allsentences
            parameters: not used by this function
            doctext: passed through to addDocBOWFullTextField()
            keys: not used by this function

        Returns:
            {1:[bow]} where bow is a defaultdict of strings keyed by label,
            after addDocBOWFullTextField(doc, bow, doctext) has been applied.
    """
    bow=defaultdict(str)
    for sent in doc.allsentences:
        piece=" " + formatSentenceForIndexing(sent)
        # one random AZ label first, then one random CoreSC label
        for label_pool in (AZ_ZONES_LIST, CORESC_LIST):
            bow[random.choice(label_pool)]+=piece

    addDocBOWFullTextField(doc,bow,doctext)
    return {1:[bow]}
Ejemplo n.º 3
0
def getDocBOWannotated(doc, parameters=None, doctext=None, keys=["az","csc_type"]):
    """
        Get BOW for document with AZ/CSC

        For every sentence in doc.allsentences and every key in `keys` that
        the sentence contains, the formatted sentence text is filed under the
        sentence's value for that key. The texts collected for each label are
        then joined with single spaces.

        Args:
            doc: object exposing .allsentences; each sentence must support
                `in` and item access by key
            parameters: not used by this function
            doctext: passed through to addDocBOWFullTextField()
            keys: sentence keys whose values are used as labels; only read,
                never modified here

        Returns:
            {1:[res]} where res maps each label to its joined text, after
            addDocBOWFullTextField(doc, res, doctext) has been applied.
    """
    res=defaultdict(list)
    for sentence in doc.allsentences:
        text=formatSentenceForIndexing(sentence)
        for key in keys:
            # `in` works on Python 2 and 3; dict.has_key() no longer exists in Python 3
            if key in sentence:
                res[sentence[key]].append(text)
    # replace each list of texts with one space-separated string
    for label in res:
        res[label]=" ".join(res[label])
    addDocBOWFullTextField(doc,res,doctext)
    return {1:[res]}