Example #1
0
def generateDocBOW_ILC_Annotated(doc_incoming, parameters, doctext=None, full_paragraph=True):
    """
        Create a BOW from all the inlink contexts of a given document.

        For every document that cites `doc_incoming`, collects the sentences
        selected around each citation to it and groups their text by the
        sentence's "az" and "csc_type" annotations.

        Args:
            doc_incoming: for compatibility, any object exposing
                .metadata["guid"] (e.g. a SciDoc)
            parameters: iterable of hashable context-extraction settings.
                Each one is passed as-is to selectSentencesToAdd() and is
                used as a key in the returned mapping.
            doctext: ignored; kept for signature compatibility (the local
                is overwritten for every citing document)
            full_paragraph: not used by this function; kept for
                signature compatibility

        Returns:
            defaultdict(list) mapping each entry of `parameters` to a list
            with one context dict per matching citation. Each context has
            a key "ilc_AZ_<zone>" for every zone in AZ_ZONES_LIST and
            "ilc_CSC_<zone>" for every zone in CORESC_LIST (space-joined
            sentence text, "" when empty), plus "guid_from" and
            "year_from" taken from the citing document's metadata.

        Raises:
            KeyError: if a sentence carries an "az" or "csc_type" value
                that has no matching pre-created context key.
    """
    doc_incoming_guid = doc_incoming.metadata["guid"]

    # Pre-create an entry per parameter so that each one is present in the
    # result even when the document has no inlinks.
    all_contexts = defaultdict(list)
    for param in parameters:
        all_contexts[param] = []

    doc_metadata = cp.Corpus.getMetadataByGUID(doc_incoming_guid)
    print("Building VSM representations for ", doc_metadata["guid"], ":", len(doc_metadata["inlinks"]), "incoming links")

    for inlink_guid in doc_metadata["inlinks"]:
        # loads from cache if exists, XML otherwise
        docfrom = cp.Corpus.loadSciDoc(inlink_guid)

        # important! the doctext here has to be that of the docfrom, NOT doc_incoming
        # NOTE(review): the value is not read anywhere below; the call is
        # kept in case it has side effects on docfrom -- confirm and remove.
        doctext = docfrom.getFullDocumentText()
        ref_id = identifyReferenceLinkIndex(docfrom, doc_incoming_guid)

        print("Document with incoming citation links loaded:", docfrom["metadata"]["filename"])

        # The citations pointing at doc_incoming do not depend on the
        # extraction parameter, so select them once per citing document.
        citations = [cit for cit in docfrom["citations"] if cit["ref_id"] == ref_id]
        guid_from = docfrom["metadata"]["guid"]
        year_from = docfrom["metadata"]["year"]

        for param in parameters:
            for cit in citations:
                to_add = selectSentencesToAdd(docfrom, cit, param)

                # One empty text bucket per known AZ zone and CoreSC zone.
                context = {"ilc_AZ_" + zone: "" for zone in AZ_ZONES_LIST}
                for zone in CORESC_LIST:
                    context["ilc_CSC_" + zone] = ""

                for sent_id in to_add:
                    sent = docfrom.element_by_id[sent_id]
                    text = formatSentenceForIndexing(sent)

                    # Sentences without an "az" label are left out of the AZ buckets.
                    az = sent.get("az", "")
                    if az != "":
                        context["ilc_AZ_" + az] += " " + text

                    # Sentences without a "csc_type" are filed under "Bac".
                    # NOTE(review): this writes the default back onto the
                    # loaded sentence, as the original code did.
                    if "csc_type" not in sent:
                        sent["csc_type"] = "Bac"
                    context["ilc_CSC_" + sent["csc_type"]] += " " + text

                context["guid_from"] = guid_from
                context["year_from"] = year_from
                all_contexts[param].append(context)

    return all_contexts