Example #1
0
def generateSideBySide(doc_list):
    """
        Generates side-by-side visualizations of each JATS XML file: one using an
        XML-to-HTML converter, the other loading the XML into SciDocJSON and
        rendering it back as HTML
    """
    from subprocess import Popen
    from read_auto import AutoXMLReader

    reader=AutoXMLReader()
    output_dir=os.path.join(cp.Corpus.ROOT_DIR,"conversion_visualization\\")

    file_list=[]
    for filename in doc_list:
        print("Converting %s" % filename)
        input_file=cp.Corpus.paths.inputXML+filename
        output_file=output_dir+"%s_1.html" % os.path.basename(filename)

##        os_line="..\\..\\libs\\generate_jats_html.bat "+" "+input_file+" "+output_file
##        print(os_line)
##        p = Popen(os_line, cwd=r"..\..\libs")
##        stdout, stderr = p.communicate()

        doc=reader.readFile(input_file)
        try:
            json.dumps(doc.data)
        except:
            print("Not JSON Serializable!!!!")

        html=SciDocRenderer(doc).prettyPrintDocumentHTML(True,True,True, True)
        output_file2=output_file.replace("_1.html","_2.html")
        writeFileText(html,output_file2)
        file_list.append([os.path.basename(output_file),os.path.basename(output_file2)])

    file_list_json="file_data=%s;" % json.dumps(file_list)
    writeFileText(file_list_json,output_dir+"file_data.json")
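
# A minimal driver sketch for generateSideBySide() above; the directory listing and
# file count are illustrative, and cp.Corpus is assumed to be set up by the
# surrounding module, as in the function itself.
import os

jats_files = [f for f in os.listdir(cp.Corpus.paths.inputXML)
              if f.lower().endswith(".xml")]
# Each input yields <name>_1.html (converter output) and <name>_2.html
# (SciDocJSON round-trip), plus one file_data.json index for the viewer page.
generateSideBySide(jats_files[:20])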
Example #2
0
def explainAnchorTextZoning(guid, max_inlinks=10, use_full_text=False):
    """
        Generates a clipping-collection file containing all the citation contexts
        in which other files cite this file
    """
    meta=cp.Corpus.getMetadataByGUID(guid)
    all_html=["""<h1 class="title">%s</h1><span>Inlink context summary for %s</span>""" % (meta["title"],formatCitation(meta))]
    global CURRENT_CITATION
    CURRENT_CITATION=re.escape(formatCitation(meta))

    for index, link in enumerate(meta["inlinks"]):
        if index == max_inlinks:
            break
        print("Processing anchor text from %s" % link)
        doc=cp.Corpus.loadSciDoc(link)

        if not use_full_text:
            trimDocToRelevantBits(doc, guid)

        renderer=SciDocRenderer(doc)
        html=renderer.prettyPrintDocumentHTML(
            formatspans=True,
            include_bibliography=use_full_text,
            wrap_with_HTML_tags=False,
            extra_attribute_function=extraAttributes,
            citation_formatting_function=citationFormatting,
            reference_formatting_function=referenceFormatting)
        all_html.append(html)

    html=padWithHTML(" ".join(all_html))
    writeFileText(html,os.path.join(cp.Corpus.paths.output,guid+"_ilc_zoning.html"))
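
# A hedged sketch of how the pre-escaped CURRENT_CITATION global above could be used
# inside one of the formatting callbacks passed to the renderer; the helper name,
# signature and CSS class are hypothetical, not the project's actual callbacks.
import re

def highlightCurrentCitation(text):
    # Wrap every occurrence of the formatted citation (e.g. "Smith et al. (2010)")
    # in a span so it stands out in the generated clipping file.
    if not CURRENT_CITATION:
        return text
    return re.sub(CURRENT_CITATION,
                  lambda m: '<span class="current_citation">%s</span>' % m.group(0),
                  text)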
Example #3
0
def fixPaperReferences(annotated_file, pmc_file, pmc_id, original_text=None):
    """
        Replaces the <ref-list> section in `annotated_file` with that from
        `pmc_file`

        Checking that they are actually the same file is done outside.
    """
    annotated_text=loadFileText(annotated_file)
    if not original_text:
        original_text=loadFileText(pmc_file)

    try:
        orig_start, orig_end=selectRefListSection(original_text,  pmc_file, pmc_id)
    except ValueError:
        return

    original_refs=original_text[orig_start:orig_end]

    try:
        annot_start, annot_end=selectRefListSection(annotated_text, annotated_file, getFilePMCID(annotated_file))
    except ValueError:
        return

    new_annotated_text=annotated_text[:annot_start]+original_refs+annotated_text[annot_end:]
    writeFileText(new_annotated_text, annotated_file)
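
# A minimal usage sketch for fixPaperReferences() above; the file paths and PMC id
# are hypothetical placeholders, not taken from the corpus.
fixPaperReferences(
    annotated_file="annotated/PMC1234567.xml",   # placeholder path
    pmc_file="pmc_original/PMC1234567.nxml",     # placeholder path
    pmc_id="PMC1234567")                         # placeholder id
# On success the <ref-list> in annotated_file is overwritten in place with the one
# from pmc_file; if selectRefListSection raises ValueError, nothing is written.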
Example #4
0
def explainInFileZoning(guid):
    """
        Given a guid, generates an HTML visualization of the file's contents in
        which each sentence is tagged with its class
    """
    doc=cp.Corpus.loadSciDoc(guid)
    renderer=SciDocRenderer(doc)
    html=renderer.prettyPrintDocumentHTML(True,True, False, extra_attribute_function=extraAttributes, wrap_with_HTML_tags=False,
        reference_formatting_function=referenceFormatting)
    html=padWithHTML(html)
    writeFileText(html,os.path.join(cp.Corpus.paths.output,guid+"_text_zoning.html"))
Example #5
0
    def generateVisualizationFileList(self, files):
        """
            For each guid in the list, generates its _data.json file, then
            writes out the global file_data.json

            Args:
                files: list of guids
        """
        file_data=[]
        for index, guid in enumerate(files):
            print("Processing #%d - %s" % (index, guid))
            file_info=self.generateVisualizationOneFile(guid)
            file_data.append(file_info)

        json_str="file_data=" + json.dumps(file_data) + ";"
        writeFileText(json_str, self.output_dir+"file_data.json")
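
# file_data.json above is written as a JavaScript assignment ("file_data=[...];")
# rather than plain JSON, presumably so a static viewer page can load it with a
# <script> tag. A hedged sketch of reading it back from Python; the helper name is
# hypothetical.
import json

def loadFileData(path):
    # Strip the "file_data=" prefix and the trailing ";", then decode the JSON
    # payload: a list of per-file records (json_file, id, title, details).
    with open(path, encoding="utf-8") as f:
        text = f.read().strip()
    return json.loads(text[len("file_data="):].rstrip(";"))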
Example #6
0
def fixPaperReferences(annotated_file, pmc_file, pmc_id, original_text=None):
    """
        Replaces the <ref-list> section in `annotated_file` with that from
        `pmc_file`

        Checking that they are actually the same file is done outside.
    """
    annotated_text = loadFileText(annotated_file)
    if not original_text:
        original_text = loadFileText(pmc_file)

    orig_start, orig_end = selectRefListSection(original_text, pmc_file, pmc_id)
    original_refs = original_text[orig_start:orig_end]

    annot_start, annot_end = selectRefListSection(annotated_text, annotated_file, getFilePMCID(annotated_file))

    annotated_text = annotated_text[:annot_start] + original_refs + annotated_text[annot_end:]
    writeFileText(annotated_text, annotated_file)
Example #7
0
def generateSideBySide(doc_list):
    """
        Generates side-by-side visualizations of each Paper XML file: one using an
        XML-to-HTML converter, the other loading the XML into SciDocJSON and
        rendering it back as HTML
    """

    reader=PaperXMLReader()
    output_dir="g:\\nlp\\phd\\aac\\conversion_visualization\\"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    file_list=[]
    for filename in doc_list:
        print("Converting %s" % filename)
        input_file=cp.Corpus.paths.inputXML+filename
        output_file=output_dir+"%s_1.html" % os.path.basename(filename)

        input_text=loadFileText(input_file)
        writeFileText(input_text,output_file)

        doc=reader.read(input_text, input_file)
        try:
            json.dumps(doc.data)
        except:
            print("Not JSON Serializable!!!!")

        html=SciDocRenderer(doc).prettyPrintDocumentHTML(True,True,True, True)
        output_file2=output_file.replace("_1.html","_2.html")
        writeFileText(html,output_file2)
        file_list.append([os.path.basename(output_file),os.path.basename(output_file2)])

    file_list_json="file_data=%s;" % json.dumps(file_list)
    writeFileText(file_list_json,output_dir+"file_data.json")
Example #8
0
    def generateVisualizationOneFile(self, guid):
        """
            Given a guid, it prepares its explainer document
        """
        doc=cp.Corpus.loadSciDoc(guid)
        cp.Corpus.tagAllReferencesAsInCollectionOrNot(doc)
        counts1=self.getDocumentTokens(doc)

        # generate a unique id for each unique term, make a dictionary
        for index, token in enumerate(counts1):
            self.term_info[token]={"token_id":str(index), "references": []}

        self.overlapping_tokens={}

        ref_data={}

        in_collection_references=cp.Corpus.getMetadataByGUID(guid)["outlinks"]
        for ref in doc["references"]:
            match=cp.Corpus.matchReferenceInIndex(ref)
            if match:
                doc2=cp.Corpus.loadSciDoc(match["guid"])
                counts2=self.getDocumentTokens(doc2)
                # for each in-collection reference (keyed by its ref["id"]) we store
                # the list of its tokens that overlap with the current document

                self.overlapping_tokens[ref["id"]]=self.getOverlappingTokens(counts1, counts2)

                for token in self.overlapping_tokens[ref["id"]]:
                    ref_list=self.term_info[token]["references"]
                    if ref["id"] not in ref_list:
                        ref_list.append(ref["id"])

                ref_html=doc2.prettyPrintDocumentHTML(
                    True,
                    True,
                    False,
        ##            extra_attribute_function=self.extraAttributes,
##                    citation_formatting_function=self.citationFormatting,
##                    reference_formatting_function=self.referenceFormatting,
                    text_formatting_function=self.textFormatting
                    )

                details="%s - %s - %s" % (match["guid"], formatCitation(ref), doc2["metadata"]["title"])
                ref_record={"full_html":ref_html, "details":details}
                ref_data[ref["id"]]=ref_record

        # try to find some signal in the noise
        self.filterTokens(doc)

        html=doc.prettyPrintDocumentHTML(
            True,
            True,
            False,
##            extra_attribute_function=self.extraAttributes,
            citation_formatting_function=self.citationFormatting,
            reference_formatting_function=self.referenceFormatting,
            text_formatting_function=self.textFormatting
            )
##        html=self.padWithHTML(html, guid)
        token_data={"full_html":html, "ref_data":ref_data}
        json_str=json.dumps(token_data)
        json_file=guid+"_data.json"
        writeFileText(json_str, self.output_dir+json_file)
        # TODO: generate file description
        # TODO: add title
        details="%s - %s - %s" % (guid, formatCitation(doc["metadata"]), doc["metadata"]["title"])

        file_info={
            "json_file":json_file,
            "id":guid,
            "title":doc["metadata"]["title"],
            "details": details}
        return file_info
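
# getDocumentTokens() and getOverlappingTokens() are not shown in this section. From
# how they are used above (per-document token-count dicts, then a list of shared
# tokens per reference), a plausible minimal sketch of the overlap step is a key
# intersection; written here as a free function, this is an assumption, not the
# project's actual implementation.
def getOverlappingTokens(counts1, counts2):
    # Return the tokens that appear in both documents' token-count dictionaries.
    # The real method may also threshold or weight by frequency.
    return [token for token in counts1 if token in counts2]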