Example #1
0
    def deserialize_prev_num_ops(self, gt_file_path):
        """
        Deserializes the most recently serialized operation counts that
        belong to the given groundtruth file.

        Only the last non-blank line of the serialization file is evaluated.
        Its first tab-separated field (the date) is skipped and the following
        eight fields are converted to integers. Any further fields on that
        line are ignored.

        Args:
            gt_file_path: The path that is handed to
                define_serialization_file_path() in order to locate the
                serialization file.

        Returns:
            A Counter with the eight operation counts, or None if
            file_util.is_missing_or_empty_file() rejects the serialization
            file, the file holds only blank lines, or the last record has
            fewer than eight count fields.

        Raises:
            ValueError: If one of the eight count fields of the last record
                is not an integer.
        """
        # Order matters: it mirrors the column order of a serialized record.
        field_names = (
            "num_para_splits",
            "num_para_merges",
            "num_para_rearranges",
            "num_para_inserts",
            "num_para_deletes",
            "num_word_inserts",
            "num_word_deletes",
            "num_word_replaces",
        )

        serial_file_path = self.define_serialization_file_path(gt_file_path)

        # Abort if there is no such serialization file.
        if file_util.is_missing_or_empty_file(serial_file_path):
            return None

        # Remember only the last non-blank line. Older records are not
        # parsed, so a malformed old record or a trailing blank line cannot
        # invalidate the most recent one.
        last_record = None
        with open(serial_file_path, "r") as f:
            for line in f:
                if line.strip():
                    last_record = line

        if last_record is None:
            return None

        # Ignore first field (date).
        fields = last_record.strip().split("\t")[1:]

        # Abort if the record doesn't contain the expected number of fields.
        if len(fields) < len(field_names):
            return None

        # zip() stops after the last field name, so surplus fields are never
        # converted.
        return Counter(dict(zip(field_names, (int(x) for x in fields))))
Example #2
0
 def create_plain_output(self, raw_output_path):
     """
     Creates the plain text output from the given raw (xml) output file.

     The tokens of each block are joined by single spaces to form one
     paragraph; the paragraphs are joined by blank lines. Tokens without
     text are skipped.

     Args:
         raw_output_path: The path to the raw xml output file.

     Returns:
         The plain text, or the empty string if
         file_util.is_missing_or_empty_file() rejects the file.
     """

     # Sample of the raw output this method is written for:
     #
     #<DOCUMENT>
     #  <METADATA>
     #  ...
     #  </METADATA>
     #  <PAGE width="612" height="792" number="1" id="p1">
     #    ...
     #    <BLOCK id="p1_b1">
     #      <TEXT width="18" height="510.48" id="p1_t1" x="18.34" y="115.52">
     #        <TOKEN sid="p1_s3" id="p1_w1" ...>arXiv:cond-mat/0001220v1</TOKEN>
     #        <TOKEN sid="p1_s5" id="p1_w2" ...>[cond-mat.stat-mech]</TOKEN>
     #        <TOKEN sid="p1_s7" id="p1_w3" ...>17</TOKEN>
     #        <TOKEN sid="p1_s8" id="p1_w4" ...>Jan</TOKEN>
     #        <TOKEN sid="p1_s9" id="p1_w5" ...>2000</TOKEN>
     #      </TEXT>
     #    </BLOCK>
     #  </PAGE>
     #</DOCUMENT>

     if file_util.is_missing_or_empty_file(raw_output_path):
         return ""

     # recover=True lets the parser continue on malformed xml.
     xml = etree.parse(raw_output_path, etree.XMLParser(recover=True))

     # NOTE(review): blocks_xpath / token_xpath are defined outside this
     # method; presumably they select the BLOCK and TOKEN elements shown
     # above -- confirm against their definitions.
     blocks = []
     for block_node in xml.xpath(blocks_xpath):
         token_nodes = block_node.xpath(token_xpath)
         blocks.append(" ".join(
             [x.text for x in token_nodes if x.text is not None]))
     return "\n\n".join(blocks)
    def create_plain_output(self, raw_output_path):
        """
        Creates the plain text output from the given raw (xml) output file.

        The first paragraph is the title: the texts of all nodes matched by
        title_xpath, concatenated without separator. It is emitted even if
        it is empty. Each node matched by sections_xpath then contributes
        one paragraph consisting of its lines (matched by line_xpath),
        joined by newlines. Within every text, newlines are replaced by
        spaces and surrounding whitespace is stripped. Nodes without text
        are skipped.

        Args:
            raw_output_path: The path to the raw xml output file.

        Returns:
            The paragraphs joined by blank lines, or the empty string if
            file_util.is_missing_or_empty_file() rejects the file.
        """
        if file_util.is_missing_or_empty_file(raw_output_path):
            return ""

        # recover=True lets the parser continue on malformed xml.
        xml = etree.parse(raw_output_path, etree.XMLParser(recover=True))

        sections = []

        # Extract the title. Title nodes without text are skipped (same
        # guard as for the line nodes below) instead of raising an
        # AttributeError on None.
        title_nodes = xml.xpath(title_xpath)
        sections.append("".join([
            x.text.replace("\n", " ").strip() for x in title_nodes
            if x is not None and x.text is not None
        ]))

        # Extract the lines, one paragraph per section.
        for node in xml.xpath(sections_xpath):
            line_nodes = node.xpath(line_xpath)
            sections.append("\n".join([
                x.text.replace("\n", " ").strip() for x in line_nodes
                if x is not None and x.text is not None
            ]))

        return "\n\n".join(sections)
Example #4
0
    def create_plain_output(self, raw_output_path):
        """ Returns the content of the given raw output file as plain text.
        Override it if you have to do more advanced stuff.

        Returns the empty string if file_util.is_missing_or_empty_file()
        rejects the file."""

        if file_util.is_missing_or_empty_file(raw_output_path):
            return ""

        # errors='ignore' drops bytes that cannot be decoded instead of
        # raising a decoding error.
        with open(raw_output_path, "r", errors='ignore') as stream:
            content = stream.read()
        return content
Example #5
0
    def create_plain_output(self, raw_output_path):
        """
        Creates the plain text output from the given raw (xml) output file.

        The texts of all nodes matched by p_xpath are joined by blank lines.
        Nodes without text are skipped. Returns the empty string if
        file_util.is_missing_or_empty_file() rejects the file.
        """

        if file_util.is_missing_or_empty_file(raw_output_path):
            return ""

        # recover=True lets the parser continue on malformed xml.
        tree = etree.parse(raw_output_path, etree.XMLParser(recover=True))

        paragraphs = []
        for node in tree.xpath(p_xpath):
            if node is None or node.text is None:
                continue
            paragraphs.append(node.text)
        return "\n\n".join(paragraphs)
Example #6
0
    def create_plain_output(self, raw_output_path):
        """ Creates the plain text output from the given raw (xml) output
        file. Override it if you have to do more advanced stuff, like
        removing semantic markups, etc.

        The texts of all nodes matched by text_blocks_xpath are joined by
        blank lines. Nodes whose text is one of the placeholders
        "[empty:empty]" / "[empty:spaces]" and nodes without any text are
        skipped.

        Returns the empty string if file_util.is_missing_or_empty_file()
        rejects the file."""

        if file_util.is_missing_or_empty_file(raw_output_path):
            return ""

        # recover=True lets the parser continue on malformed xml.
        xml = etree.parse(raw_output_path, etree.XMLParser(recover=True))

        placeholders = ("[empty:empty]", "[empty:spaces]")
        text_blocks = [
            x.text for x in xml.xpath(text_blocks_xpath)
            # A node without text would make the join below raise a
            # TypeError, so it is filtered out like the placeholders.
            if x.text is not None and x.text not in placeholders
        ]
        return "\n\n".join(text_blocks)
Example #7
0
def read_tool_info(tool_dir):
    """ Reads the tool info file of the given tool and returns its
    key/value pairs as a dictionary.

    Each non-blank line of the file must be of form <key> <TAB> <value>.
    Leading and trailing whitespace of a line is stripped. The line is split
    at the first tab only, so a value may itself contain tabs. Blank lines
    are skipped. If a key occurs more than once, the last occurrence wins.

    Args:
        tool_dir: The directory of the tool; it is handed to
            get_tool_info_file_path() to locate the info file.

    Returns:
        A dictionary with the key/value pairs (both strings). It is empty
        if file_util.is_missing_or_empty_file() rejects the info file.

    Raises:
        ValueError: If a non-blank line contains no tab.
    """

    args = {}
    tool_info_file_path = get_tool_info_file_path(tool_dir)

    # Only proceed if the tool info file exists.
    if file_util.is_missing_or_empty_file(tool_info_file_path):
        return args

    with open(tool_info_file_path) as tool_info_file:
        for line in tool_info_file:
            entry = line.strip()
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not entry:
                continue

            key, separator, value = entry.partition("\t")
            if not separator:
                raise ValueError(
                    "Malformed line in tool info file '%s': %r" %
                    (tool_info_file_path, line))
            args[key] = value

    return args
Example #8
0
    def create_plain_output(self, raw_output_path):
        """
        Creates the plain text output from the given raw (xml) output file.

        The title (first match of title_xpath, if it has text) becomes the
        first paragraph, followed by the paragraphs that find_paragraphs()
        yields for the first match of variant_xpath. The paragraphs are
        joined by blank lines. Returns the empty string if
        file_util.is_missing_or_empty_file() rejects the file.
        """

        if file_util.is_missing_or_empty_file(raw_output_path):
            return ""

        # recover=True lets the parser continue on malformed xml.
        tree = etree.parse(raw_output_path, etree.XMLParser(recover=True))

        # The title forms a paragraph of its own, if there is one.
        title = tree.find(title_xpath)
        has_title = title is not None and title.text is not None
        parts = [title.text] if has_title else []

        # NOTE(review): find() yields None if nothing matches and that None
        # is passed on unchecked -- confirm find_paragraphs() copes with it.
        parts += self.find_paragraphs(tree.find(variant_xpath))

        return "\n\n".join(parts)
Example #9
0
    def create_plain_output(self, raw_output_path):
        """
        Creates the plain text output from the given raw (xml) output file.

        The title (first match of title_xpath, if it has text) becomes the
        first paragraph, followed by the paragraphs that find_paragraphs()
        yields for each match of chapters_xpath, in document order. Both
        lookups use the namespace map ns. The paragraphs are joined by blank
        lines. Returns the empty string if
        file_util.is_missing_or_empty_file() rejects the file.
        """

        if file_util.is_missing_or_empty_file(raw_output_path):
            return ""

        # recover=True lets the parser continue on malformed xml.
        tree = etree.parse(raw_output_path, etree.XMLParser(recover=True))

        parts = []

        # The title forms a paragraph of its own, if there is one.
        title = tree.find(title_xpath, namespaces=ns)
        if not (title is None or title.text is None):
            parts.append(title.text)

        # Collect the paragraphs of the body, chapter by chapter.
        for chapter in tree.findall(chapters_xpath, namespaces=ns):
            parts += self.find_paragraphs(chapter)

        return "\n\n".join(parts)
Example #10
0
    def process_gt_file(self, gt_file_path):
        """
        Processes the given groundtruth file: evaluates the related tool
        output against it and hands the result to
        handle_evaluation_result().

        Args:
            gt_file_path: The path to the groundtruth file.

        Returns:
            A dotdict with the stats of this run. It always contains
            gt_file_path. If the groundtruth file, the related tool file or
            the related pdf file is rejected by
            file_util.is_missing_or_empty_file(), the matching flag
            (is_gt_file_missing, is_tool_file_missing or
            is_pdf_file_missing) is set to True and the stats are returned
            without any evaluation. Otherwise the stats also contain
            tool_file_path, pdf_file_path and counter (the number of
            groundtruth files processed so far, including this one).
        """
        # Module-level counter of the processed files.
        # NOTE(review): it offers get_lock() and .value, so it is presumably
        # a multiprocessing.Value shared between workers -- confirm.
        global counter_processed_gt_files

        stats = dotdict()
        stats.gt_file_path = gt_file_path
        # Don't proceed, if the gt file is empty.
        if file_util.is_missing_or_empty_file(gt_file_path):
            stats.is_gt_file_missing = True
            return stats

        # Obtain the path to related tool file.
        tool_file_path = self.get_tool_file_path(gt_file_path)
        stats.tool_file_path = tool_file_path
        # Don't proceed, if the tool file doesn't exist.
        if file_util.is_missing_or_empty_file(tool_file_path):
            stats.is_tool_file_missing = True
            return stats

        # Obtain the path to related pdf file.
        pdf_file_path = self.get_pdf_file_path(gt_file_path)
        stats.pdf_file_path = pdf_file_path
        # Don't proceed, if the pdf file doesn't exist.
        if file_util.is_missing_or_empty_file(pdf_file_path):
            stats.is_pdf_file_missing = True
            return stats

        # Read the gt file. It could contain a header line starting with
        # "##source", of form ##source <TAB> <path>. Such a line is metadata
        # and not part of the groundtruth text. The gt file was already
        # checked above, so no second check is needed here.
        gt_lines = []
        source_tex_file = None
        with open(gt_file_path) as f:
            for line in f:
                if line.startswith("##source"):
                    # Split at the first tab only and strip the value, so
                    # that it doesn't carry the trailing newline of the
                    # line. A header without a tab leaves the value
                    # untouched.
                    _, separator, value = line.partition("\t")
                    if separator:
                        source_tex_file = value.strip()
                else:
                    gt_lines.append(line)
        gt = "".join(gt_lines)

        tool_output = file_util.read_file(tool_file_path)

        # Compute evaluation result.
        evaluation_result = self.process_strings(gt, tool_output)
        evaluation_result["source_tex_file"] = source_tex_file

        # Lock the counter, because += operation is not atomic. The new
        # value is read while the lock is still held; otherwise another
        # worker could increment the counter in between and two files would
        # report the same number.
        with counter_processed_gt_files.get_lock():
            counter_processed_gt_files.value += 1
            stats.counter = counter_processed_gt_files.value

        # Handle the result.
        self.handle_evaluation_result(stats, evaluation_result)

        return stats