Beispiel #1
0
    def assign_labels(self, xml_file):
        """Greedily assign the tags parsed from ``xml_file`` to segments.

        The text of every non-empty section, followed by the text of every
        free segment (not part of a section, inside the flow bounds and
        non-empty), is handed to ``XML_Parser.generate_candidate_matrix``.
        The returned matrix is read as ``matrix[row][col]`` = score of
        matching text row ``row`` against tag ``col``; lower is treated as
        better. The globally smallest unused entry is picked repeatedly,
        and each row and column is used at most once.

        Each matched segment gets ``segment.tag = tags[col][1]`` and
        ``segment.prob_tag = exp(-10 * normalised_score)``, where the score
        is min-max normalised over the whole matrix.

        :param xml_file: path/handle forwarded to ``XML_Parser.parse_file``.
        :return: None; segments are modified in place.
        """
        XML_Parser.parse_file(xml_file)

        # Only sections with text contribute a row, so keep exactly those
        # sections: row ``i`` (for i < len(text_sections)) is
        # ``text_sections[i]``. Indexing ``self.sections`` directly would be
        # off by one for every empty section preceding the row.
        text_sections = [s for s in self.sections if s.text() != ""]
        text_list = [s.text() for s in text_sections]

        section_segments = []
        for section in self.sections:
            section_segments.extend(section.segments)

        for page in self.pages:
            for segment in page.segments:
                if (segment not in section_segments
                        and self._is_within_flow_bounds(segment)
                        and segment.text()):
                    text_list.append(segment.text())

        # NOTE(review): assumes the returned text_list keeps the order and
        # length of the list passed in -- confirm against XML_Parser.
        text_list, tags, candidate_matrix = (
            XML_Parser.generate_candidate_matrix(text_list))

        # Nothing to match: np.max/np.min would raise on an empty matrix.
        if candidate_matrix.size == 0:
            return

        max_distance = np.max(candidate_matrix)
        min_distance = np.min(candidate_matrix)
        distance_range = max_distance - min_distance

        def confidence(score):
            # All scores identical -> normalised distance is 0 for every
            # entry; avoid the 0/0 that would turn prob_tag into NaN.
            if distance_range == 0:
                return 1.0
            return math.exp(-10 * (score - min_distance) / distance_range)

        n_rows, n_cols = candidate_matrix.shape
        used_segments = []
        used_rows, used_cols = set(), set()

        while len(used_cols) < len(tags):
            best_pos = None
            best_score = float("inf")
            for row in range(n_rows):
                if row in used_rows:
                    continue
                for col in range(n_cols):
                    if col in used_cols:
                        continue
                    score = candidate_matrix[row][col]
                    if score < best_score:
                        best_score = score
                        best_pos = (row, col)

            # Rows or columns exhausted (or only non-finite scores left):
            # stop instead of re-tagging row 0 with tag 0 and probability 0.
            if best_pos is None:
                break

            row, col = best_pos
            used_rows.add(row)
            used_cols.add(col)
            tag = tags[col][1]
            prob_tag = confidence(best_score)

            if row < len(text_sections):
                # Section row: every segment of the section gets the tag.
                for segment in text_sections[row].segments:
                    segment.tag = tag
                    segment.prob_tag = prob_tag
                    used_segments.append(segment)
            else:
                # Free-segment row: located again by its text; every not yet
                # tagged-by-section segment with that exact text is tagged.
                row_text = text_list[row]
                for page in self.pages:
                    for segment in page.segments:
                        if (segment not in used_segments
                                and segment.text() == row_text):
                            segment.tag = tag
                            segment.prob_tag = prob_tag
Beispiel #2
0
    def assign_labels(self, xml_file):
        """Greedily assign the tags parsed from ``xml_file`` to segments.

        The text of every non-empty section, followed by the text of every
        free segment (not part of a section, inside the flow bounds and
        non-empty), is handed to ``XML_Parser.generate_candidate_matrix``.
        The returned matrix is read as ``matrix[row][col]`` = score of
        matching text row ``row`` against tag ``col``; lower is treated as
        better. The globally smallest unused entry is picked repeatedly,
        and each row and column is used at most once.

        Each matched segment gets ``segment.tag = tags[col][1]`` and
        ``segment.prob_tag = exp(-10 * normalised_score)``, where the score
        is min-max normalised over the whole matrix.

        :param xml_file: path/handle forwarded to ``XML_Parser.parse_file``.
        :return: None; segments are modified in place.
        """
        XML_Parser.parse_file(xml_file)

        # Only sections with text contribute a row, so keep exactly those
        # sections: row ``i`` (for i < len(text_sections)) is
        # ``text_sections[i]``. Indexing ``self.sections`` directly would be
        # off by one for every empty section preceding the row.
        text_sections = [s for s in self.sections if s.text() != ""]
        text_list = [s.text() for s in text_sections]

        section_segments = []
        for section in self.sections:
            section_segments.extend(section.segments)

        for page in self.pages:
            for segment in page.segments:
                if (segment not in section_segments
                        and self._is_within_flow_bounds(segment)
                        and segment.text()):
                    text_list.append(segment.text())

        # NOTE(review): assumes the returned text_list keeps the order and
        # length of the list passed in -- confirm against XML_Parser.
        text_list, tags, candidate_matrix = (
            XML_Parser.generate_candidate_matrix(text_list))

        # Nothing to match: np.max/np.min would raise on an empty matrix.
        if candidate_matrix.size == 0:
            return

        max_distance = np.max(candidate_matrix)
        min_distance = np.min(candidate_matrix)
        distance_range = max_distance - min_distance

        def confidence(score):
            # All scores identical -> normalised distance is 0 for every
            # entry; avoid the 0/0 that would turn prob_tag into NaN.
            if distance_range == 0:
                return 1.0
            return math.exp(-10 * (score - min_distance) / distance_range)

        n_rows, n_cols = candidate_matrix.shape
        used_segments = []
        used_rows, used_cols = set(), set()

        while len(used_cols) < len(tags):
            best_pos = None
            best_score = float("inf")
            for row in range(n_rows):
                if row in used_rows:
                    continue
                for col in range(n_cols):
                    if col in used_cols:
                        continue
                    score = candidate_matrix[row][col]
                    if score < best_score:
                        best_score = score
                        best_pos = (row, col)

            # Rows or columns exhausted (or only non-finite scores left):
            # stop instead of re-tagging row 0 with tag 0 and probability 0.
            if best_pos is None:
                break

            row, col = best_pos
            used_rows.add(row)
            used_cols.add(col)
            tag = tags[col][1]
            prob_tag = confidence(best_score)

            if row < len(text_sections):
                # Section row: every segment of the section gets the tag.
                for segment in text_sections[row].segments:
                    segment.tag = tag
                    segment.prob_tag = prob_tag
                    used_segments.append(segment)
            else:
                # Free-segment row: located again by its text; every not yet
                # tagged-by-section segment with that exact text is tagged.
                row_text = text_list[row]
                for page in self.pages:
                    for segment in page.segments:
                        if (segment not in used_segments
                                and segment.text() == row_text):
                            segment.tag = tag
                            segment.prob_tag = prob_tag
Beispiel #3
0
                    page_number=page_count + 1,
                    jpg=page_images[page_count])
        page.find_segment_top_neighbors()
        pages.append(page)
        page_count += 1

    fp.close()

    pdfArticle = Article(pages, pdf_name)
    pdfArticle.find_default_fonts()
    pdfArticle.find_content_distances()
    pdfArticle.save_content(style="lines")
    pdfArticle.concatenate_segments()
    pdfArticle.identify_num_columns()
    pdfArticle.identify_sections()
    pdfArticle.save_images(image_folder)

    if xml_file != "":
        if label_mode == "A" or label_mode == "a":
            pdfArticle.assign_labels(xml_file)
            pdfArticle.print_label_accuracy()
        else:
            feature_vecs = XML_Parser.retrieve_tags(xml_file)
            feature_vecs.sort(key=lambda x: x[1])
            label_assignment_gui(feature_vecs, pdfArticle)

    pdfArticle.save_content(style="segments")
    #pdfArticle.extract_text()

    #pdfArticle.plot_stats()
Beispiel #4
0
    pdfArticle = Article(pages, pdf_name)
    pdfArticle.find_default_fonts()
    pdfArticle.find_content_distances()
    pdfArticle.save_content(style="lines")
    pdfArticle.concatenate_segments()
    pdfArticle.identify_num_columns()
    pdfArticle.identify_sections()
    pdfArticle.save_images(image_folder)

    if xml_file != "":
        if label_mode == "A" or label_mode == "a":
            pdfArticle.assign_labels(xml_file)
            pdfArticle.print_label_accuracy()
        else:
            feature_vecs = XML_Parser.retrieve_tags(xml_file)
            feature_vecs.sort(key=lambda x:x[1])
            label_assignment_gui(feature_vecs, pdfArticle)

    pdfArticle.save_content(style="segments")
    #pdfArticle.extract_text()

    #pdfArticle.plot_stats()