def assign_labels(self, xml_file):
    """Label sections and free segments with XML tags via greedy matching.

    The method parses ``xml_file`` with ``XML_Parser.parse_file``, then
    collects one text per section whose text is non-empty, followed by the
    text of every segment that belongs to no section, passes
    ``self._is_within_flow_bounds`` and has non-empty text.
    ``XML_Parser.generate_candidate_matrix`` turns those texts into a score
    matrix (rows index texts, columns index tags, lower is better).

    The lowest-scoring unused (row, column) cell is picked repeatedly until
    every tag is placed or no usable cell remains. Each labelled segment
    gets ``tag`` set to element 1 of the chosen tag entry and ``prob_tag``
    set to ``exp(-10 * (score - min) / (max - min))``, with ``min``/``max``
    taken over the whole matrix.

    Args:
        xml_file: Forwarded unchanged to ``XML_Parser.parse_file``.

    Returns:
        None. Segments are updated in place.
    """
    XML_Parser.parse_file(xml_file)

    sectioned_segments = []
    # Sections that actually contributed a row, in row order. Sections with
    # empty text add no row, so rows must not be indexed into self.sections.
    row_sections = []
    text_list = []
    for section in self.sections:
        sectioned_segments.extend(section.segments)
        section_text = section.text()
        if section_text != "":
            text_list.append(section_text)
            row_sections.append(section)

    for page in self.pages:
        for segment in page.segments:
            if segment in sectioned_segments:
                continue
            if not self._is_within_flow_bounds(segment):
                continue
            segment_text = segment.text()
            if segment_text:
                text_list.append(segment_text)

    # NOTE(review): assumes the returned text_list keeps the row order and
    # count of the list passed in (the row -> section mapping below relies
    # on it, as the previous indexing did) -- confirm against XML_Parser.
    text_list, tags, candidate_matrix = XML_Parser.generate_candidate_matrix(
        text_list)

    # Nothing to match; np.min / np.max would raise on an empty matrix.
    if candidate_matrix.size == 0:
        return

    min_distance = np.min(candidate_matrix)
    score_range = np.max(candidate_matrix) - min_distance

    labelled_segments = []
    # Ordered lists keep the row-major, first-minimum-wins tie-breaking.
    free_rows = list(range(candidate_matrix.shape[0]))
    free_cols = list(range(candidate_matrix.shape[1]))
    placed = 0
    while placed < len(tags) and free_rows and free_cols:
        best_pos = None
        best_score = float("inf")
        for row in free_rows:
            for col in free_cols:
                score = candidate_matrix[row][col]
                if score < best_score:
                    best_score = score
                    best_pos = (row, col)

        # No finite candidate left: stop instead of re-labelling cell (0, 0).
        if best_pos is None:
            break

        best_row, best_col = best_pos
        free_rows.remove(best_row)
        free_cols.remove(best_col)
        placed += 1

        tag = tags[best_col][1]
        if score_range > 0:
            prob_tag = math.exp(
                -10 * (best_score - min_distance) / score_range)
        else:
            # All scores are equal, so every match is as good as the best.
            prob_tag = 1.0

        if best_row < len(row_sections):
            for segment in row_sections[best_row].segments:
                segment.tag = tag
                segment.prob_tag = prob_tag
                labelled_segments.append(segment)
        else:
            # Free-segment row: label every not-yet-labelled segment whose
            # text equals the row text.
            row_text = text_list[best_row]
            for page in self.pages:
                for segment in page.segments:
                    if (segment not in labelled_segments
                            and segment.text() == row_text):
                        segment.tag = tag
                        segment.prob_tag = prob_tag
def assign_labels(self, xml_file):
    """Label sections and free segments with XML tags via greedy matching.

    The method parses ``xml_file`` with ``XML_Parser.parse_file``, then
    collects one text per section whose text is non-empty, followed by the
    text of every segment that belongs to no section, passes
    ``self._is_within_flow_bounds`` and has non-empty text.
    ``XML_Parser.generate_candidate_matrix`` turns those texts into a score
    matrix (rows index texts, columns index tags, lower is better).

    The lowest-scoring unused (row, column) cell is picked repeatedly until
    every tag is placed or no usable cell remains. Each labelled segment
    gets ``tag`` set to element 1 of the chosen tag entry and ``prob_tag``
    set to ``exp(-10 * (score - min) / (max - min))``, with ``min``/``max``
    taken over the whole matrix.

    Args:
        xml_file: Forwarded unchanged to ``XML_Parser.parse_file``.

    Returns:
        None. Segments are updated in place.
    """
    XML_Parser.parse_file(xml_file)

    sectioned_segments = []
    # Sections that actually contributed a row, in row order. Sections with
    # empty text add no row, so rows must not be indexed into self.sections.
    row_sections = []
    text_list = []
    for section in self.sections:
        sectioned_segments.extend(section.segments)
        section_text = section.text()
        if section_text != "":
            text_list.append(section_text)
            row_sections.append(section)

    for page in self.pages:
        for segment in page.segments:
            if segment in sectioned_segments:
                continue
            if not self._is_within_flow_bounds(segment):
                continue
            segment_text = segment.text()
            if segment_text:
                text_list.append(segment_text)

    # NOTE(review): assumes the returned text_list keeps the row order and
    # count of the list passed in (the row -> section mapping below relies
    # on it, as the previous indexing did) -- confirm against XML_Parser.
    text_list, tags, candidate_matrix = XML_Parser.generate_candidate_matrix(
        text_list)

    # Nothing to match; np.min / np.max would raise on an empty matrix.
    if candidate_matrix.size == 0:
        return

    min_distance = np.min(candidate_matrix)
    score_range = np.max(candidate_matrix) - min_distance

    labelled_segments = []
    # Ordered lists keep the row-major, first-minimum-wins tie-breaking.
    free_rows = list(range(candidate_matrix.shape[0]))
    free_cols = list(range(candidate_matrix.shape[1]))
    placed = 0
    while placed < len(tags) and free_rows and free_cols:
        best_pos = None
        best_score = float("inf")
        for row in free_rows:
            for col in free_cols:
                score = candidate_matrix[row][col]
                if score < best_score:
                    best_score = score
                    best_pos = (row, col)

        # No finite candidate left: stop instead of re-labelling cell (0, 0).
        if best_pos is None:
            break

        best_row, best_col = best_pos
        free_rows.remove(best_row)
        free_cols.remove(best_col)
        placed += 1

        tag = tags[best_col][1]
        if score_range > 0:
            prob_tag = math.exp(
                -10 * (best_score - min_distance) / score_range)
        else:
            # All scores are equal, so every match is as good as the best.
            prob_tag = 1.0

        if best_row < len(row_sections):
            for segment in row_sections[best_row].segments:
                segment.tag = tag
                segment.prob_tag = prob_tag
                labelled_segments.append(segment)
        else:
            # Free-segment row: label every not-yet-labelled segment whose
            # text equals the row text.
            row_text = text_list[best_row]
            for page in self.pages:
                for segment in page.segments:
                    if (segment not in labelled_segments
                            and segment.text() == row_text):
                        segment.tag = tag
                        segment.prob_tag = prob_tag
page_number=page_count + 1, jpg=page_images[page_count]) page.find_segment_top_neighbors() pages.append(page) page_count += 1 fp.close() pdfArticle = Article(pages, pdf_name) pdfArticle.find_default_fonts() pdfArticle.find_content_distances() pdfArticle.save_content(style="lines") pdfArticle.concatenate_segments() pdfArticle.identify_num_columns() pdfArticle.identify_sections() pdfArticle.save_images(image_folder) if xml_file != "": if label_mode == "A" or label_mode == "a": pdfArticle.assign_labels(xml_file) pdfArticle.print_label_accuracy() else: feature_vecs = XML_Parser.retrieve_tags(xml_file) feature_vecs.sort(key=lambda x: x[1]) label_assignment_gui(feature_vecs, pdfArticle) pdfArticle.save_content(style="segments") #pdfArticle.extract_text() #pdfArticle.plot_stats()
# Build an Article from `pages` and `pdf_name`, run its analysis steps, and
# optionally label it from `xml_file`.
# Calls, in order: find_default_fonts, find_content_distances,
# save_content(style="lines"), concatenate_segments, identify_num_columns,
# identify_sections, save_images(image_folder).
# When `xml_file` is not the empty string: label_mode "A"/"a" calls
# assign_labels(xml_file) and then print_label_accuracy; any other label_mode
# sorts the result of XML_Parser.retrieve_tags(xml_file) by element 1 of each
# entry and passes it with the article to label_assignment_gui.
# NOTE(review): the whitespace of this snippet is collapsed, so the nesting of
# the final save_content(style="segments") call cannot be confirmed from here
# -- presumably it runs unconditionally; verify against the original layout.
# The trailing extract_text / plot_stats calls are commented out.
pdfArticle = Article(pages, pdf_name) pdfArticle.find_default_fonts() pdfArticle.find_content_distances() pdfArticle.save_content(style="lines") pdfArticle.concatenate_segments() pdfArticle.identify_num_columns() pdfArticle.identify_sections() pdfArticle.save_images(image_folder) if xml_file != "": if label_mode == "A" or label_mode == "a": pdfArticle.assign_labels(xml_file) pdfArticle.print_label_accuracy() else: feature_vecs = XML_Parser.retrieve_tags(xml_file) feature_vecs.sort(key=lambda x:x[1]) label_assignment_gui(feature_vecs, pdfArticle) pdfArticle.save_content(style="segments") #pdfArticle.extract_text() #pdfArticle.plot_stats()