def read_tags_basic(taglist):
    """Group the tags in taglist into a dictionary keyed by tag category.

    Metadata tags are collected from the full taglist first; 'date' tags are
    then dropped and every remaining category is looked up by tag name on the
    filtered list.  The 'claims' entry is sorted by start offset.
    The categories below are used in English patents, and many of them also
    in Chinese and German patents.
    """
    grouped = {}
    grouped['meta_tags'] = meta_tags(taglist)
    # 'date' tags are only relevant to the metadata pass above
    remaining = [tag for tag in taglist if tag.name != 'date']
    categories = (
        ('headers', 'heading'),
        ('paragraphs', 'p'),
        ('abstracts', 'abstract'),
        ('summaries', 'summary'),
        ('related_applications', 'related-apps'),
        ('sections', 'description'),
        ('claims_sections', 'claims'),
        ('claims', 'claim'),
        # chinese patents until 2010 have two relevant named tags inside the
        # description
        # TODO: see remark in ../lexisnexis.add_description_sections() on
        # refactoring language-specific code
        ('technical-field', 'technical-field'),
        ('background-art', 'background-art'),
    )
    for key, tagname in categories:
        grouped[key] = tags_with_name(remaining, tagname)
    grouped['claims'] = sorted(grouped['claims'], key=lambda tag: tag.start_index)
    return grouped
def headed_sections(tags, max_title_lead=30, separate_headers=True, max_title_follow=30):
    """
    Match header/title tags to the sections they introduce and return the
    matches.

    max_title_lead controls how far the title's end can be from the section's
    beginning for it to still count as that section's header.
    separate_headers controls whether or not headers are treated as section
    objects in their own right, or simply have their text subsumed in the
    section.

    NOTE(review): when separate_headers is True the returned list mixes
    (header, section) tuples with bare header tags (the header_matches are
    extended onto the end) — callers must handle both shapes.
    """
    headers = tags_with_name(tags, "title")
    sections = tags_with_name(tags, "sec")
    structures = tags_with_name(tags, "STRUCTURE")
    title_structures = tags_with_type(structures, "TITLE")
    text_structures = tags_with_matching_type(structures, "TEXT", 0, 4)
    #print len(headers), len(sections), len(structures), len(title_structures), len(text_structures)
    matches = []
    header_matches = []
    # Pair each <title> header with the first <sec> that starts at exactly
    # the same offset (first match wins via the break).
    for header in headers:
        for section in sections:
            if (header.start_index == section.start_index):
                if separate_headers:
                    # carve the header's text out of the section span so the
                    # header stands alone; mutates the section tag in place
                    section.start_index = header.end_index + 1
                    header_matches.append(header)
                matches.append((header, section))
                break
    # Pair TITLE structures with nearby TEXT structures using the two
    # distance thresholds; candidates are collected, then narrowed to one.
    for title in title_structures:
        matching_structures=[]
        for text_structure in text_structures:
            # text must start within max_title_follow of the title's start
            # and within max_title_lead of the title's end
            if (title.start_index < text_structure.start_index + max_title_follow and text_structure.start_index - title.end_index < max_title_lead):
                matching_structures.append(text_structure)
        #multiple things can map to a single title so we need to pick the best one
        if len(matching_structures) >0:
            best_structure=pick_best_structure(matching_structures)
            if separate_headers:
                header_matches.append(title)
            else:
                # fold the title's text into the matched structure's span;
                # mutates the structure tag in place
                best_structure.start_index=title.start_index
            matches.append((title, best_structure))
    matches.extend(header_matches)
    return matches
def read_tags(text_file, fact_file, fact_type):
    """Returns the text as a unicode string as well as a dictionary with the
    various kinds of tags.

    Dispatches on fact_type: 'BAE' facts are read from their STRUCTURE tags
    via read_tags_bae; everything else goes through read_tags_basic.
    """
    (text, tags) = load_data(text_file, fact_file, fact_type)
    if fact_type != 'BAE':
        return (text, read_tags_basic(tags))
    # BAE fact files carry their content inside STRUCTURE tags
    structures = tags_with_name(tags, 'STRUCTURE')
    return (text, read_tags_bae(structures))
def read_tags_basic(taglist):
    """Group the tags in taglist into a dictionary keyed by tag category,
    using the 'fs:' tag vocabulary for paragraphs and abstracts.

    NOTE(review): this is a second module-level definition of
    read_tags_basic — whichever definition occurs last in the file shadows
    the other.  Confirm which variant is intended to be live.
    """
    result = {}
    result['meta_tags'] = meta_tags(taglist)
    # drop 'date' tags once metadata has been extracted
    filtered = [tag for tag in taglist if tag.name != 'date']
    lookups = [
        ('headers', 'heading'),
        ('paragraphs', 'fs:P'),
        ('abstracts', 'fs:AbstractBlock'),
        ('summaries', 'summary'),
        ('related_applications', 'related-apps'),
        ('sections', 'description'),
        ('claims_sections', 'claims'),
        ('claims', 'claim'),
    ]
    for key, tagname in lookups:
        result[key] = tags_with_name(filtered, tagname)
    result['claims'] = sorted(result['claims'], key=lambda tag: tag.start_index)
    return result