Esempio n. 1
0
def read_tags_basic(taglist):
    """Group the tags in taglist by kind and return them as a dictionary.

    The 'meta_tags' entry is built by meta_tags() from the complete list. All
    other entries are selected by tag name, after the tags named 'date' have
    been dropped from the list. The 'claims' entry is sorted on start_index.
    """

    # meta tags are collected first, while the 'date' tags are still present
    tags = {'meta_tags': meta_tags(taglist)}
    non_date_tags = [tag for tag in taglist if tag.name != 'date']

    # (dictionary key, tag name) pairs; these are used in English patents, and
    # many of them also in chinese and german patents
    common_kinds = (
        ('headers', 'heading'),
        ('paragraphs', 'p'),
        ('abstracts', 'abstract'),
        ('summaries', 'summary'),
        ('related_applications', 'related-apps'),
        ('sections', 'description'),
        ('claims_sections', 'claims'),
    )
    for key, tag_name in common_kinds:
        tags[key] = tags_with_name(non_date_tags, tag_name)

    claims = tags_with_name(non_date_tags, 'claim')
    tags['claims'] = sorted(claims, key=lambda claim: claim.start_index)

    # chinese patents until 2010 have two relevant named tags inside the
    # description; they are stored under their own tag name
    # TODO: see remark in ../lexisnexis.add_description_sections() on refactoring
    # language-specific code
    for tag_name in ('technical-field', 'background-art'):
        tags[tag_name] = tags_with_name(non_date_tags, tag_name)

    return tags
Esempio n. 2
0
def headed_sections(tags, max_title_lead=30, separate_headers=True, max_title_follow=30):
    """
    Pair up headers with the sections they introduce.

    Two kinds of pairing are done:

    - a "title" tag is paired with the first "sec" tag that has the same
      start_index;
    - a "STRUCTURE" tag selected by tags_with_type(..., "TITLE") is paired with
      the best (as chosen by pick_best_structure) of the text structures near
      it. A text structure is a candidate when the title starts less than
      max_title_follow positions after the text structure's start and the text
      structure starts less than max_title_lead positions after the title's
      end.

    separate_headers controls whether headers are treated as section objects in
    their own right, or simply have their text subsumed in the section. When it
    is true, a matched "sec" tag is moved to start right after its header and
    every matched header is also appended to the result on its own. When it is
    false, a matched text structure is moved to start where its title starts.

    Returns a list of (header, section) tuples, followed by the individual
    headers that were matched when separate_headers is true. Note that the
    start_index of matched tags is updated in place.
    """

    titles = tags_with_name(tags, "title")
    secs = tags_with_name(tags, "sec")
    structures = tags_with_name(tags, "STRUCTURE")
    structure_titles = tags_with_type(structures, "TITLE")
    structure_texts = tags_with_matching_type(structures, "TEXT", 0, 4)

    pairs = []
    standalone_headers = []

    for header in titles:
        # only the first section starting at the header's position is used
        section = next(
            (sec for sec in secs if header.start_index == sec.start_index), None)
        if section is None:
            continue
        if separate_headers:
            section.start_index = header.end_index + 1
            standalone_headers.append(header)
        pairs.append((header, section))

    for title in structure_titles:
        candidates = [
            text for text in structure_texts
            if (title.start_index < text.start_index + max_title_follow
                and text.start_index - title.end_index < max_title_lead)
        ]
        if not candidates:
            continue
        # multiple things can map to a single title so we need to pick the best one
        best = pick_best_structure(candidates)
        if separate_headers:
            standalone_headers.append(title)
        else:
            best.start_index = title.start_index
        pairs.append((title, best))

    return pairs + standalone_headers
Esempio n. 3
0
def read_tags(text_file, fact_file, fact_type):
    """Load the text and tags with load_data() and return a (text, tag_dictionary)
    pair, with the text as a unicode string and the dictionary holding the
    various kinds of tags.

    For the 'BAE' fact type the dictionary is built by read_tags_bae() from the
    tags named 'STRUCTURE'; for any other fact type it is built by
    read_tags_basic() from all tags."""
    text, tags = load_data(text_file, fact_file, fact_type)
    if fact_type == 'BAE':
        bae_structures = tags_with_name(tags, 'STRUCTURE')
        return (text, read_tags_bae(bae_structures))
    return (text, read_tags_basic(tags))
Esempio n. 4
0
def read_tags_basic(taglist):
    """Group the tags in taglist by kind and return them as a dictionary.

    The 'meta_tags' entry is built by meta_tags() from the complete list. All
    other entries are selected by tag name, after the tags named 'date' have
    been dropped from the list. The 'claims' entry is sorted on start_index.
    """

    # meta tags are collected first, while the 'date' tags are still present
    tags = {'meta_tags': meta_tags(taglist)}
    non_date_tags = [tag for tag in taglist if tag.name != 'date']

    # (dictionary key, tag name) pairs; these are used in English patents, and
    # many of them also in chinese and german patents
    common_kinds = (
        ('headers', 'heading'),
        ('paragraphs', 'fs:P'),
        ('abstracts', 'fs:AbstractBlock'),
        ('summaries', 'summary'),
        ('related_applications', 'related-apps'),
        ('sections', 'description'),
        ('claims_sections', 'claims'),
    )
    for key, tag_name in common_kinds:
        tags[key] = tags_with_name(non_date_tags, tag_name)

    claims = tags_with_name(non_date_tags, 'claim')
    tags['claims'] = sorted(claims, key=lambda claim: claim.start_index)

    return tags