Example #1
def check_whether_header_is_valuable(location, hits):
    
    header = get_header_of_chunk(location, hits)
    
    # The header *has* to contain at least one of the required keywords.
    contains_keyword = False
    for word in header:
        if re.match(lfp.headerpatternrepository.get_pattern_of_headers_we_want(), word):
            contains_keyword = True
            break
    
    if not contains_keyword:
        return False

    # Now reject headers built around words we don't want ("Debt", "Other",
    # "Environmental", "Proceeding") unless a wanted topic is also present.
    compressed_header = ''.join(header)
    
    if re.search("Debt|Other|Environmental|Proceeding", compressed_header, re.I) \
    and not re.search("Litigation|Contingenc|Commitment|Contigencies|Legal|Subsequent",
                      compressed_header, re.I):
        return False
    
    for regex in lfp.headerpatternrepository.get_patterns_of_headers_we_dont_want():
        if re.search(regex, compressed_header):
            return False
    
    # Of the headers that mention "Subsequent", we only want "Subsequent Events".
    # (The pattern requires a leading capital S but ignores case elsewhere.)
    if re.search("S[uU][bB][sS][eE][qQ][uU][eE][nN][tT]", compressed_header) \
    and not re.search("Subsequent.*?Event", compressed_header, re.I):
        return False
    
    # The first two words of a valid header never contain numbers.
    if Utilities.contains_numbers(header[0]) \
    or (len(header) >= 2 and Utilities.contains_numbers(header[1])):
        return False
    
    # A letter in parentheses other than "A"/"a" disqualifies the header.
    if re.search(r"\([B-Zb-z]\)", compressed_header):
        return False
    
    return True
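
The helpers used above (get_header_of_chunk, lfp.headerpatternrepository, Utilities.contains_numbers) are project modules that are not shown in this listing. A minimal, self-contained sketch of the same whitelist/blacklist filtering idea, with hypothetical stand-in patterns in place of the repository calls, might look like this:

import re

# Hypothetical stand-ins for lfp.headerpatternrepository; the project's real
# patterns are not shown in these examples.
WANTED_HEADER_PATTERN = r"(?i)litigation|contingenc|commitment|legal|subsequent"
UNWANTED_HEADER_PATTERNS = [r"(?i)index\s+to", r"(?i)table\s+of\s+contents"]

def header_looks_valuable(header_words):
    # At least one word in the header must begin with a wanted keyword
    # (re.match anchors at the start of each word).
    if not any(re.match(WANTED_HEADER_PATTERN, word) for word in header_words):
        return False

    compressed = ''.join(header_words)

    # Reject anything matching a blacklisted pattern.
    if any(re.search(bad, compressed) for bad in UNWANTED_HEADER_PATTERNS):
        return False

    # Of the "Subsequent" headers, keep only "Subsequent ... Events".
    if re.search("Subsequent", compressed) \
    and not re.search("Subsequent.*?Event", compressed, re.I):
        return False

    return True

print(header_looks_valuable(["Legal", "Proceedings"]))       # True
print(header_looks_valuable(["Index", "to", "Financials"]))  # False

Unlike the original, this sketch skips the numeric-word and parenthesised-letter checks; it only illustrates the keyword whitelist/blacklist structure.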
Example #2
def was_cut_within_a_table(location, hits):
            
    last_sentence_fragment = lfp.wordtokencreation.get_last_sentence_fragment(location, hits)
    
    if last_sentence_fragment is None:
        return False
    
    compressed_sentence_fragment = lfp.wordtokencreation.get_last_sentence_fragment(location, hits, return_as_string=True)

    # Check whether we picked up a table: tables normally mention a unit of
    # currency as well as a word like "total" or "follows" somewhere.
    # If both hold, we are probably inside a table from a previous section.
    # In that case, if we are currently in a relevant section and the new hit
    # marks a new section, we want to stop recording; if we are not recording,
    # we probably want to start. If we are in a relevant section and the new
    # hit's header has *not* been whitelisted as a section header, we can keep
    # recording.
    if re.search(r"(in)?\s*(millions|thousands|billions)", compressed_sentence_fragment, re.I | re.M | re.S) \
    and re.search(r"total|follow(s|ing)|balance", compressed_sentence_fragment, re.I | re.M | re.S):
        return True
        
    char_frequency = Utilities.character_counter(compressed_sentence_fragment, '$')
    
    if char_frequency['$'] >= 6:
        # Many dollar signs strongly suggest a table.
        return True
    
    number_count = 0
    for word in last_sentence_fragment:
        if Utilities.contains_numbers(word):
            number_count += 1
            
    if re.search(r"total|follow(s|ing)|balance", compressed_sentence_fragment, re.I | re.M | re.S) \
    and number_count >= 6:
        # A summary word together with many numeric tokens also suggests a table.
        return True
    
    return False
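
Utilities.character_counter and Utilities.contains_numbers are also project helpers that this listing does not include. Assuming they behave as their names suggest, the same table heuristics can be reproduced on a plain token list roughly as below; the helper implementations here are guesses, not the project's code.

import re
from collections import Counter

def contains_numbers(word):
    # Assumed behaviour: True if the token contains at least one digit.
    return any(ch.isdigit() for ch in word)

def character_counter(text, *chars):
    # Assumed behaviour: count occurrences of the requested characters.
    counts = Counter(text)
    return {ch: counts.get(ch, 0) for ch in chars}

def fragment_looks_like_table(words):
    text = ' '.join(words)

    # Units of currency plus a summary word suggest a table.
    if re.search(r"(in)?\s*(millions|thousands|billions)", text, re.I) \
    and re.search(r"total|follow(s|ing)|balance", text, re.I):
        return True

    # Many dollar signs are another strong table signal.
    if character_counter(text, '$')['$'] >= 6:
        return True

    # A summary word plus many numeric tokens also counts.
    number_count = sum(1 for word in words if contains_numbers(word))
    if re.search(r"total|follow(s|ing)|balance", text, re.I) and number_count >= 6:
        return True

    return False

print(fragment_looks_like_table("The following balances are in thousands".split()))  # True
print(fragment_looks_like_table("No table is mentioned here".split()))               # False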