예제 #1
0
def are_there_more_left_parentheses_than_right_parentheses(location, hits):
    """Tell whether the last sentence fragment has more '(' than ')'.

    The fragment is requested from
    ``lfp.wordtokencreation.get_last_sentence_fragment`` with
    ``return_as_string=True`` and handed to ``Utilities.character_counter``
    together with the two parenthesis characters; the result is indexed by
    character to compare the two counts.

    Args:
        location: forwarded unchanged to ``get_last_sentence_fragment``.
        hits: forwarded unchanged to ``get_last_sentence_fragment``.

    Returns:
        True when the '(' count is strictly greater than the ')' count,
        otherwise False.
    """
    fragment_text = lfp.wordtokencreation.get_last_sentence_fragment(
        location, hits, return_as_string=True
    )
    paren_counts = Utilities.character_counter(fragment_text, '(', ')')

    # Strictly more openers than closers means a parenthesis is still open.
    return bool(paren_counts['('] > paren_counts[')'])
예제 #2
0
def was_cut_within_a_table(location, hits):
    """Heuristically decide whether the last sentence fragment is table text.

    The fragment is fetched twice from
    ``lfp.wordtokencreation.get_last_sentence_fragment``: once in its default
    form (iterated word by word below) and once with ``return_as_string=True``
    (used for the regex and character checks).

    The fragment is treated as part of a table when any of these holds:

    * it mentions a scale word (millions/thousands/billions) AND a table
      keyword (total / follows / following / balance);
    * it contains at least six '$' characters;
    * it mentions a table keyword AND at least six of its words contain
      numbers according to ``Utilities.contains_numbers``.

    Args:
        location: forwarded unchanged to ``get_last_sentence_fragment``.
        hits: forwarded unchanged to ``get_last_sentence_fragment``.

    Returns:
        True if one of the table heuristics fires; False otherwise, including
        when no last sentence fragment is available (``None``).
    """
    # Minimum number of '$' characters / numeric words that marks a table.
    min_dollar_signs = 6
    min_numeric_words = 6

    last_sentence_fragment = lfp.wordtokencreation.get_last_sentence_fragment(location, hits)
    if last_sentence_fragment is None:
        return False

    compressed_sentence_fragment = lfp.wordtokencreation.get_last_sentence_fragment(
        location, hits, return_as_string=True
    )

    # See whether we picked up a table.
    # Tables normally have units of currency as well as the word "follows"
    # somewhere. If these hold, then we're probably in a table from a previous
    # section. That means that if we're in a relevant section right now, and
    # the new hit demarcates a new section, then we want to stop recording.
    # If we're not recording, then we probably want to start. If we're in a
    # relevant section and the new hit does *not* have a header that's been
    # whitelisted as being a section, then we can continue recording.

    # Raw strings keep "\s" from being parsed as an (invalid) string escape.
    # Both searches are pure, so the keyword search is done once and reused.
    # NOTE: "(in)?\s*" is entirely optional, so this matches wherever one of
    # the scale words occurs.
    has_scale_word = re.search(
        r"(in)?\s*(millions|thousands|billions)",
        compressed_sentence_fragment,
        re.I,
    ) is not None
    has_table_keyword = re.search(
        r"total|follow(s|ing)|balance",
        compressed_sentence_fragment,
        re.I,
    ) is not None

    if has_scale_word and has_table_keyword:
        return True

    char_frequency = Utilities.character_counter(compressed_sentence_fragment, '$')
    if char_frequency['$'] >= min_dollar_signs:
        return True

    # Count the fragment's words that contain numbers.
    number_count = sum(
        1 for word in last_sentence_fragment if Utilities.contains_numbers(word)
    )

    return has_table_keyword and number_count >= min_numeric_words