def _character_count_test(CIK, filing_year, new_data, corpus_file):
    '''
    Compare the alphanumeric character count of freshly parsed data with
    the count of the stored corpus file and report the outcome.

    CIK, filing_year -- identifiers echoed in the printed summary and passed
                        to CorpusAccess.write_comparison_to_file on failure.
    new_data         -- iterable of strings; joined into one string before
                        counting.
    corpus_file      -- path of the corpus text file to compare against.

    Prints one summary line. The test passes when the absolute relative
    change is below Constants.REGRESSION_CHAR_COUNT_CHANGE_THRESHOLD;
    otherwise both texts are handed to
    CorpusAccess.write_comparison_to_file. Returns nothing.
    '''
    parser_alpha_numeric_count = Utilities.get_alpha_numeric_count(
        ''.join(new_data))

    with open(corpus_file, 'r') as f:
        text_from_file = f.read()
    file_alpha_numeric_count = Utilities.get_alpha_numeric_count(
        text_from_file)

    difference = parser_alpha_numeric_count - file_alpha_numeric_count
    if file_alpha_numeric_count:
        # float() forces true division: with two ints Python 2 would floor
        # the quotient (e.g. -5 / 100 == -1, 5 / 100 == 0), which makes the
        # threshold comparison asymmetric and far too coarse.
        change = float(difference) / file_alpha_numeric_count
    elif difference:
        # Nothing to compare against in the corpus but the parser produced
        # text: treat as an unbounded change instead of dividing by zero.
        change = float('inf')
    else:
        # Both sides are empty, i.e. no change at all.
        change = 0.0

    result = abs(change) < Constants.REGRESSION_CHAR_COUNT_CHANGE_THRESHOLD

    # A single parenthesised argument parses under both Python 2 and 3.
    # The double space after "New Count:%r," matches the output of the
    # former pair of print statements (trailing-comma print + soft space).
    print("CIK:%r, Year:%r, New Count:%r,  Corpus Count:%r, Passed:%r"
          % (CIK, filing_year, parser_alpha_numeric_count,
             file_alpha_numeric_count, result))

    if not result:
        CorpusAccess.write_comparison_to_file(
            new_data, text_from_file, CIK, filing_year)
def _transform_list_of_hits_into_result(recorder, record_header):
    '''
    Build one record string from the collected hits, or return None when
    the record is rejected.

    recorder      -- iterable of string fragments; concatenated in order.
    record_header -- header text; a case-insensitive match for "SUBSEQUENT"
                     triggers the extra litigation check.

    The joined text is passed through _cut_text_if_needed. It is rejected
    (None is returned) when the header matches "SUBSEQUENT" and
    _does_section_mention_litigation says no, or when the text has fewer
    than 200 alphanumeric characters.
    '''
    text = _cut_text_if_needed(''.join(recorder))

    header_is_subsequent = re.search("SUBSEQUENT", record_header, re.I)
    if header_is_subsequent and not _does_section_mention_litigation(text):
        return None

    if text is None:
        return None

    # Almost all genuine records have at least this many alphanumeric
    # characters; anything shorter is probably not something we want.
    if Utilities.get_alpha_numeric_count(text) < 200:
        return None

    return text
def _get_best_result(results):
    '''
    Return the result with the smallest number of alphanumeric characters.

    results -- iterable of candidate results; each is measured with
               Utilities.get_alpha_numeric_count.

    Returns the first candidate that has the minimum count, or None when
    results is empty.
    '''
    best_result = None
    # None (not 0) marks "nothing seen yet", so a candidate whose count is
    # genuinely 0 is kept as the minimum instead of being overwritten by
    # the next candidate.
    best_count = None

    for result in results:
        count = Utilities.get_alpha_numeric_count(result)
        # Strict '<' keeps the earliest candidate on ties.
        if best_count is None or count < best_count:
            best_count = count
            best_result = result

    return best_result