def _perform_check_and_write_to_results_file(case_pattern, row, row_holder): print "Start:", row.index, row.CIK, case_pattern.pattern, row.case_name # check if CIK is valid. if Utilities.is_CIK_valid(row.CIK): for year in xrange(2004, 2012 + 1): raw_data = _get_raw_data(row.CIK, year) if raw_data is not None: if re.search(case_pattern, raw_data): row.case_mentioned_in_a_10k_for_a_year(year) row_holder.append(row.construct_row_with_ordered_fields())
def _read_ouput_file_and_get_finished_indices():
    '''
    read the output file and see which indices exist in there. those are
    the finished rows. also read the rows for the already-learned
    CIK/plaintiff mapping.

    returns: a set holding column 0 (the index) of every row in
             Constants.PATH_TO_NEW_LITIGATION_FILE, exactly as read from
             the csv.
    side effect: for every row whose column 2 (CIK) passes
                 Utilities.is_CIK_valid, _name_to_cik_mapping is updated
                 with column 3 (plaintiff) -> CIK. later rows overwrite
                 earlier ones for the same plaintiff.
    raises: IndexError if a row has fewer than four columns.
    '''
    results = set()

    # the with-block closes the file handle even if a row is malformed.
    with open(Constants.PATH_TO_NEW_LITIGATION_FILE, 'rb') as output_file:
        reader = csv.reader(output_file, delimiter=',')
        for row in reader:
            index = row[0]
            CIK = row[2]
            plaintiff = row[3]

            results.add(index)
            if Utilities.is_CIK_valid(CIK):
                _name_to_cik_mapping[plaintiff] = CIK

    return results
def main(items_to_add): finished_indices = _read_ouput_file_and_get_finished_indices() pool = multiprocessing.Pool(maxtasksperchild=15) row_holder = _manager.list() processed_index_counter = 0 litigation_reader = csv.reader(open(Constants.PATH_TO_LITIGATION_FILE, 'rb'), delimiter=',') for row in litigation_reader: row_object = NewRowGenerator(*row) # already processed. if row_object.index in finished_indices: continue # intentionally skip. if row_object.CIK == Constants.CIK_CODE_TO_INDICATE_ROW_SHOULD_BE_SKIPPED: continue processed_index_counter += 1 if processed_index_counter > items_to_add: break #print "BEGIN:", row # rows always have a plaintiff but not always a CIK. if not Utilities.is_CIK_valid(row_object.CIK): # if this row has the CIK-company name mapping, cache it. # update the key-value pairing with each row iteration # as company CIKSs can change as time goes on. if len(row_object.plaintiff) > 0: _name_to_cik_mapping[row_object.plaintiff] = row_object.CIK else: # this row didnt have a CIK. first, check previous rows for the mapping we want. # if that doesn't exist, use the company name and edgar to get the # potential CIK. result = _get_potential_cik_from_company_name(row_object.plaintiff) if result is not None: row_object.CIK = result if not Utilities.is_CIK_valid(row_object.CIK): print "Error: No CIK. Index:", row_object.index #continue case_pattern = _get_first_word_of_case_name(row_object.case_name) #_perform_check_and_write_to_results_file(case_pattern, row_object, row_holder) pool.apply_async(_perform_check_and_write_to_results_file, \ args=(case_pattern, row_object, row_holder)) pool.close() pool.join() litigation_writer = csv.writer(open(Constants.PATH_TO_NEW_LITIGATION_FILE, 'ab'), delimiter=',') for row in row_holder: litigation_writer.writerow(row)