Ejemplo n.º 1
0
def _perform_check_and_write_to_results_file(case_pattern, row, row_holder):

    print "Start:", row.index, row.CIK, case_pattern.pattern, row.case_name

    # check if CIK is valid.
    if Utilities.is_CIK_valid(row.CIK):
        for year in xrange(2004, 2012 + 1):

            raw_data = _get_raw_data(row.CIK, year)

            if raw_data is not None:
                if re.search(case_pattern, raw_data):
                    row.case_mentioned_in_a_10k_for_a_year(year)

    row_holder.append(row.construct_row_with_ordered_fields())
Ejemplo n.º 2
0
def _read_ouput_file_and_get_finished_indices():
    '''
        Read the output file and return the set of indices that exist in
        there -- those are the finished rows.

        Side effect: also populates the already-learned plaintiff -> CIK
        mapping (only for rows whose CIK is valid), mirroring the caching
        done in main().
    '''
    results = set()

    # Use a context manager so the file handle is always closed
    # (the original left it open for the life of the process).
    with open(Constants.PATH_TO_NEW_LITIGATION_FILE, 'rb') as output_file:
        reader = csv.reader(output_file, delimiter=',')

        for row in reader:
            # Blank lines yield empty lists; malformed rows may be short.
            # Skip anything without the four columns we need.
            if len(row) < 4:
                continue

            index = row[0]
            CIK = row[2]
            plaintiff = row[3]

            results.add(index)
            if Utilities.is_CIK_valid(CIK):
                _name_to_cik_mapping[plaintiff] = CIK

    return results
Ejemplo n.º 3
0
def main(items_to_add):
    """Process up to `items_to_add` unfinished litigation rows.

    For each row not already in the output file (and not explicitly
    skipped), resolve a CIK if possible, then fan the 10-K pattern check
    out to a worker pool; finally append all produced rows to the output
    CSV.
    """
    finished_indices = _read_ouput_file_and_get_finished_indices()

    pool = multiprocessing.Pool(maxtasksperchild=15)

    # Shared list so worker processes can hand results back.
    row_holder = _manager.list()

    processed_index_counter = 0

    litigation_reader = csv.reader(open(Constants.PATH_TO_LITIGATION_FILE, 'rb'), delimiter=',')

    for row in litigation_reader:

        row_object = NewRowGenerator(*row)

        # already processed.
        if row_object.index in finished_indices:
            continue

        # intentionally skip.
        if row_object.CIK == Constants.CIK_CODE_TO_INDICATE_ROW_SHOULD_BE_SKIPPED:
            continue

        processed_index_counter += 1

        if processed_index_counter > items_to_add:
            break

        # rows always have a plaintiff but not always a CIK.
        #
        # BUG FIX: the original inverted this test -- it cached the
        # mapping when the CIK was INVALID and tried to look one up when
        # the CIK was already valid, contradicting both its own comments
        # and the caching done in _read_ouput_file_and_get_finished_indices().
        if Utilities.is_CIK_valid(row_object.CIK):
            # This row has a valid CIK-company name mapping: cache it.
            # Update the key-value pairing with each row iteration,
            # as company CIKs can change as time goes on.
            if len(row_object.plaintiff) > 0:
                _name_to_cik_mapping[row_object.plaintiff] = row_object.CIK

        else:
            # This row didn't have a valid CIK. Check previous rows for
            # the mapping we want; _get_potential_cik_from_company_name
            # falls back to the company name / EDGAR lookup.
            result = _get_potential_cik_from_company_name(row_object.plaintiff)
            if result is not None:
                row_object.CIK = result

        if not Utilities.is_CIK_valid(row_object.CIK):
            # Deliberately NOT skipping: the worker still records the row.
            print "Error: No CIK. Index:", row_object.index

        case_pattern = _get_first_word_of_case_name(row_object.case_name)

        pool.apply_async(_perform_check_and_write_to_results_file, \
            args=(case_pattern, row_object, row_holder))

    pool.close()
    pool.join()

    # Append results; context manager guarantees the handle is flushed
    # and closed (the original never closed the writer's file).
    with open(Constants.PATH_TO_NEW_LITIGATION_FILE, 'ab') as output_file:
        litigation_writer = csv.writer(output_file, delimiter=',')
        for row in row_holder:
            litigation_writer.writerow(row)