    def handle(self, *args, **options):
        self.xml_runner = XMLRunner()
        self.accumulator = Accumulator()

        count = 0
        while True:
            xml_batch = XMLSubmission.objects.filter(
                year__in=[2014, 2015, 2016, 2017]).exclude(
                    json_set=True)[:BATCHSIZE]
            #xml_batch = XMLSubmission.objects.filter(object_id__in=ids).exclude(json_set=True)[:BATCHSIZE]
            #xml_batch = XMLSubmission.objects.filter(object_id__in=['201540859349100204',])[:BATCHSIZE]
            #xml_batch = XMLSubmission.objects.filter(object_id__in=test2016ids).exclude(json_set=True)[:BATCHSIZE]
            #xml_batch = XMLSubmission.objects.filter(sub_date__regex=r'^8.+2017.*').exclude(json_set=True)[:BATCHSIZE]
            #xml_batch = XMLSubmission.objects.filter(object_id__in=['201523179349302022',]).exclude(json_set=True)

            count += 1
            print(count)
            if len(xml_batch) == 0:
                break

            self.process_batch(xml_batch)

            # for testing
            if count > LOOP_MAX:
                break
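Note that this loop only makes progress if process_batch sets json_set=True on each submission it handles; otherwise the same batch is re-fetched forever. A minimal sketch of such a method, assuming XMLSubmission has an as_json field and that json is imported at module level (both assumptions):

    def process_batch(self, xml_batch):
        # Hypothetical sketch: parse each submission and flip json_set so the
        # outer loop's .exclude(json_set=True) filter advances past it
        for submission in xml_batch:
            parsed = self.xml_runner.run_filing(submission.object_id)
            submission.as_json = json.dumps(parsed.get_result())  # assumed field
            submission.json_set = True
            submission.save()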
Example #2
def fetch_yr_ind(oid_srch_lst):

    # Should we use IRSx or manual concordance? Setup IRSx if using it
    # Requires all object IDs in the file to be from the same year
    irsx_flag = int(oid_srch_lst[0][:4]) >= 2015
    xml_runner = XMLRunner() if irsx_flag else None

    rows = []

    # Iterate through Object IDs and update regularly
    start_time = time.time()
    counter = 0
    for oid in oid_srch_lst:
        rows.append(fetch_ind_row(irsx_flag, xml_runner, oid))
        if counter % upd_intvl == 0:
            elapsed = time.time() - start_time
            logging.info(
                "Read {} forms from current year in {:,.1f} seconds.".format(
                    counter, elapsed))
        counter += 1

    # Concatenate once at the end; DataFrame.append was removed in pandas 2.0
    yr_ind_new = pd.concat(rows)
    yr_ind_new['990_SRC'] = "AWS FILE DIR"

    return yr_ind_new
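A hedged usage sketch (the object IDs below are placeholders; fetch_ind_row and upd_intvl are assumed to be defined elsewhere in the module):

# Placeholder object IDs; note they must all share the same filing year
yr_ind = fetch_yr_ind(["201533089349301428", "201543089349301829"])
print(yr_ind["990_SRC"].head())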
Example #3
File: main.py Project: FD253/data
def process(request):
    if request.headers.get('x-api-key', '') != API_KEY:
        return 'Not found', 404

    xml_runner = XMLRunner()
    try:
        filing = xml_runner.run_filing(request.args.get('aws_object_id',''))
    except RuntimeError as e:
        return "Error getting XML: {0}".format(str(e)), 400

    try:
        if 'IRS990PF' in filing.list_schedules():
            org = org_from_990pf(filing)
            grants_to_create = grants_from_990pf(filing)
        elif 'IRS990EZ' in filing.list_schedules():
            org = org_from_990ez(filing)
            grants_to_create = []
        elif 'IRS990' in filing.list_schedules():
            org = org_from_990(filing)
            grants_to_create = grants_from_990(filing)
        else:
            raise RuntimeError('No schedule available to parse.')
    except RuntimeError as e:
        return "Error getting org: {0}".format(str(e)), 500

    if org.get('ein', '') == '':
        return "No EIN found", 500

    client = MongoClient(MONGO_URL)
    db = client.springsteen

    timestamp = timestamp_now()
    org['updatedAt'] = timestamp

    existing_org = db.organizations.find_one({'ein': org['ein']})
    if existing_org is None:
        org['createdAt'] = timestamp
        result = db.organizations.insert_one(org)
        org_mongo_id = result.inserted_id
    else:
        org_mongo_id = existing_org['_id']
        if 'lastFilingAt' not in existing_org or parse(existing_org['lastFilingAt']) < parse(org['lastFilingAt']):
            merged_org = {**existing_org, **org}
            if 'createdAt' not in merged_org or merged_org['createdAt'] == 'yo':
                merged_org['createdAt'] = timestamp
            result = db.organizations.find_one_and_update(
                {'_id': existing_org['_id']}, {'$set': merged_org},
                return_document=ReturnDocument.AFTER)

    for grant in grants_to_create:
        grant['funder'] = DBRef('organizations', org_mongo_id)
        grant['createdAt'] = timestamp
        grant['updatedAt'] = timestamp

    if len(grants_to_create) > 0:
        # Grants should not be replaced if they are already uploaded for that
        # tax period/funder, since they can be modified by other sources after
        # the initial upload
        if db.grants.find_one({'funderEIN': org['ein'],
                               'fromTaxPeriodEnding': grants_to_create[0]['fromTaxPeriodEnding']}) is None:
            result = db.grants.delete_many({'funderEIN': org['ein'],
                                            'fromTaxPeriodEnding': grants_to_create[0]['fromTaxPeriodEnding']})
            result = db.grants.insert_many(grants_to_create)

    return 'OK'
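For reference, a sketch of how a client might call this endpoint; only the x-api-key header and aws_object_id query parameter come from the handler above, while the URL, key, and object ID are placeholders:

import requests

resp = requests.get(
    "https://example.com/process",               # placeholder URL
    headers={"x-api-key": "MY_API_KEY"},         # checked by the handler
    params={"aws_object_id": "201533089349301428"},  # placeholder object ID
)
print(resp.status_code, resp.text)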
Example #4
    def run(self):
        self.xml_runner = XMLRunner()
        self.accumulator = Accumulator()
        while True:
            filing = self.queue.get()
            self.run_filing(filing)
            self.queue.task_done()
        connection.close()  # unreachable: the loop above never exits
Example #5
    def run(self):
        self.xml_runner = XMLRunner()
        self.accumulator = Accumulator()
        while True:
            filing = self.queue.get()
            try:
                self.run_filing(filing)
            except Exception as ex:
                print(ex)
            finally:
                self.queue.task_done()
        connection.close()  # unreachable: the loop above never exits
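Both run() methods follow the standard daemon-worker pattern around queue.Queue. A minimal sketch of how such workers are typically wired up (Worker is a hypothetical name for the class holding run(); the object IDs are placeholders):

import threading
from queue import Queue

work = Queue()
for _ in range(4):
    worker = Worker()  # hypothetical class containing the run() above
    worker.queue = work
    threading.Thread(target=worker.run, daemon=True).start()

for object_id in ["201533089349301428", "201543089349301829"]:  # placeholders
    work.put(object_id)
work.join()  # returns once task_done() has been called for every queued item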
Example #6
    def handle(self, *args, **options):
        self.xml_runner = None
        self.standardizer = Standardizer()
        count = 0

        submissions = XMLSubmission.objects.filter(
            schema_year__gte=2013,
            sub_date__contains='2017').values('taxpayer_name', 'tax_period',
                                              'sub_date', 'object_id')
        for submission in submissions:

            count += 1
            if count % 100 == 0:
                print("Processed %s filings" % count)
                reset_queries()  # not sure this will matter, but...
                self.xml_runner = None  # Erase this to prevent memory leaks

            if not self.xml_runner:
                # will start up faster if we don't have to reread/import csvs
                self.xml_runner = XMLRunner(standardizer=self.standardizer)

            whole_submission = XMLSubmission.objects.get(
                object_id=submission['object_id'])

            if type(whole_submission.as_json) == unicodeType:
                submission_json = json.loads(whole_submission.as_json)
            else:
                # Assume it's a dict; we don't have any "working" installations
                # that return JSON as JSON
                submission_json = whole_submission.as_json

            filingobj = Filing(submission['object_id'], json=submission_json)

            parsedFiling = self.xml_runner.run_from_filing_obj(
                filingobj,
                verbose=False,
            )
            result = parsedFiling.get_result()
            keyerrors = parsedFiling.get_keyerrors()
            has_keyerrors = len(keyerrors) > 0

            try:
                ProcessedFiling.objects.get(object_id=submission['object_id'])
            except ProcessedFiling.DoesNotExist:
                ProcessedFiling.objects.create(
                    ein=whole_submission.ein,
                    object_id=whole_submission.object_id,
                    processed_json=result,
                    keyerrors=keyerrors,
                    has_keyerrors=has_keyerrors,
                    submission=whole_submission)
Example #7
    def __init__(self, object_id, obj_tbl_field_map=None):
        self.object_id = object_id
        self.xml_runner = XMLRunner()
        self.obj_tbl_field_map = obj_tbl_field_map

        self.header_dict = self.process_header_fields()
        self.balance_dict = self.process_balance_fields()
        self.people = self.process_compensation_fields()

        self.failures = {
            'comp': self.people is None,
            'balance': self.balance_dict is None,
            'header': self.header_dict is None
        }
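A hedged usage sketch (FilingParser is an assumed name for the class; only the constructor signature and the failures keys come from the snippet above):

parser = FilingParser("201533089349301428")  # placeholder object ID
if any(parser.failures.values()):
    print("partial parse:", [k for k, v in parser.failures.items() if v])
else:
    print(parser.header_dict, parser.balance_dict, len(parser.people))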
Example #8
def fetch_yr_ind(oid_srch_lst):

    # Should we use IRSx or manual concordance? Set up IRSx if using it.
    # Requires all object IDs in the file to be from the same year
    irsx_flag = int(oid_srch_lst[0][:4]) >= 2015
    xml_runner = XMLRunner() if irsx_flag else None

    # Iterate through Object IDs, collecting rows and concatenating once
    # (DataFrame.append was removed in pandas 2.0)
    rows = [fetch_ind_row(irsx_flag, xml_runner, oid) for oid in oid_srch_lst]
    yr_ind_new = pd.concat(rows)

    yr_ind_new['990_SRC'] = "AWS FILE DIR"

    return yr_ind_new
Example #9
def federal_irs_ingest_get_990s(message, context):

    year = datetime.datetime.today().year

    # settings pulled from a database
    ref = db.collection('federal').document('irs').collection('990s').document(str(year))
    settings = ref.get().to_dict()
    if settings is not None:
        latest_saved_idx = settings['idx']
    else:
        latest_saved_idx = 0

    # prep load
    xml_runner = XMLRunner()
    start_time = time.time()
    bucket = client.get_bucket(gcp_project_id)
    blob = bucket.get_blob('downloads/federal/irs/index_' + str(year) + '.csv')
    blob = blob.download_as_string().decode('utf-8')
    blob = StringIO(blob)

    # load by looping through all the rows in the index
    actions = []
    failed_object_ids = []
    reader = csv.reader(blob, delimiter=',')
    next(reader) # skip header
    for idx, row in enumerate(reader):

        if time.time() - start_time > 520:
            break

        # skip previously indexed objects
        if idx < latest_saved_idx:
            continue

        # process the object id
        object_id = row[8]
        if int(object_id[:4]) < 2014: # can't process these
            continue

        # process the submission date
        sub_date = row[4]
        try:
            sub_date = datetime.datetime.strptime(sub_date, '%m/%d/%Y %I:%M:%S %p')
        except ValueError:
            # fall back to a date-only format; let this raise if neither matches
            sub_date = datetime.datetime.strptime(sub_date, '%m/%d/%Y')

        sub_date = pytz.timezone('US/Eastern').localize(sub_date)
        sub_date = sub_date.strftime("%Y-%m-%dT%H:%M:%S%z")

        # grab the filing
        try:
            filing = xml_runner.run_filing(object_id)
            schedules = filing.get_result()
        except (RuntimeError, InvalidXMLException) as e:
            logger.error('%s: %s', object_id, e)
            failed_object_ids.append(object_id)
            continue

        if schedules is not None:

            xml = utilities.get_xml_parts(schedules)
            xml = utilities.clean_xml(xml)

            if 'IRS990EZ' in xml:
                index = '990ez'
            elif 'IRS990PF' in xml:
                index = '990pf'
            else:
                index = '990'

            actions.append({
                '_op_type': 'index',
                '_index': 'federal_irs_' + index,
                '_id': object_id,
                '_source': {
                    'row': {
                        'return_id': str(row[0]),
                        'filing_type': row[1],
                        'ein': str(row[2]),
                        'tax_period': row[3],
                        'sub_date': sub_date,
                        'taxpayer_name': row[5],
                        'return_type': str(row[6]),
                        'dln': str(row[7]),
                        'object_id': object_id
                    },
                    'obj': xml,
                    'context': {
                        'last_indexed': datetime.datetime.now(datetime.timezone.utc)
                    }
                }
            })

        if len(actions) >= 1000:
            helpers.bulk(es, actions)
            logger.info('ELASTICSEARCH UPDATED' + ' - ' + str(len(actions)) + ' docs')
            actions = []

    # index all docs into elasticsearch
    if actions:
        helpers.bulk(es, actions)
        logger.info('ELASTICSEARCH UPDATED' + ' - ' + str(len(actions)) + ' docs')

    # update Firestore
    update = {
        "idx": idx,
        "last_updated": datetime.datetime.now(datetime.timezone.utc)
    }
    if len(failed_object_ids) > 0:
        update['failed_object_ids'] = firestore.ArrayUnion(failed_object_ids)
    ref.set(update, merge=True)

    num_remaining_rows = len(list(reader))
    logger.info('FIRESTORE UPDATED - completed: ' + str(idx) + ', remaining: ' + str(num_remaining_rows))
    return num_remaining_rows
Example #10
    def __init__(self, output_streams, data_capture_dict, year):
        self.year = year
        self.output_streams = output_streams
        self.data_capture_dict = data_capture_dict
        self.xml_runner = XMLRunner()
        self._init_streams()
Example #11
def filing_990_historical(message, context):
    latest_saved_year = settings['latest_year_file']
    latest_saved_idx = settings['latest_index_in_file']
    failed_object_ids = settings['failed_object_ids']
    if latest_saved_year == 2010:
        return True
    xml_runner = XMLRunner()
    start_time = time.time()
    exit = False
    files = os.listdir('indexes')
    actions = []
    for _file in files:
        if _file != str(latest_saved_year) + '.csv':
            continue
        with open('indexes/' + _file, newline='\n') as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            next(reader)  # skip header
            for idx, row in enumerate(reader):
                if time.time() - start_time > 520:
                    exit = True
                    break
                if idx < latest_saved_idx:
                    continue
                object_id = row[-1]
                try:
                    filing = xml_runner.run_filing(object_id)
                except (RuntimeError, InvalidXMLException) as e:
                    failed_object_ids.append(object_id)
                    continue
                try:
                    schedules = filing.list_schedules()
                    if 'IRS990PF' in schedules:
                        org = org_from_990pf(filing)
                        grants_to_create = grants_from_990pf(filing)
                    elif 'IRS990EZ' in schedules:
                        org = org_from_990ez(filing)
                        grants_to_create = []
                    elif 'IRS990' in schedules:
                        org = org_from_990(filing)
                        grants_to_create = grants_from_990(filing)
                    else:
                        raise RuntimeError('No schedule available to parse.')
                except Exception:  # Exception already covers RuntimeError
                    failed_object_ids.append(object_id)
                    continue
                actions.append({
                    '_op_type': 'index',
                    '_index': 'irs-990-filing',
                    '_id': object_id,
                    '_source': json.dumps({
                        'org': org,
                        'grants': grants_to_create
                    })
                })
            else:
                # for/else: runs only when the file is read through without a break
                latest_saved_year -= 1
        if exit:
            break
    if actions:
        helpers.bulk(es, actions)
        actions = []
        logger.info('ELASTICSEARCH UPDATED')
    settings['latest_year_file'] = latest_saved_year
    settings['latest_index_in_file'] = idx
    settings['failed_object_ids'] = failed_object_ids
    ref.set(settings)
    logger.info('FIRESTORE UPDATED')
    return True
Example #12
    def setup(self):
        # get an XMLRunner -- this is what actually does the parsing
        self.xml_runner = XMLRunner()
        self.accumulator = Accumulator()
Example #13
def analyze990(filing_number):
    xml_runner = XMLRunner()
    parsed_filing = xml_runner.run_filing(filing_number)
    result = parsed_filing.get_csv_result()
    print(result)
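Presumably get_csv_result() returns the parsed filing flattened into CSV-style rows rather than the nested structure from get_result(). A hedged call (the object ID is a placeholder):

analyze990("201533089349301428")  # placeholder AWS object ID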
Example #14
import csv
import pandas as pd
import requests.exceptions
import time
from irsx.xmlrunner import XMLRunner
from irsx.filing import InvalidXMLException

# Works for schemas 2013v3.0 through part of 2016, assuming there's no way a FY2015 filing could use a TY2016 schema.

timestr = time.strftime("%Y-%m-%d-%H-%M")
xml_runner = XMLRunner(documentation=True, csv_format=True)
df = pd.read_csv("files/new_ids.csv")
object_list = list(df["object_id"])

fieldnames = [
    "schema",  #May need to join with the BMF to get the foundation type.
    "object_id",
    "/ReturnHeader/ReturnTs",
    "/ReturnHeader/Filer/EIN",
    #"/ReturnHeader/Filer/Name/BusinessNameLine1", This field has had its name changed multiple times. Just use the name in the BMF.
    "/ReturnHeader/TaxPeriodEndDt",
    "/ReturnHeader/ReturnTypeCd",
    "/ReturnHeader/TaxYr",
    "/IRS990PF/AmendedReturnInd",
    "/IRS990PF/FinalReturnInd",
    "/IRS990PF/StatementsRegardingActyGrp/PrivateOperatingFoundationInd",
    "/IRS990PF/FMVAssetsEOYAmt",
    "/IRS990PF/MethodOfAccountingCashInd",
    "/IRS990PF/AnalysisOfRevenueAndExpenses/OthEmplSlrsWgsRevAndExpnssAmt",
    "/IRS990PF/AnalysisOfRevenueAndExpenses/OthEmplSlrsWgsDsbrsChrtblAmt",
    "/IRS990PF/AnalysisOfRevenueAndExpenses/TotOprExpensesRevAndExpnssAmt",
Example #15
    def handle(self, *args, **options):
        self.xml_runner = None
        #self.fix_connection()
        self.standardizer = Standardizer()
        count = 0
        headers = [
            "taxpayer_name", "ein", "tax_period", "sub_date", "object_id",
            "name", "title", "org_comp", "related_comp", "other_cmp", "form",
            "source"
        ]

        outfile = open("dumptest.csv", 'wb')
        dw = csv.DictWriter(outfile, fieldnames=headers, extrasaction='ignore')
        dw.writeheader()

        submissions = XMLSubmission.objects.filter(
            schema_year__gte=2013,
            sub_date__contains='2017').values('taxpayer_name', 'tax_period',
                                              'sub_date', 'object_id')
        #submissions = XMLSubmission.objects.filter(object_id='201513209349102976').values('taxpayer_name', 'tax_period', 'sub_date', 'object_id')
        #submissions = XMLSubmission.objects.filter(return_type='990PF').values('taxpayer_name', 'tax_period', 'sub_date', 'object_id')
        for submission in submissions:

            count += 1
            if count % 100 == 0:
                print("Processed %s filings" % count)
                reset_queries()  # not sure this will matter, but...
                self.xml_runner = None  # Erase this to prevent memory leaks

            if not self.xml_runner:
                self.xml_runner = XMLRunner(
                    standardizer=self.standardizer
                )  # will start up faster if we don't have to reread/import csvs

            whole_submission = XMLSubmission.objects.get(
                object_id=submission['object_id'])
            assert whole_submission.json_set

            # There's a bug that makes json objects get returned as unicode instead of as dicts
            # similar to this one https://code.djangoproject.com/ticket/27675
            # though django-jsonfield isn't used in this object
            # See to register_json, though that doesn't work in this context
            # http://initd.org/psycopg/docs/extras.html

            if type(whole_submission.as_json) == unicodeType:
                submission_json = json.loads(whole_submission.as_json)
            else:
                # Assume it's a dict? We haven't seen this yet.
                submission_json = whole_submission.as_json

            filingobj = Filing(submission['object_id'], json=submission_json)
            #print("\n\nObject id %s\n" % submission['object_id'])
            #print submission_json

            processedFiling = self.xml_runner.run_from_filing_obj(
                filingobj,
                verbose=False,
            )

            #print ("\n\nProcessed filing is %s" % processedFiling.get_result())

            filing_info = {
                'taxpayer_name': submission['taxpayer_name'],
                'tax_period': submission['tax_period'],
                'sub_date': submission['sub_date']
            }
            schedule_list = processedFiling.list_schedules()
            result = processedFiling.get_result()
            keyerrors = processedFiling.get_keyerrors()
            if keyerrors:
                print("\n\n\n***keyerrors\n\n%s" % keyerrors)

            sked990_list = processedFiling.get_parsed_sked('IRS990')
            sked990EZ_list = processedFiling.get_parsed_sked('IRS990EZ')
            sked990PF_list = processedFiling.get_parsed_sked('IRS990PF')
            sked990J_list = processedFiling.get_parsed_sked('IRS990ScheduleJ')

            if sked990_list:
                #print("\n\t990")
                sked990 = sked990_list[0]
                assert sked990['schedule_name'] == 'IRS990'
                group_name = "Frm990PrtVIISctnA"
                try:
                    employee_list = sked990['groups'][group_name]
                except KeyError:
                    employee_list = []

                for employee in employee_list:
                    #print "\n\n"
                    #print employee
                    this_employee = {
                        'ein': employee['ein'],
                        'object_id': employee['object_id'],
                        'name': employee.get('PrsnNm'),
                        'title': employee.get('TtlTxt'),
                        'org_comp': employee.get('RprtblCmpFrmOrgAmt', 0),
                        'related_comp': employee.get('RprtblCmpFrmRltdOrgAmt',
                                                     0),
                        'other_cmp': employee.get('OthrCmpnstnAmt', 0),
                        'highest_ind': employee.get('HghstCmpnstdEmplyInd'),
                        'form': 'IRS990',
                        'source': 'Frm990PrtVIISctnA'
                    }
                    this_employee.update(filing_info)
                    #print "\n"
                    #print this_employee
                    dw.writerow(this_employee)

            if sked990EZ_list:
                sked990EZ = sked990EZ_list[0]
                #print("\n\t990EZ %s" % sked990EZ['schedule_name'])
                assert sked990EZ['schedule_name'] == 'IRS990EZ'
                group_name = "EZOffcrDrctrTrstEmpl"

                try:
                    employee_list = sked990EZ['groups'][group_name]
                except KeyError:
                    employee_list = []

                for employee in employee_list:
                    #print employee
                    this_employee = {
                        'ein': employee['ein'],
                        'object_id': employee['object_id'],
                        'name': employee.get('PrsnNm', ''),
                        'title': employee.get('TtlTxt', ''),
                        'org_comp': employee.get('CmpnstnAmt', 0),
                        # 'related_comp': NA
                        #'other_cmp': EmplyBnftsAmt + ExpnsAccntAmt ?
                        'form': 'IRS990EZ',
                        'source': 'EZOffcrDrctrTrstEmpl'
                    }
                    this_employee.update(filing_info)
                    #print this_employee
                    dw.writerow(this_employee)

                ##

                group_name = "EZCmpnstnHghstPdEmpl"  # This is very rare
                try:
                    employee_list = sked990EZ['groups'][group_name]
                except KeyError:
                    employee_list = []

                for employee in employee_list:

                    this_employee = {
                        'ein': employee['ein'],
                        'object_id': employee['object_id'],
                        'name': employee.get('PrsnNm'),
                        'title': employee.get('TtlTxt'),
                        'org_comp': employee.get('CmpnstnAmt'),
                        # 'related_comp': NA
                        #'other_cmp': EmplyBnftsAmt + ExpnsAccntAmt ?
                        'form': 'IRS990EZ',
                        'source': 'EZCmpnstnHghstPdEmpl'
                    }
                    this_employee.update(filing_info)
                    print "\nEZ"
                    print employee
                    print this_employee
                    dw.writerow(this_employee)

            if sked990PF_list:
                sked990PF = sked990PF_list[0]
                #print("\n\t990PF %s" % sked990PF['schedule_name'])
                assert sked990PF['schedule_name'] == 'IRS990PF'

                group_name = "PFOffcrDrTrstKyEmpl"
                employee_list = []
                try:
                    employee_list = sked990PF['groups'][group_name]
                except KeyError:
                    pass

                for employee in employee_list:
                    #print "\n\n"
                    #print employee
                    this_employee = {
                        'ein': employee['ein'],
                        'object_id': employee['object_id'],
                        'name': employee.get('OffcrDrTrstKyEmpl_PrsnNm'),
                        'title': employee.get('OffcrDrTrstKyEmpl_TtlTxt'),
                        'org_comp':
                        employee.get('OffcrDrTrstKyEmpl_CmpnstnAmt'),
                        # 'related_comp': NA
                        #'other_cmp': OffcrDrTrstKyEmpl_EmplyBnftPrgrmAmt + OffcrDrTrstKyEmpl_ExpnsAccntOthrAllwncAmt ?
                        'form': 'IRS990PF',
                        'source': 'PFOffcrDrTrstKyEmpl'
                    }
                    this_employee.update(filing_info)
                    #print "\n"
                    #print this_employee
                    dw.writerow(this_employee)

                group_name = "PFCmpnstnHghstPdEmpl"  # also rare
                employee_list = []
                try:
                    employee_list = sked990PF['groups'][group_name]
                except KeyError:
                    pass

                for employee in employee_list:
                    #print employee
                    this_employee = {
                        'ein': employee['ein'],
                        'object_id': employee['object_id'],
                        'name': employee.get('CmpnstnHghstPdEmpl_PrsnNm'),
                        'title': employee.get('CmpnstnHghstPdEmpl_TtlTxt'),
                        'org_comp':
                        employee.get('CmpnstnHghstPdEmpl_CmpnstnAmt'),
                        # 'related_comp': NA
                        #'other_cmp': CmpnstnHghstPdEmpl_EmplyBnftsAmt + CmpnstnHghstPdEmpl_ExpnsAccntAmt ?
                        'form': 'IRS990PF',
                        'source': 'PFCmpnstnHghstPdEmpl'
                    }
                    this_employee.update(filing_info)
                    #print "\n"
                    #print this_employee
                    dw.writerow(this_employee)

            if sked990J_list:
                sked990J = sked990J_list[0]
                #print("\n\t990J %s" % sked990J['schedule_name'])
                assert sked990J['schedule_name'] == 'IRS990ScheduleJ'

                group_name = "SkdJRltdOrgOffcrTrstKyEmpl"
                employee_list = []
                try:
                    employee_list = sked990J['groups'][group_name]
                except KeyError:
                    pass

                for employee in employee_list:
                    #print "\n\n sked J"
                    #print employee
                    this_employee = {
                        'ein': employee['ein'],
                        'object_id': employee['object_id'],
                        'name': employee.get('PrsnNm'),
                        'bus_line_1': employee.get('BsnssNmLn1Txt'),
                        'title': employee.get('TtlTxt'),
                        'org_comp': employee.get('TtlCmpnstnFlngOrgAmt'),
                        'related_comp': employee.get('TtlCmpnstnRltdOrgsAmt'),
                        #'other_cmp': OffcrDrTrstKyEmpl_EmplyBnftPrgrmAmt + OffcrDrTrstKyEmpl_ExpnsAccntOthrAllwncAmt ?
                        'form': 'IRS990ScheduleJ',
                        'source': 'SkdJRltdOrgOffcrTrstKyEmpl'
                    }
                    this_employee.update(filing_info)
                    #print "\n"
                    #print this_employee
                    dw.writerow(this_employee)

        print("Total of %s processed" % count)
Example #16
def filing_990_historical(message, context):
    today = datetime.datetime.today()
    year = today.year
    download_current_year_index(year)
    latest_saved_year = settings['latest_year_file']
    latest_saved_idx = settings['latest_index_in_file']
    failed_object_ids = settings['failed_object_ids']
    xml_runner = XMLRunner()
    start_time = time.time()
    actions = []
    with open('/tmp/' + str(year) + '.csv', newline='\n') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader)  # skip header
        for idx, row in enumerate(reader):
            if time.time() - start_time > 520:
                break
            if idx < latest_saved_idx:
                continue
            object_id = row[-2]
            if object_id in saved_object_ids or object_id in failed_object_ids:
                continue
            try:
                filing = xml_runner.run_filing(object_id)
            except (RuntimeError, InvalidXMLException) as e:
                failed_object_ids.append(object_id)
                continue
            try:
                schedules = filing.list_schedules()
                if 'IRS990PF' in schedules:
                    org = org_from_990pf(filing)
                    grants_to_create = grants_from_990pf(filing)
                elif 'IRS990EZ' in schedules:
                    org = org_from_990ez(filing)
                    grants_to_create = []
                elif 'IRS990' in schedules:
                    org = org_from_990(filing)
                    grants_to_create = grants_from_990(filing)
                else:
                    raise RuntimeError('No schedule available to parse.')
            except Exception:  # Exception already covers RuntimeError
                failed_object_ids.append(object_id)
                continue
            actions.append({
                '_op_type': 'index',
                '_index': 'irs-990-filing',
                '_id': object_id,
                '_source': json.dumps({
                    'org': org,
                    'grants': grants_to_create
                })
            })
        else:
            # for/else: runs only when the file is read through without a break;
            # compare dates, since a datetime never equals a date
            if today.date() == datetime.date(year, 12, 31):
                latest_saved_year += 1
    if actions:
        helpers.bulk(es, actions)
        actions = []
        logger.info('ELASTICSEARCH UPDATED')
    settings['latest_year_file'] = latest_saved_year
    settings['latest_index_in_file'] = idx
    settings['failed_object_ids'] = failed_object_ids
    ref.set(settings)
    logger.info('FIRESTORE UPDATED')
    return True