Example #1
0
def delete_990s_by_year(year, dynamodb=None):
    """Delete every 990 record for ``year`` from the ``TABLE_990S`` table.

    Matching records are collected by querying the ``year-index`` index,
    following ``LastEvaluatedKey`` until every page has been read, and are
    then deleted through a single batch writer.

    Args:
        year: Value compared against the ``year`` key of ``year-index``.
        dynamodb: Optional DynamoDB resource. When omitted, one is created
            by calling ``client()``.

    Raises:
        botocore.exceptions.ClientError: Logged and re-raised when a
            DynamoDB request fails.
    """
    if dynamodb is None:
        dynamodb = client()

    try:
        table = dynamodb.Table(TABLE_990S)
        query_args = {
            'IndexName': "year-index",
            'KeyConditionExpression': Key('year').eq(year),
        }

        response = table.query(**query_args)
        records = list(response['Items'])
        # Keep paging with the *same* keyed query. Continuing with an
        # unfiltered scan() would not be restricted to the requested year.
        while 'LastEvaluatedKey' in response:
            response = table.query(
                ExclusiveStartKey=response['LastEvaluatedKey'],
                **query_args)
            records.extend(response['Items'])

        # The batch writer's delete_item() expects only the primary key
        # attributes (``Key=``), so reduce each item to the table's key.
        key_names = [attr['AttributeName'] for attr in table.key_schema]

        # A single writer is enough: it buffers the requests and sends
        # them to DynamoDB in batches on its own.
        with table.batch_writer() as writer:
            for item in records:
                writer.delete_item(
                    Key={name: item[name] for name in key_names})

    except botocore.exceptions.ClientError as e:
        logger.error(e.response['Error']['Message'])
        logger.error("Failed deleting records for year: {}".format(year))
        raise
Example #2
0
def add_990s(records=None, dynamodb=None):
    """Insert ``records`` into the ``TABLE_990S`` table with a batch writer.

    Args:
        records: Iterable of items to put. ``None`` (the default) is
            treated as an empty collection.
        dynamodb: Optional DynamoDB resource. When omitted, one is created
            by calling ``client()``.

    Returns:
        ``True`` when every item was written without a ``ClientError``;
        ``None`` when a ``ConditionalCheckFailedException`` was ignored.

    Raises:
        botocore.exceptions.ClientError: Any client error other than
            ``ConditionalCheckFailedException``, after it has been logged.
    """
    # Avoid a shared mutable default argument.
    if records is None:
        records = []

    if dynamodb is None:
        dynamodb = client()

    try:
        table = dynamodb.Table(TABLE_990S)
        with table.batch_writer() as writer:
            for item in records:
                writer.put_item(Item=item)
    except botocore.exceptions.ClientError as e:
        # Ignore the ConditionalCheckFailedException, bubble up
        # other exceptions.
        if e.response['Error']['Code'] != 'ConditionalCheckFailedException':
            logger.error(e.response['Error']['Message'])
            logger.error("Failed inserting: {}".format(records))
            raise
        return None
    else:
        return True
Example #3
0
def get_990s(key, value, index=None, projected_attrs=None, dynamodb=None):
    """Query the ``TABLE_990S`` table for items whose ``key`` equals ``value``.

    All result pages are fetched: the query is repeated with
    ``ExclusiveStartKey`` for as long as the response carries a
    ``LastEvaluatedKey``.

    Args:
        key: Name of the key attribute used in the key condition.
        value: Value the key attribute must equal.
        index: Optional index name to query instead of the base table.
        projected_attrs: Attribute names to return. Defaults to
            ``DEFAULT_PROJECTED_ATTRS``; an empty collection disables the
            projection expression.
        dynamodb: Optional DynamoDB resource. When omitted, one is created
            by calling ``client()``.

    Returns:
        A list of matching items (possibly empty), or ``None`` when the
        query raised a ``ClientError`` (the error message is logged).
    """
    if projected_attrs is None:
        projected_attrs = DEFAULT_PROJECTED_ATTRS

    if dynamodb is None:
        dynamodb = client()

    table = dynamodb.Table(TABLE_990S)

    try:
        query_attrs = {'KeyConditionExpression': Key(key).eq(value)}
        if index is not None:
            query_attrs['IndexName'] = index

        if len(projected_attrs) > 0:
            expression_attr_names = {}
            if 'year' in projected_attrs:
                # ``year`` is a DynamoDB reserved word, so it has to be
                # referenced through an expression attribute name.
                projected_attrs = [
                    '#yr' if attr == 'year' else attr
                    for attr in projected_attrs]
                expression_attr_names['#yr'] = 'year'

            query_attrs['ProjectionExpression'] = ','.join(
                str(attr) for attr in projected_attrs)

            if expression_attr_names:
                query_attrs['ExpressionAttributeNames'] = expression_attr_names

        response = table.query(**query_attrs)

        # Kept for backward compatibility: a single-item payload is
        # returned wrapped in a list.
        if 'Item' in response:
            return [response['Item']]

        items = list(response.get('Items', []))
        # A single query call returns one page only; keep going until
        # DynamoDB stops handing back a continuation key.
        while 'LastEvaluatedKey' in response:
            response = table.query(
                ExclusiveStartKey=response['LastEvaluatedKey'],
                **query_attrs)
            items.extend(response.get('Items', []))
        return items
    except botocore.exceptions.ClientError as e:
        logger.error(e.response['Error']['Message'])
        return None
def lookup_990s_by_ein(ein):
    """Build a DataFrame describing the 990 returns stored for ``ein``.

    The returns are fetched with ``get_990s_by_ein``. For each one, the
    public XML document is loaded with ``load_990`` and the filer name,
    tax year and tax period dates are read from its ``ReturnHeader``.

    Args:
        ein: EIN as a string; surrounding whitespace is stripped. It must
            be at least 9 characters long and pass ``is_integer``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``ein``, ``name``,
        ``tax_period``, ``return_id``, ``return_type``, ``filing_name``,
        ``filing_url``, ``filing_date_start`` and ``filing_date_end``.
        It is empty when the EIN is rejected or nothing could be loaded.
        If processing a return raises, the error is logged and the rows
        gathered up to that point are returned.
    """
    ein = ein.strip()

    columns = [
        'ein',
        'name',
        'tax_period',
        'return_id',
        'return_type',
        'filing_name',
        'filing_url',
        'filing_date_start',
        'filing_date_end']
    irs_returns = pd.DataFrame(columns=columns)

    if len(ein) < 9:
        logger.info("EIN must be 9 characters or longer")
        return irs_returns

    if not is_integer(ein):
        logger.info("EIN must be an integer")
        return irs_returns

    ein = int(ein)

    items = get_990s_by_ein(
        ein,
        projected_attrs=[
            'id',
            'ein',
            'year',
            'return_type',
            'tax_period',
            'object_id',
            'taxpayer_name',
            'sub_date'])

    xml_ns = {'': 'http://www.irs.gov/efile'}

    # One single-row frame per return, joined once after the loop.
    # DataFrame.append() no longer exists in pandas 2.x.
    row_frames = []

    try:
        for item in items:
            xml_url = "https://s3.amazonaws.com/irs-form-990/{}_public.xml".format(
                item['object_id'])
            form_990 = load_990(xml_url)

            root = ElementTree.fromstring(form_990)
            return_header = root.find('ReturnHeader', xml_ns)
            filer_details = return_header.find('Filer', xml_ns)

            tax_year = return_header.find('TaxYr', xml_ns).text

            r_name = filer_details.find(
                'BusinessName', xml_ns).find(
                'BusinessNameLine1Txt', xml_ns).text
            r_return_id = item['id']
            r_return_type = item['return_type']
            r_tax_period = item['tax_period']
            r_filing_name = "{} Form {} Filing".format(
                tax_year, item['return_type'])
            r_filing_url = "https://apps.irs.gov/pub/epostcard/cor/{}_{}_{}_{}{}.pdf".format(
                ein,
                item['tax_period'],
                item['return_type'],
                dateparser.parse(item['sub_date']).strftime('%Y%m%d'),
                item['id'])

            # Tax period boundaries are normalised to MM-DD-YYYY.
            r_filing_date_start = dateparser.parse(
                return_header.find('TaxPeriodBeginDt', xml_ns).text
            ).strftime('%m-%d-%Y')
            r_filing_date_end = dateparser.parse(
                return_header.find('TaxPeriodEndDt', xml_ns).text
            ).strftime('%m-%d-%Y')

            row_frames.append(pd.DataFrame([[ein,
                                             r_name,
                                             r_tax_period,
                                             r_return_id,
                                             r_return_type,
                                             r_filing_name,
                                             r_filing_url,
                                             r_filing_date_start,
                                             r_filing_date_end]],
                                           columns=columns))

            logger.info("Finished {}".format(ein))
    except Exception as err:
        # Best effort: report the failure and return what was collected.
        traceback.print_exc()
        logger.error(err)

    if row_frames:
        irs_returns = pd.concat(row_frames)

    return irs_returns
def _flatten_return_details(tags):
    """Convert a chunk of filing HTML into a flat list of non-empty strings.

    Elements that have children are expanded recursively; for an ``<a>``
    element the absolute ``https://apps.irs.gov`` link is emitted before
    its children. Leaves contribute their stripped text.
    """
    values = []
    for tag in tags:
        is_text = isinstance(tag, bs4.NavigableString)
        if not is_text and len(list(tag.children)) > 0:
            if isinstance(tag, bs4.Tag) and tag.name == 'a':
                values.append('https://apps.irs.gov{}'.format(
                    tag.attrs['href']))
            values += _flatten_return_details(list(tag.children))
        else:
            if is_text:
                value = str(tag).strip()
            else:
                value = tag.getText().strip()

            if value != "":
                values.append(value)
    return values


def _extract_filing_date(pattern, text):
    """Return the first date ``pattern`` finds in ``text`` as MM-DD-YYYY.

    ``pattern`` must expose the three date parts as groups 1-3. Returns
    ``None`` when the pattern does not match.
    """
    match = pattern.search(text)
    if match is None:
        return None

    raw_date = "{}-{}-{}".format(
        match.group(1).replace(' ', '').strip(),
        match.group(2).replace(' ', '').strip(),
        match.group(3).replace(' ', '').strip())
    raw_date = re.sub(r"\s", "", raw_date)
    return dateparser.parse(raw_date).strftime('%m-%d-%Y')


def lookup_990s_by_ein(ein, ignore_tax_periods=None):
    """Scrape the IRS copy-of-returns page for ``ein`` into a DataFrame.

    Each ``copyOfReturns`` element on the results page becomes one row.
    When a return has a filing link, the filing is loaded with
    ``load_990`` and its beginning/ending dates are extracted from the
    text with regular expressions.

    Args:
        ein: EIN as a string; surrounding whitespace is stripped. An empty
            frame is returned when it is shorter than 9 characters.
        ignore_tax_periods: Optional collection of tax period values to
            skip. ``None`` (the default) skips nothing.

    Returns:
        A ``pandas.DataFrame`` with the columns ``ein``, ``name``,
        ``tax_period``, ``return_id``, ``return_type``, ``filing_name``,
        ``filing_url``, ``filing_date_start`` and ``filing_date_end``.
        If processing a row raises, the error is logged and the rows
        gathered up to that point are returned.
    """
    ein = ein.strip()

    columns = [
        'ein', 'name', 'tax_period', 'return_id', 'return_type', 'filing_name',
        'filing_url', 'filing_date_start', 'filing_date_end'
    ]
    irs_returns = pd.DataFrame(columns=columns)

    if len(ein) < 9:  # EIN has to be at least 9 characters
        return irs_returns

    # Avoid a shared mutable default argument.
    if ignore_tax_periods is None:
        ignore_tax_periods = []

    logger.info("Loading {}'s 990 data".format(ein))

    url = 'https://apps.irs.gov/app/eos/displayCopyOfReturns.do?dispatchMethod=displayCORInfo&CopyOfReturnId=1211739&ein={}&country=US&deductibility=all&dispatchMethod=searchCopyOfReturns&isDescending=false&city=&ein1={}&postDateFrom=&exemptTypeCode=al&submitName=Search&sortColumn=orgName&totalResults=1&names=&resultsPerPage=25&indexOfFirstRow=0&postDateTo=&state=All+States'.format(
        ein, ein)
    page = requests.get(url)
    soup = bs4.BeautifulSoup(page.text, 'html.parser')

    result_rows = soup.find_all(id=re.compile(r"copyOfReturns\d*"))

    # The date patterns do not depend on the row, so build them once.
    date_reg = r"((?:(?:\d{1,2}|j(?:an(?:uary)?|(?:u(?:ne|n|ly|l))))|(?:feb(?:ruary)?)|(?:mar(?:ch)?)|(?:may)|(?:a(?:pr(?:il)?|ug(?:ust)?))|(?:(?:sep(?:t)?|nov|dec)(?:em(?:ber)?)?)|(?:oct(?:ober|ob)?))).*?(\d{1,2}).*?(\d{2}\W\d{2}|(?:\d{4}|\d{2}[^\d\%]))"
    filing_date_beginning_regex = re.compile(
        "beg(?:i?n?n?i?n?g?)?.*?" + date_reg,
        flags=re.IGNORECASE | re.S)
    filing_date_ending_regex = re.compile(
        r"(?:^|\s){1}endi(?:n?g?)?.*?" + date_reg,
        flags=re.IGNORECASE | re.S)

    # One single-row frame per return, joined once after the loop.
    # DataFrame.append() no longer exists in pandas 2.x.
    row_frames = []

    try:
        for result_row in result_rows:
            details = _flatten_return_details(
                result_row.find_all(['b', 'br', 'span', 'a']))

            r_name = r_tax_period = r_return_id = r_return_type = None
            r_filing_name = r_filing_url = None
            r_filing_date_start = r_filing_date_end = None

            # Walk the flattened values: a recognised label is followed
            # by its value(s). Tax period and return type keep their
            # first occurrence only.
            ii = 0
            while ii < len(details):
                label = details[ii].lower()

                if label == 'organization name:':
                    r_name = details[ii + 1]
                    ii += 1
                    continue

                elif r_tax_period is None and label == 'tax period:':
                    r_tax_period = details[ii + 1]
                    ii += 1
                    continue

                elif label == 'return id:':
                    r_return_id = details[ii + 1]
                    ii += 1
                    continue

                elif label == 'copy of return:':
                    r_filing_url = details[ii + 1]
                    r_filing_name = details[ii + 2]
                    ii += 2
                    continue

                elif r_return_type is None and label == 'return type:':
                    r_return_type = details[ii + 1]
                    ii += 1
                    continue

                ii += 1

            if r_tax_period in ignore_tax_periods:
                logger.warning("Skipping {}'s Tax Period '{}'".format(
                    ein, r_tax_period))
                continue

            if r_filing_url is not None:
                logger.info("Fetching {}'s '{}' (ReturnId={})".format(
                    ein, r_filing_name, r_return_id))
                text_990 = load_990(r_filing_url)

                r_filing_date_start = _extract_filing_date(
                    filing_date_beginning_regex, text_990)
                r_filing_date_end = _extract_filing_date(
                    filing_date_ending_regex, text_990)

            row_frames.append(pd.DataFrame([[
                ein, r_name, r_tax_period, r_return_id, r_return_type,
                r_filing_name, r_filing_url, r_filing_date_start,
                r_filing_date_end
            ]],
                                           columns=columns))

        logger.info("Finished {}".format(ein))
    except Exception as err:
        # Best effort: report the failure and return what was collected.
        traceback.print_exc()
        logger.error(err)

    if row_frames:
        irs_returns = pd.concat(row_frames)

    # NOTE: these conversions run outside the try block, so a value that
    # cannot be converted to int propagates to the caller.
    irs_returns.ein = irs_returns.ein.astype(int)
    irs_returns.return_id = irs_returns.return_id.astype(int)

    return irs_returns