def delete_990s_by_year(year, dynamodb=None):
    """Delete every 990 record for ``year`` from the ``TABLE_990S`` table.

    Matching records are found by querying the ``year-index`` index,
    following pagination until the result set is exhausted, and are then
    removed through a single batch writer.

    Args:
        year: Value matched against the ``year`` key of ``year-index``.
        dynamodb: DynamoDB service resource to use.  When ``None``, one is
            obtained from ``client()``.

    Raises:
        botocore.exceptions.ClientError: Re-raised, after logging, when a
            query or delete request fails.
    """
    if dynamodb is None:
        dynamodb = client()
    try:
        table = dynamodb.Table(TABLE_990S)
        # A delete request addresses an item by its primary key only, so
        # every matched record is reduced to the table's key attributes.
        key_names = [element['AttributeName'] for element in table.key_schema]

        query_args = {
            'IndexName': "year-index",
            'KeyConditionExpression': Key('year').eq(year),
        }
        keys = []
        while True:
            response = table.query(**query_args)
            keys.extend(
                {name: item[name] for name in key_names}
                for item in response['Items'])
            if 'LastEvaluatedKey' not in response:
                break
            # Resume the *same* query (same index and key condition) from
            # where the previous page stopped; a bare scan here would walk
            # records that do not belong to this year.
            query_args['ExclusiveStartKey'] = response['LastEvaluatedKey']

        # One writer for the whole run so the deletes are actually batched.
        with table.batch_writer() as writer:
            for key in keys:
                writer.delete_item(Key=key)
    except botocore.exceptions.ClientError as e:
        logger.error(e.response['Error']['Message'])
        logger.error("Failed deleting records for year: {}".format(year))
        raise
def add_990s(records=None, dynamodb=None):
    """Insert 990 records into the ``TABLE_990S`` table via a batch writer.

    Args:
        records: Iterable of item dicts to write.  ``None`` (the default) is
            treated as "nothing to write".
        dynamodb: DynamoDB service resource to use.  When ``None``, one is
            obtained from ``client()``.

    Returns:
        ``True`` when all records were written without a client error;
        ``None`` when a ``ConditionalCheckFailedException`` was ignored.

    Raises:
        botocore.exceptions.ClientError: Re-raised, after logging, for any
            client error other than ``ConditionalCheckFailedException``.
    """
    if records is None:
        records = []
    if dynamodb is None:
        dynamodb = client()
    try:
        table = dynamodb.Table(TABLE_990S)
        with table.batch_writer() as writer:
            for item in records:
                writer.put_item(Item=item)
    except botocore.exceptions.ClientError as e:
        # Ignore the ConditionalCheckFailedException, bubble up
        # other exceptions.
        if e.response['Error']['Code'] != 'ConditionalCheckFailedException':
            logger.error(e.response['Error']['Message'])
            logger.error("Failed inserting: {}".format(records))
            raise
        return None
    return True
def get_990s(key, value, index=None, projected_attrs=None, dynamodb=None):
    """Query the ``TABLE_990S`` table for items whose ``key`` equals ``value``.

    All result pages are fetched: while a response carries a
    ``LastEvaluatedKey`` the query is re-issued from that key, so the returned
    list is not cut off at the first page.

    Args:
        key: Name of the key attribute used in the key condition.
        value: Value the key attribute must equal.
        index: Optional index name to query instead of the base table.
        projected_attrs: Attribute names to return.  ``None`` selects
            ``DEFAULT_PROJECTED_ATTRS``; an empty sequence requests no
            projection expression at all.
        dynamodb: DynamoDB service resource to use.  When ``None``, one is
            obtained from ``client()``.

    Returns:
        A list of item dicts (possibly empty), or ``None`` when a
        ``ClientError`` occurred; the error message is logged in that case.
    """
    if projected_attrs is None:
        projected_attrs = DEFAULT_PROJECTED_ATTRS
    if dynamodb is None:
        dynamodb = client()
    table = dynamodb.Table(TABLE_990S)

    query_attrs = {'KeyConditionExpression': Key(key).eq(value)}
    if index is not None:
        query_attrs['IndexName'] = index
    if len(projected_attrs) > 0:
        expression_attr_names = {}
        if 'year' in projected_attrs:
            # ``year`` is a DynamoDB reserved word, so it has to be referenced
            # through a placeholder in the projection expression.
            projected_attrs = [
                '#yr' if attr == 'year' else attr for attr in projected_attrs]
            expression_attr_names['#yr'] = 'year'
        query_attrs['ProjectionExpression'] = ','.join(
            str(attr) for attr in projected_attrs)
        if expression_attr_names:
            query_attrs['ExpressionAttributeNames'] = expression_attr_names

    items = []
    try:
        while True:
            response = table.query(**query_attrs)
            if 'Item' in response:
                items.append(response['Item'])
            elif 'Items' in response:
                items.extend(response['Items'])
            if 'LastEvaluatedKey' not in response:
                break
            query_attrs['ExclusiveStartKey'] = response['LastEvaluatedKey']
    except botocore.exceptions.ClientError as e:
        logger.error(e.response['Error']['Message'])
        return None
    return items
def lookup_990s_by_ein(ein):
    """Build a DataFrame describing the 990 filings stored for an EIN.

    The filings are read with ``get_990s_by_ein``; for each one the public
    XML return is loaded with ``load_990`` and the filer name, tax year and
    tax-period dates are taken from its ``ReturnHeader`` element.

    Args:
        ein: EIN as a string; surrounding whitespace is stripped.

    Returns:
        A DataFrame with the columns ``ein``, ``name``, ``tax_period``,
        ``return_id``, ``return_type``, ``filing_name``, ``filing_url``,
        ``filing_date_start`` and ``filing_date_end``.  It is empty when the
        EIN is shorter than 9 characters or is not an integer.  If processing
        a filing raises, the error is logged and the rows collected up to
        that point are returned.
    """
    # NOTE(review): a second ``def lookup_990s_by_ein`` appears later in this
    # module and rebinds the name at import time - confirm which of the two
    # definitions is meant to be live.
    ein = ein.strip()
    columns = [
        'ein', 'name', 'tax_period', 'return_id', 'return_type',
        'filing_name', 'filing_url', 'filing_date_start', 'filing_date_end']
    irs_returns = pd.DataFrame(columns=columns)
    if len(ein) < 9:
        logger.info("EIN must be 9 characters or longer")
        return irs_returns
    if not is_integer(ein):
        logger.info("EIN must be an integer")
        return irs_returns
    ein = int(ein)
    items = get_990s_by_ein(
        ein,
        projected_attrs=[
            'id', 'ein', 'year', 'return_type', 'tax_period', 'object_id',
            'taxpayer_name', 'sub_date'])
    xml_ns = {'': 'http://www.irs.gov/efile'}

    # One single-row frame per filing, concatenated once at the end;
    # ``DataFrame.append`` no longer exists in pandas 2.x.
    row_frames = []
    try:
        for item in items:
            xml_url = "https://s3.amazonaws.com/irs-form-990/{}_public.xml".format(
                item['object_id'])
            form_990 = load_990(xml_url)
            root = ElementTree.fromstring(form_990)
            return_header = root.find('ReturnHeader', xml_ns)
            filer_details = return_header.find('Filer', xml_ns)
            tax_year = return_header.find('TaxYr', xml_ns).text
            r_name = filer_details.find(
                'BusinessName', xml_ns).find(
                    'BusinessNameLine1Txt', xml_ns).text
            r_return_id = item['id']
            r_return_type = item['return_type']
            r_tax_period = item['tax_period']
            r_filing_name = "{} Form {} Filing".format(
                tax_year, item['return_type'])
            r_filing_url = "https://apps.irs.gov/pub/epostcard/cor/{}_{}_{}_{}{}.pdf".format(
                ein,
                item['tax_period'],
                item['return_type'],
                dateparser.parse(item['sub_date']).strftime('%Y%m%d'),
                item['id'])
            r_filing_date_start = dateparser.parse(
                return_header.find('TaxPeriodBeginDt', xml_ns).text
            ).strftime('%m-%d-%Y')
            r_filing_date_end = dateparser.parse(
                return_header.find('TaxPeriodEndDt', xml_ns).text
            ).strftime('%m-%d-%Y')
            row_frames.append(pd.DataFrame(
                [[ein, r_name, r_tax_period, r_return_id, r_return_type,
                  r_filing_name, r_filing_url, r_filing_date_start,
                  r_filing_date_end]],
                columns=columns))
        logger.info("Finished {}".format(ein))
    except Exception as err:
        traceback.print_exc()
        logger.error(err)

    if row_frames:
        irs_returns = pd.concat(row_frames)
    return irs_returns
def _flatten_return_tags(tags):
    """Convert a chunk of filing HTML into a simple list of non-empty values.

    Tags with children are expanded recursively; an ``a`` tag additionally
    contributes its ``href`` (prefixed with the IRS host) ahead of its text.
    """
    values = []
    for tag in tags:
        is_text = isinstance(tag, bs4.NavigableString)
        if not is_text and len(list(tag.children)) > 0:
            if isinstance(tag, bs4.Tag) and tag.name == 'a':
                values.append('https://apps.irs.gov{}'.format(
                    tag.attrs['href']))
            values += _flatten_return_tags(list(tag.children))
        else:
            value = str(tag).strip() if is_text else tag.getText().strip()
            if value != "":
                values.append(value)
    return values


def _read_return_details(details):
    """Pick the labelled values out of a flattened details list.

    Returns the tuple ``(name, tax_period, return_id, return_type,
    filing_name, filing_url)``; entries whose label was not seen are ``None``.
    Only the first ``tax period:`` and ``return type:`` values are kept.
    """
    name = tax_period = return_id = return_type = None
    filing_name = filing_url = None
    idx = 0
    while idx < len(details):
        label = details[idx].lower()
        step = 1
        if label == 'organization name:':
            name = details[idx + 1]
        elif tax_period is None and label == 'tax period:':
            tax_period = details[idx + 1]
        elif label == 'return id:':
            return_id = details[idx + 1]
        elif label == 'copy of return:':
            filing_url = details[idx + 1]
            filing_name = details[idx + 2]
            step = 2
        elif return_type is None and label == 'return type:':
            return_type = details[idx + 1]
        idx += step
    return name, tax_period, return_id, return_type, filing_name, filing_url


def _extract_filing_date(pattern, text):
    """Return the first date matched by ``pattern`` as ``MM-DD-YYYY``.

    Returns ``None`` when ``pattern`` does not match ``text``.
    """
    match = re.search(pattern, text)
    if match is None:
        return None
    raw_date = "{}-{}-{}".format(
        match.group(1).replace(' ', '').strip(),
        match.group(2).replace(' ', '').strip(),
        match.group(3).replace(' ', '').strip())
    raw_date = re.sub(r"\s", "", raw_date)
    return dateparser.parse(raw_date).strftime('%m-%d-%Y')


def lookup_990s_by_ein(ein, ignore_tax_periods=None):
    """Scrape the IRS "copy of returns" page for an EIN into a DataFrame.

    Each ``copyOfReturns`` element on the result page is flattened into
    label/value pairs.  For every return that has a copy-of-return link, the
    filing is loaded with ``load_990`` and the beginning/ending dates of the
    filing period are extracted from its text with regular expressions.

    Args:
        ein: EIN as a string; surrounding whitespace is stripped.
        ignore_tax_periods: Optional collection of tax-period values whose
            returns are skipped.  ``None`` (the default) skips nothing.

    Returns:
        A DataFrame with the columns ``ein``, ``name``, ``tax_period``,
        ``return_id``, ``return_type``, ``filing_name``, ``filing_url``,
        ``filing_date_start`` and ``filing_date_end``.  ``ein`` and
        ``return_id`` are cast to ``int``.  The frame is returned empty, and
        without the cast, when the EIN is shorter than 9 characters.  If
        processing a return raises, the error is logged and the rows collected
        up to that point are returned.
    """
    if ignore_tax_periods is None:
        ignore_tax_periods = ()
    ein = ein.strip()
    columns = [
        'ein', 'name', 'tax_period', 'return_id', 'return_type',
        'filing_name', 'filing_url', 'filing_date_start', 'filing_date_end'
    ]
    irs_returns = pd.DataFrame(columns=columns)
    if len(ein) < 9:
        # EIN has to be at least 9 characters
        return irs_returns

    logger.info("Loading {}'s 990 data".format(ein))
    url = 'https://apps.irs.gov/app/eos/displayCopyOfReturns.do?dispatchMethod=displayCORInfo&CopyOfReturnId=1211739&ein={}&country=US&deductibility=all&dispatchMethod=searchCopyOfReturns&isDescending=false&city=&ein1={}&postDateFrom=&exemptTypeCode=al&submitName=Search&sortColumn=orgName&totalResults=1&names=&resultsPerPage=25&indexOfFirstRow=0&postDateTo=&state=All+States'.format(
        ein, ein)
    page = requests.get(url)
    soup = bs4.BeautifulSoup(page.text, 'html.parser')
    rows = soup.find_all(id=re.compile(r"copyOfReturns\d*"))

    # The date patterns do not depend on the row, so compile them once.
    date_reg = r"((?:(?:\d{1,2}|j(?:an(?:uary)?|(?:u(?:ne|n|ly|l))))|(?:feb(?:ruary)?)|(?:mar(?:ch)?)|(?:may)|(?:a(?:pr(?:il)?|ug(?:ust)?))|(?:(?:sep(?:t)?|nov|dec)(?:em(?:ber)?)?)|(?:oct(?:ober|ob)?))).*?(\d{1,2}).*?(\d{2}\W\d{2}|(?:\d{4}|\d{2}[^\d\%]))"
    filing_date_beginning_regex = re.compile(
        "beg(?:i?n?n?i?n?g?)?.*?" + date_reg,
        flags=re.IGNORECASE | re.S)
    filing_date_ending_regex = re.compile(
        r"(?:^|\s){1}endi(?:n?g?)?.*?" + date_reg,
        flags=re.IGNORECASE | re.S)

    # One single-row frame per return, concatenated once at the end;
    # ``DataFrame.append`` no longer exists in pandas 2.x.
    row_frames = []
    try:
        for row in rows:
            details = _flatten_return_tags(
                row.find_all(['b', 'br', 'span', 'a']))
            (r_name, r_tax_period, r_return_id, r_return_type,
             r_filing_name, r_filing_url) = _read_return_details(details)

            if r_tax_period in ignore_tax_periods:
                logger.warning("Skipping {}'s Tax Period '{}'".format(
                    ein, r_tax_period))
                continue
            if r_filing_url is None:
                continue

            logger.info("Fetching {}'s '{}' (ReturnId={})".format(
                ein, r_filing_name, r_return_id))
            text_990 = load_990(r_filing_url)
            r_filing_date_start = _extract_filing_date(
                filing_date_beginning_regex, text_990)
            r_filing_date_end = _extract_filing_date(
                filing_date_ending_regex, text_990)
            row_frames.append(pd.DataFrame([[
                ein, r_name, r_tax_period, r_return_id, r_return_type,
                r_filing_name, r_filing_url, r_filing_date_start,
                r_filing_date_end
            ]], columns=columns))
        logger.info("Finished {}".format(ein))
    except Exception as err:
        traceback.print_exc()
        logger.error(err)

    if row_frames:
        irs_returns = pd.concat(row_frames)
    irs_returns.ein = irs_returns.ein.astype(int)
    irs_returns.return_id = irs_returns.return_id.astype(int)
    return irs_returns