Exemple #1
0
    def handle(self, *args, **options):
        """Load the IRS 990 index CSV for each requested year into ``Filing`` rows.

        For every year in ``options['year']`` the index file is downloaded into
        ``INDEX_DIRECTORY`` (except for 2014, whose local copy is never
        overwritten). Each CSV row whose ``object_id`` is not already stored
        is then saved as a new ``Filing``, in batches of ``BATCH_SIZE``.

        Raises:
            ValueError: if a data row does not have exactly nine columns.
        """
        for year in options['year']:
            local_file_path = os.path.join(INDEX_DIRECTORY, "index_%s.csv" % year)

            # Compare as text so the check holds whether the year option was
            # parsed as an int or left as a string.
            is_2014 = str(year) == '2014'

            # Try for a new file every time, but don't overwrite 2014,
            # because that file's got an error.
            if not is_2014:
                irs_file_url = 'https://s3.amazonaws.com/irs-form-990/index_%s.csv' % year
                print('Downloading index_%s.csv...' % year)
                stream_download(irs_file_url, local_file_path)
                print('Done!')

            print("Entering xml submissions from %s" % local_file_path)
            rows_to_enter = []
            count = 0

            # newline='' is what the csv module expects so that quoted fields
            # containing line breaks are parsed correctly; the context manager
            # guarantees the file is closed even when a bad row aborts the run.
            with open(local_file_path, 'r', newline='') as fh:
                reader = csv.reader(fh)

                # Skip the header row (tolerating a completely empty file).
                next(reader, None)

                for line in reader:
                    try:
                        (return_id, filing_type, ein, tax_period, sub_date,
                         taxpayer_name, return_type, dln, object_id) = line
                    except ValueError:
                        print("Error with line: {line}".format(line=line))
                        if is_2014:
                            print('Did you fix the 2014 index file? See the README for instructions.')
                        raise

                    # Only filings that are not stored yet are queued.
                    if Filing.objects.filter(object_id=object_id).exists():
                        continue

                    rows_to_enter.append(Filing(
                        return_id=return_id,
                        submission_year=year,
                        filing_type=filing_type,
                        ein=ein,
                        tax_period=tax_period,
                        sub_date=sub_date,
                        taxpayer_name=taxpayer_name,
                        return_type=return_type,
                        dln=dln,
                        object_id=object_id,
                    ))
                    count += 1

                    # Flush only right after a row was queued, so skipped
                    # (already stored) rows never trigger an empty commit.
                    if len(rows_to_enter) >= BATCH_SIZE:
                        print("Committing %s total entered=%s" % (BATCH_SIZE, count))
                        Filing.objects.bulk_create(rows_to_enter)
                        print("commit complete")
                        rows_to_enter = []

            # Save whatever is left over from the last partial batch.
            if rows_to_enter:
                Filing.objects.bulk_create(rows_to_enter)
            print("Added %s new entries." % count)
    def handle(self, *args, **options):
        """Load the IRS 990 index CSV for each requested year into ``Filing`` rows.

        For every year in ``options['year']`` the index file is looked up in
        ``INDEX_DIRECTORY`` and downloaded only when it is not there yet.
        Each CSV row whose ``object_id`` is not already stored is then saved
        as a new ``Filing``, in batches of ``BATCH_SIZE``.

        Raises:
            ValueError: if a data row does not have exactly nine columns.
        """
        for year in options['year']:
            irs_file_url = 'https://s3.amazonaws.com/irs-form-990/index_%s.csv' % year
            local_file_path = os.path.join(INDEX_DIRECTORY,
                                           "index_%s.csv" % year)

            # Reuse a previously downloaded index file when one exists.
            if os.path.isfile(local_file_path):
                print('Found index_%s.csv...' % year)
            else:
                print('Downloading index_%s.csv...' % year)
                stream_download(irs_file_url, local_file_path)
                print('Done!')

            print("Entering xml submissions from %s" % local_file_path)
            rows_to_enter = []
            count = 0

            # newline='' is what the csv module expects so that quoted fields
            # containing line breaks are parsed correctly; the context manager
            # guarantees the file is closed even when a bad row aborts the run.
            with open(local_file_path, 'r', newline='') as fh:
                reader = csv.reader(fh)

                # Skip the header row (tolerating a completely empty file).
                next(reader, None)

                for line in reader:
                    (return_id, filing_type, ein, tax_period, sub_date,
                     taxpayer_name, return_type, dln, object_id) = line

                    # Only filings that are not stored yet are queued.
                    if Filing.objects.filter(object_id=object_id).exists():
                        continue

                    rows_to_enter.append(Filing(return_id=return_id,
                                                submission_year=year,
                                                filing_type=filing_type,
                                                ein=ein,
                                                tax_period=tax_period,
                                                sub_date=sub_date,
                                                taxpayer_name=taxpayer_name,
                                                return_type=return_type,
                                                dln=dln,
                                                object_id=object_id))
                    count += 1

                    # Flush only right after a row was queued, so skipped
                    # (already stored) rows never trigger an empty commit.
                    if len(rows_to_enter) >= BATCH_SIZE:
                        print("Committing %s total entered=%s" %
                              (BATCH_SIZE, count))
                        Filing.objects.bulk_create(rows_to_enter)
                        print("commit complete")
                        rows_to_enter = []

            # Save whatever is left over from the last partial batch.
            if rows_to_enter:
                Filing.objects.bulk_create(rows_to_enter)
            print("Added %s new entries." % count)
Exemple #3
0
    def handle(self, *args, **options):
        """Refresh the IRS 990 index CSV for each requested year and load it.

        Options read:
            year: iterable of years to process.
            download: when truthy, always download the index file and skip
                the remote size check.
            enter: when truthy, enter rows even if the local file has the
                same size as the remote one.

        For each year the remote file size is compared with the local copy to
        decide whether the index has to be downloaded again. Each CSV row
        whose ``object_id`` is not already stored is then saved as a new
        ``Filing``, in batches of ``BATCH_SIZE``.

        Raises:
            ValueError: if a data row does not have exactly nine columns.
        """
        for year in options['year']:
            irs_file_url = 'https://s3.amazonaws.com/irs-form-990/index_%s.csv' % year
            irs_file_len = None

            force_download = options['download']
            local_file_path = os.path.join(INDEX_DIRECTORY,
                                           "index_%s.csv" % year)

            if not force_download:
                response = requests.head(irs_file_url)
                if response.status_code == 404:
                    print('index_%s.csv is not available for download.' % year)
                    continue
                # NOTE(review): a response without Content-Length (or another
                # error status) raises KeyError here -- confirm that is wanted.
                irs_file_len = int(response.headers['Content-Length'])

            # Verify index file has been downloaded.
            freshly_downloaded = False
            if force_download or not os.path.isfile(local_file_path):
                print('Downloading index_%s.csv...' % year)
                stream_download(irs_file_url, local_file_path)
                print('Done!')
                freshly_downloaded = True

            # The size comparison only makes sense for a file that was already
            # on disk: a file fetched just now is new data that must be
            # entered, and must not be fetched a second time.
            if not freshly_downloaded:
                if irs_file_len == os.path.getsize(local_file_path):
                    print(
                        'File index_%s.csv has not changed since the last download.'
                        % year)
                    if not options['enter']:
                        continue
                else:
                    print('index_%s.csv has changed. Downloading updated file...' %
                          year)
                    stream_download(irs_file_url, local_file_path)
                    print('Done!')

            print("Entering xml submissions from %s" % local_file_path)
            rows_to_enter = []
            count = 0

            # newline='' is what the csv module expects so that quoted fields
            # containing line breaks are parsed correctly; the context manager
            # guarantees the file is closed even when a bad row aborts the run.
            with open(local_file_path, 'r', newline='') as fh:
                reader = csv.reader(fh)

                # Skip the header row (tolerating a completely empty file).
                next(reader, None)

                for line in reader:
                    (return_id, filing_type, ein, tax_period, sub_date,
                     taxpayer_name, return_type, dln, object_id) = line

                    # Only filings that are not stored yet are queued.
                    if Filing.objects.filter(object_id=object_id).exists():
                        continue

                    rows_to_enter.append(Filing(return_id=return_id,
                                                submission_year=year,
                                                filing_type=filing_type,
                                                ein=ein,
                                                tax_period=tax_period,
                                                sub_date=sub_date,
                                                taxpayer_name=taxpayer_name,
                                                return_type=return_type,
                                                dln=dln,
                                                object_id=object_id))
                    count += 1

                    # Flush only right after a row was queued, so skipped
                    # (already stored) rows never trigger an empty commit.
                    if len(rows_to_enter) >= BATCH_SIZE:
                        print("Committing %s total entered=%s" %
                              (BATCH_SIZE, count))
                        Filing.objects.bulk_create(rows_to_enter)
                        print("commit complete")
                        rows_to_enter = []

            # Save whatever is left over from the last partial batch.
            if rows_to_enter:
                Filing.objects.bulk_create(rows_to_enter)
            print("Added %s new entries." % count)