Example #1
    def handle(self, *args, **options):

        new_filings = 0
        # old URL
        #rss_url = "http://query.nictusa.com/rss/newfilings.rss"
        # This was moved here, approximately 9/21/12
        #rss_url = "http://fecapps.nictusa.com/rss/generate?preDefinedFilingType=ALL"
        rss_url = "http://efilingapps.fec.gov/rss/generate?preDefinedFilingType=ALL"

        my_logger.info('SCRAPE_DAILY_FILINGS - starting regular run')
        headers = {'User-Agent': USER_AGENT}
        data = None
        req = urllib2.Request(rss_url, data, headers)
        response = urllib2.urlopen(req)
        rssdata = response.read()

        #print rssdata
        results = parse_xml_from_text(rssdata)
        for result in results:
            filing_entered = enter_filing(result)
            if filing_entered:
                new_filings += 1

        now = Filing_Scrape_Time.objects.create()
        my_logger.info("SCRAPE_DAILY_FILINGS - completing regular run--created %s new filings" % new_filings)
        set_update('scrape_electronic_filings')
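
A note on these examples: handle() is the entry point of a Django management command, so each snippet runs via python manage.py <command_name>. The code is Python 2 (urllib2, print statements). As a minimal sketch, the same fetch step on Python 3's standard library, assuming the same rss_url and USER_AGENT constant, would look like:

    import urllib.request

    # data=None means the request is issued as a plain GET
    req = urllib.request.Request(rss_url, data=None, headers={'User-Agent': USER_AGENT})
    with urllib.request.urlopen(req) as response:
        rssdata = response.read()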
Example #2
    def handle(self, *args, **options):

        new_filings = 0
        # old URL
        #rss_url = "http://query.nictusa.com/rss/newfilings.rss"
        # This was moved here, approximately 9/21/12
        #rss_url = "http://fecapps.nictusa.com/rss/generate?preDefinedFilingType=ALL"
        rss_url = "http://efilingapps.fec.gov/rss/generate?preDefinedFilingType=ALL"

        my_logger.info('SCRAPE_DAILY_FILINGS - starting regular run')
        headers = {'User-Agent': USER_AGENT}
        data = None
        req = urllib2.Request(rss_url, data, headers)
        response = urllib2.urlopen(req)
        rssdata = response.read()

        #print rssdata
        results = parse_xml_from_text(rssdata)
        for result in results:
            filing_entered = enter_filing(result)
            if filing_entered:
                new_filings += 1

        now = Filing_Scrape_Time.objects.create()
        my_logger.info(
            "SCRAPE_DAILY_FILINGS - completing regular run--created %s new filings"
            % new_filings)
        set_update('scrape_electronic_filings')
Example #3
    def handle(self, *args, **options):

        conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        b = conn.get_bucket(AWS_STORAGE_BUCKET_NAME)

        for sked in ['e', 'b', 'a']:
            filename = "sked%s.csv" % sked

            local_skedfile = "%s/%s" % (CSV_EXPORT_DIR, filename)
            print "Dumping sked %s to %s" % (sked, local_skedfile)
            dump_all_sked(sked, local_skedfile)

            # need to gzip these
            gzip_cmd = "gzip -f %s" % (local_skedfile)
            filename_zipped = filename + ".gz"
            local_skedfile_zipped = local_skedfile + ".gz"
            # old style os.system just works - subprocess sucks.
            proc = os.system(gzip_cmd)

            s3_path = "%s/%s" % (AWS_BULK_EXPORT_PATH, filename_zipped)
            print "pushing %s to S3: bucket=%s path=%s" % (local_skedfile_zipped, AWS_STORAGE_BUCKET_NAME, s3_path)
            start = time.time()
            k = Key(b)
            k.key = s3_path
            k.set_contents_from_filename(local_skedfile_zipped, policy='public-read')
            elapsed_time = time.time() - start
            print "elapsed time for pushing to s3 is %s" % (elapsed_time)

        # if we didn't die, set the update time
        set_update(BULK_EXPORT_KEY)
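
The comment in the example above dismisses subprocess in favor of os.system. For reference, a sketch of the same gzip step using the standard library's subprocess module, which raises an error when gzip fails rather than silently returning a nonzero status:

    import subprocess

    # equivalent to os.system("gzip -f <file>"), but raises CalledProcessError on failure
    subprocess.check_call(["gzip", "-f", local_skedfile])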
Example #4
    def handle(self, *args, **options):

        for CYCLE in ACTIVE_CYCLES:
            filename = "candidates_%s.csv" % (CYCLE)
            webk_filename = "all_webk_%s.csv" % (CYCLE)
            contrib_filename = 'superpac_contribs_%s.csv' % (CYCLE)
            nonindiv_contrib_filename = 'nonindiv_nonpac_superpac_contribs_%s.csv' % (
                CYCLE)

            local_file = "%s/%s" % (CSV_EXPORT_DIR, filename)
            local_webk_file = "%s/%s" % (CSV_EXPORT_DIR, webk_filename)
            local_contrib_file = "%s/%s" % (CSV_EXPORT_DIR, contrib_filename)
            local_nonindiv_contrib_file = "%s/%s" % (CSV_EXPORT_DIR,
                                                     nonindiv_contrib_filename)

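            # note: dry_run is not defined in this excerpt; presumably it is
            # parsed from options elsewhere in the command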
            if not dry_run:
                dump_big_non_indiv_contribs(local_nonindiv_contrib_file, CYCLE)
                write_all_candidates(local_file, CYCLE)
                write_all_webks(local_webk_file, CYCLE)
                dump_big_contribs(local_contrib_file, CYCLE)

            # need to gzip these
            gzip_cmd = "gzip -f %s %s %s %s" % (local_file, local_webk_file,
                                                local_contrib_file,
                                                local_nonindiv_contrib_file)
            filename_zipped = filename + ".gz"
            filename_webk_zipped = webk_filename + ".gz"
            filename_contrib_zipped = contrib_filename + ".gz"
            filename_nonindiv_contrib_zipped = nonindiv_contrib_filename + ".gz"

            local_file_zipped = local_file + ".gz"
            local_webk_file_zipped = local_webk_file + ".gz"
            local_contrib_file_zipped = local_contrib_file + ".gz"
            local_nonindiv_contrib_file_zipped = local_nonindiv_contrib_file + ".gz"

            # old style os.system just works - subprocess sucks.
            print "Gzipping with: %s" % gzip_cmd
            if not dry_run:
                proc = os.system(gzip_cmd)
            s3_path = "%s/%s" % (AWS_BULK_EXPORT_PATH, filename_zipped)
            webk_s3_path = "%s/%s" % (AWS_BULK_EXPORT_PATH,
                                      filename_webk_zipped)
            contrib_s3_path = "%s/%s" % (AWS_BULK_EXPORT_PATH,
                                         filename_contrib_zipped)
            nonindiv_s3_path = "%s/%s" % (AWS_BULK_EXPORT_PATH,
                                          filename_nonindiv_contrib_zipped)

            if not dry_run:
                push_to_s3(local_file_zipped, AWS_STORAGE_BUCKET_NAME, s3_path)
                push_to_s3(local_webk_file_zipped, AWS_STORAGE_BUCKET_NAME,
                           webk_s3_path)
                push_to_s3(local_contrib_file_zipped, AWS_STORAGE_BUCKET_NAME,
                           contrib_s3_path)
                push_to_s3(local_nonindiv_contrib_file_zipped,
                           AWS_STORAGE_BUCKET_NAME, nonindiv_s3_path)

            # if we didn't die, set the update time
            if not dry_run:
                set_update(SUMMARY_EXPORT_KEY)
Example #5
    def handle(self, *args, **options):

        for CYCLE in ACTIVE_CYCLES:
            filename = "candidates_%s.csv" % (CYCLE)
            webk_filename = "all_webk_%s.csv" % (CYCLE)
            contrib_filename = 'superpac_contribs_%s.csv' % (CYCLE)
            nonindiv_contrib_filename = 'nonindiv_nonpac_superpac_contribs_%s.csv' % (CYCLE)

            local_file = "%s/%s" % (CSV_EXPORT_DIR, filename)
            local_webk_file = "%s/%s" % (CSV_EXPORT_DIR, webk_filename)
            local_contrib_file = "%s/%s" % (CSV_EXPORT_DIR, contrib_filename)
            local_nonindiv_contrib_file = "%s/%s" % (CSV_EXPORT_DIR, nonindiv_contrib_filename)

            if not dry_run:
                dump_big_non_indiv_contribs(local_nonindiv_contrib_file, CYCLE)
                write_all_candidates(local_file, CYCLE)
                write_all_webks(local_webk_file, CYCLE)
                dump_big_contribs(local_contrib_file, CYCLE)

            # need to gzip these
            gzip_cmd = "gzip -f %s %s %s %s" % (local_file, local_webk_file, local_contrib_file, local_nonindiv_contrib_file)
            filename_zipped = filename + ".gz"
            filename_webk_zipped = webk_filename + ".gz"
            filename_contrib_zipped = contrib_filename + ".gz"
            filename_nonindiv_contrib_zipped = nonindiv_contrib_filename + ".gz"

            local_file_zipped = local_file + ".gz"
            local_webk_file_zipped = local_webk_file + ".gz"
            local_contrib_file_zipped = local_contrib_file + ".gz"
            local_nonindiv_contrib_file_zipped = local_nonindiv_contrib_file + ".gz"

            # old style os.system just works - subprocess sucks.
            print "Gzipping with: %s" % gzip_cmd
            if not dry_run:
                proc = os.system(gzip_cmd)
            s3_path = "%s/%s" % (AWS_BULK_EXPORT_PATH, filename_zipped)
            webk_s3_path = "%s/%s" % (AWS_BULK_EXPORT_PATH, filename_webk_zipped)
            contrib_s3_path = "%s/%s" % (AWS_BULK_EXPORT_PATH, filename_contrib_zipped)
            nonindiv_s3_path = "%s/%s" % (AWS_BULK_EXPORT_PATH, filename_nonindiv_contrib_zipped)

            if not dry_run:
                push_to_s3(local_file_zipped, AWS_STORAGE_BUCKET_NAME, s3_path)
                push_to_s3(local_webk_file_zipped, AWS_STORAGE_BUCKET_NAME, webk_s3_path)
                push_to_s3(local_contrib_file_zipped, AWS_STORAGE_BUCKET_NAME, contrib_s3_path)
                push_to_s3(local_nonindiv_contrib_file_zipped, AWS_STORAGE_BUCKET_NAME, nonindiv_s3_path)

            # if we didn't die, set the update time
            if not dry_run:
                set_update(SUMMARY_EXPORT_KEY)
Example #6
    def handle(self, *args, **options):

        highest_filing_number = new_filing.objects.all().order_by('-filing_number')[0].filing_number
        print "highest previously available filing number: %s" % (highest_filing_number)
        trial_file_number = highest_filing_number
        highest_available_file_number = highest_filing_number
        file_misses = 0
        file_miss_threshold = 3

        while True:
            trial_file_number += 1
            location = FEC_DOWNLOAD % (trial_file_number)
            print location
            try:
                result = urllib2.urlopen(location)
                print "Found %s" % (location)
                try:
                    new_filing.objects.get(filing_number=trial_file_number)
                except new_filing.DoesNotExist:
                    now = timezone.now()
                    thisobj = new_filing.objects.create(
                        filing_number=trial_file_number,
                        process_time=now,
                        filed_date=get_date(now))

            except urllib2.HTTPError:
                print "didn't find %s" % (location)
                file_misses += 1

            if file_misses >= file_miss_threshold:
                break

            sleep(1)

        # set the update time.
        set_update('scrape_electronic_filings')
Example #7
    def handle(self, *args, **options):

        highest_filing_number = new_filing.objects.all().order_by(
            '-filing_number')[0].filing_number
        print "highest previously available filing number: %s" % (
            highest_filing_number)
        trial_file_number = highest_filing_number
        highest_available_file_number = highest_filing_number
        file_misses = 0
        file_miss_threshold = 3

        while True:
            trial_file_number += 1
            location = FEC_DOWNLOAD % (trial_file_number)
            print location
            try:
                result = urllib2.urlopen(location)
                print "Found %s" % (location)
                try:
                    new_filing.objects.get(filing_number=trial_file_number)
                except new_filing.DoesNotExist:
                    now = timezone.now()
                    thisobj = new_filing.objects.create(
                        filing_number=trial_file_number,
                        process_time=now,
                        filed_date=get_date(now))

            except urllib2.HTTPError:
                print "didn't find %s" % (location)
                file_misses += 1

            if file_misses >= file_miss_threshold:
                break

            sleep(1)

        # set the update time.
        set_update('scrape_electronic_filings')
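
One behavioral note on the scan above: file_misses is never reset after a successful fetch, so the loop stops after three misses in total, not three consecutive misses. If consecutive misses are the intent, a small (hypothetical) variation resets the counter on each hit:

    try:
        result = urllib2.urlopen(location)
        file_misses = 0  # reset on every hit, so only a run of misses ends the scan
    except urllib2.HTTPError:
        file_misses += 1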
Example #8
    def handle(self, *args, **options):
        print "load f1 filers"
        # filename begins with a slash, so it is joined to FTP_DATA_DIR by
        # plain concatenation rather than os.path.join
        filename = "/%s/Form1Filer_%s.csv" % (two_digit_cycle, two_digit_cycle)
        filelocation = FTP_DATA_DIR + filename
        readfile(filelocation)
        set_update(COMMITTEES_SCRAPE_KEY)
Example #9
    def handle(self, *args, **options):
        print "Scraping the FEC press offices new committee page"
        scrape_page()
        set_update(COMMITTEES_SCRAPE_KEY)