def handle(self, *args, **options):
    new_filings = 0
    # old URL
    #rss_url = "http://query.nictusa.com/rss/newfilings.rss"
    # This was moved here, approximately 9/21/12
    #rss_url = "http://fecapps.nictusa.com/rss/generate?preDefinedFilingType=ALL"
    rss_url = "http://efilingapps.fec.gov/rss/generate?preDefinedFilingType=ALL"

    my_logger.info('SCRAPE_DAILY_FILINGS - starting regular run')

    headers = {'User-Agent': USER_AGENT}
    data = None
    req = urllib2.Request(rss_url, data, headers)
    response = urllib2.urlopen(req)
    rssdata = response.read()
    #print rssdata

    results = parse_xml_from_text(rssdata)

    for result in results:
        filing_entered = enter_filing(result)
        if filing_entered:
            new_filings += 1

    now = Filing_Scrape_Time.objects.create()

    my_logger.info("SCRAPE_DAILY_FILINGS - completing regular run--created %s new filings" % new_filings)

    set_update('scrape_electronic_filings')
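# The command above relies on a parse_xml_from_text() helper that is defined
# elsewhere in the project. A minimal sketch of what such a helper might look
# like, assuming the feed is standard RSS and that only the per-item title,
# link and pubDate are needed (the real helper may extract more fields):
from xml.dom import minidom

def parse_xml_from_text_sketch(rssdata):
    """Hypothetical example: return one dict per <item> in the RSS text."""
    results = []
    doc = minidom.parseString(rssdata)
    for item in doc.getElementsByTagName('item'):
        entry = {}
        for tag in ('title', 'link', 'pubDate'):
            nodes = item.getElementsByTagName(tag)
            if nodes and nodes[0].firstChild:
                entry[tag] = nodes[0].firstChild.data
        results.append(entry)
    return results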
def handle(self, *args, **options):
    conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    b = conn.get_bucket(AWS_STORAGE_BUCKET_NAME)

    for sked in ['e', 'b', 'a']:
        filename = "sked%s.csv" % sked
        local_skedfile = "%s/%s" % (CSV_EXPORT_DIR, filename)
        print "Dumping sked %s to %s" % (sked, local_skedfile)
        dump_all_sked(sked, local_skedfile)

        # need to gzip these
        gzip_cmd = "gzip -f %s" % (local_skedfile)
        filename_zipped = filename + ".gz"
        local_skedfile_zipped = local_skedfile + ".gz"
        # old style os.system just works - subprocess sucks.
        proc = os.system(gzip_cmd)

        s3_path = "%s/%s" % (AWS_BULK_EXPORT_PATH, filename_zipped)
        print "pushing %s to S3: bucket=%s path=%s" % (local_skedfile_zipped, AWS_STORAGE_BUCKET_NAME, s3_path)
        start = time.time()
        k = Key(b)
        k.key = s3_path
        k.set_contents_from_filename(local_skedfile_zipped, policy='public-read')
        elapsed_time = time.time() - start
        print "elapsed time for pushing to s3 is %s" % (elapsed_time)

    # if we didn't die, set the update time
    set_update(BULK_EXPORT_KEY)
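# The export above shells out via os.system("gzip -f ..."), as the inline
# comment notes. For reference, an equivalent call that raises on a non-zero
# exit status could use subprocess instead; this is an alternative sketch,
# not how the command actually does it:
import subprocess

def gzip_file(path):
    """Gzip a file in place, raising CalledProcessError on failure."""
    subprocess.check_call(['gzip', '-f', path])

# e.g. gzip_file(local_skedfile) would replace proc = os.system(gzip_cmd) above.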
def handle(self, *args, **options):
    for CYCLE in ACTIVE_CYCLES:
        filename = "candidates_%s.csv" % (CYCLE)
        webk_filename = "all_webk_%s.csv" % (CYCLE)
        contrib_filename = 'superpac_contribs_%s.csv' % (CYCLE)
        nonindiv_contrib_filename = 'nonindiv_nonpac_superpac_contribs_%s.csv' % (CYCLE)

        local_file = "%s/%s" % (CSV_EXPORT_DIR, filename)
        local_webk_file = "%s/%s" % (CSV_EXPORT_DIR, webk_filename)
        local_contrib_file = "%s/%s" % (CSV_EXPORT_DIR, contrib_filename)
        local_nonindiv_contrib_file = "%s/%s" % (CSV_EXPORT_DIR, nonindiv_contrib_filename)

        if not dry_run:
            dump_big_non_indiv_contribs(local_nonindiv_contrib_file, CYCLE)
            write_all_candidates(local_file, CYCLE)
            write_all_webks(local_webk_file, CYCLE)
            dump_big_contribs(local_contrib_file, CYCLE)

        # need to gzip these
        gzip_cmd = "gzip -f %s %s %s %s" % (local_file, local_webk_file, local_contrib_file, local_nonindiv_contrib_file)

        filename_zipped = filename + ".gz"
        filename_webk_zipped = webk_filename + ".gz"
        filename_contrib_zipped = contrib_filename + ".gz"
        filename_nonindiv_contrib_zipped = nonindiv_contrib_filename + ".gz"

        local_file_zipped = local_file + ".gz"
        local_webk_file_zipped = local_webk_file + ".gz"
        local_contrib_file_zipped = local_contrib_file + ".gz"
        local_nonindiv_contrib_file_zipped = local_nonindiv_contrib_file + ".gz"

        # old style os.system just works - subprocess sucks.
        print "Gzipping with: %s" % gzip_cmd
        if not dry_run:
            proc = os.system(gzip_cmd)

        s3_path = "%s/%s" % (AWS_BULK_EXPORT_PATH, filename_zipped)
        webk_s3_path = "%s/%s" % (AWS_BULK_EXPORT_PATH, filename_webk_zipped)
        contrib_s3_path = "%s/%s" % (AWS_BULK_EXPORT_PATH, filename_contrib_zipped)
        nonindiv_s3_path = "%s/%s" % (AWS_BULK_EXPORT_PATH, filename_nonindiv_contrib_zipped)

        if not dry_run:
            push_to_s3(local_file_zipped, AWS_STORAGE_BUCKET_NAME, s3_path)
            push_to_s3(local_webk_file_zipped, AWS_STORAGE_BUCKET_NAME, webk_s3_path)
            push_to_s3(local_contrib_file_zipped, AWS_STORAGE_BUCKET_NAME, contrib_s3_path)
            push_to_s3(local_nonindiv_contrib_file_zipped, AWS_STORAGE_BUCKET_NAME, nonindiv_s3_path)

    # if we didn't die, set the update time
    if not dry_run:
        set_update(SUMMARY_EXPORT_KEY)
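# The summary export above calls a push_to_s3() helper that is defined
# elsewhere. Judging from the inline boto upload in the schedule export
# command, a sketch of such a helper (assuming boto's S3Connection/Key API
# and a public-read ACL, as used there) might be:
from boto.s3.connection import S3Connection
from boto.s3.key import Key

def push_to_s3_sketch(local_file, bucket_name, s3_path):
    """Hypothetical example: upload one local file to bucket_name at s3_path."""
    conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = conn.get_bucket(bucket_name)
    k = Key(bucket)
    k.key = s3_path
    k.set_contents_from_filename(local_file, policy='public-read')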
def handle(self, *args, **options):
    highest_filing_number = new_filing.objects.all().order_by('-filing_number')[0].filing_number
    print "highest previously available filing number: %s" % (highest_filing_number)

    trial_file_number = highest_filing_number
    highest_available_file_number = highest_filing_number
    file_misses = 0
    file_miss_threshold = 3

    while True:
        trial_file_number += 1
        location = FEC_DOWNLOAD % (trial_file_number)
        print location
        try:
            result = urllib2.urlopen(location)
            print "Found %s" % (location)
            try:
                new_filing.objects.get(filing_number=trial_file_number)
            except new_filing.DoesNotExist:
                now = timezone.now()
                thisobj = new_filing.objects.create(
                    filing_number=trial_file_number,
                    process_time=now,
                    filed_date=get_date(now))
        except urllib2.HTTPError:
            print "didn't find %s" % (location)
            file_misses += 1
            if file_misses >= file_miss_threshold:
                break
        sleep(1)

    # set the update time.
    set_update('scrape_electronic_filings')
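# The same probe-until-misses pattern from the command above, pulled out as a
# standalone sketch: walk forward from the last known filing number and stop
# once `threshold` HTTP misses have accumulated. This is an illustration of
# the loop above, not a helper the project defines.
import urllib2

def probe_new_filing_numbers(start, url_template, threshold=3):
    """Yield filing numbers whose download URL exists, stopping after
    `threshold` HTTP misses."""
    trial = start
    misses = 0
    while misses < threshold:
        trial += 1
        try:
            urllib2.urlopen(url_template % trial)
            yield trial
        except urllib2.HTTPError:
            misses += 1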
def handle(self, *args, **options):
    print "load f1 filers"
    filename = "/%s/Form1Filer_%s.csv" % (two_digit_cycle, two_digit_cycle)
    filelocation = FTP_DATA_DIR + filename
    readfile(filelocation)

    set_update(COMMITTEES_SCRAPE_KEY)
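# readfile() is defined elsewhere in the project. A minimal sketch of a CSV
# loader for the Form1Filer file, assuming a plain comma-delimited layout;
# the real column names and the model each row is written to are not shown
# here, so process_f1_row() below is a hypothetical placeholder:
import csv

def readfile_sketch(filelocation):
    with open(filelocation, 'rb') as fh:
        reader = csv.reader(fh)
        for row in reader:
            process_f1_row(row)  # hypothetical per-row handler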
def handle(self, *args, **options):
    print "Scraping the FEC press office's new committee page"
    scrape_page()

    set_update(COMMITTEES_SCRAPE_KEY)
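# scrape_page() is defined elsewhere. Its fetch step presumably mirrors the
# urllib2 / User-Agent pattern used by the RSS scraper above; a sketch under
# that assumption (the press-office URL and the HTML parsing of the page are
# omitted because they are not shown in this code):
import urllib2

def fetch_page(url, user_agent):
    """Hypothetical example: fetch a page with an explicit User-Agent."""
    req = urllib2.Request(url, None, {'User-Agent': user_agent})
    return urllib2.urlopen(req).read()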