def get_data_about_callsign(callsign):
    """
    Hit the FCC api to fill in details about a station.

    Builds the facility-search URL for ``callsign``, fetches it with
    ``read_url`` and decodes the body as JSON.

    Returns the single entry of the response's ``facilityList`` when exactly
    one station matches. Returns None, after printing a short diagnostic,
    when:

    * the request raises ``urllib2.HTTPError``;
    * the response has an empty ``searchList`` or lacks one of the expected
      keys (``results`` / ``searchList`` / ``facilityList``);
    * more than one station is listed;
    * the station list is empty.
    """
    # sometimes callsigns include -TV...
    json_api = "https://data.fcc.gov/mediabureau/v01/tv/facility/search/%s.json" % callsign

    try:
        response = read_url(json_api)
    except urllib2.HTTPError:
        print("No data for callsign %s. The FCC API could be down." % (callsign))
        return None

    response_read = json.loads(response)

    try:
        station_list = response_read['results']['searchList'][0]['facilityList']
    except (IndexError, KeyError):
        # IndexError: searchList is empty. KeyError: an expected key is
        # missing from the response, which is treated the same way instead
        # of crashing the caller.
        print("Couldn't find any information about %s from the FCC's api" % (callsign))
        return None

    # An ambiguous callsign is reported rather than guessed at.
    if len(station_list) > 1:
        print("Multiple stations found for callsign=%s" % (callsign))
        return None

    try:
        return station_list[0]
    except IndexError:
        print("Empty station list %s" % (callsign))
        return None
def get_data_about_callsign(callsign):
    """
    Hit the FCC api to fill in details about a station.

    Builds the facility-search URL for ``callsign``, fetches it with
    ``read_url`` and decodes the body as JSON.

    Returns the single entry of the response's ``facilityList`` when exactly
    one station matches. Returns None, after printing a short diagnostic,
    when:

    * the request raises ``urllib2.HTTPError``;
    * the response has an empty ``searchList`` or lacks one of the expected
      keys (``results`` / ``searchList`` / ``facilityList``);
    * more than one station is listed;
    * the station list is empty.
    """
    # sometimes callsigns include -TV...
    json_api = "https://data.fcc.gov/mediabureau/v01/tv/facility/search/%s.json" % callsign

    try:
        response = read_url(json_api)
    except urllib2.HTTPError:
        print("No data for callsign %s. The FCC API could be down." % (callsign))
        return None

    response_read = json.loads(response)

    try:
        station_list = response_read['results']['searchList'][0]['facilityList']
    except (IndexError, KeyError):
        # IndexError: searchList is empty. KeyError: an expected key is
        # missing from the response, which is treated the same way instead
        # of crashing the caller.
        print("Couldn't find any information about %s from the FCC's api" % (callsign))
        return None

    # An ambiguous callsign is reported rather than guessed at.
    if len(station_list) > 1:
        print("Multiple stations found for callsign=%s" % (callsign))
        return None

    try:
        return station_list[0]
    except IndexError:
        print("Empty station list %s" % (callsign))
        return None
def handle(self, *args, **options):
    """
    Download a local copy of every PDF_File that does not have one yet.

    Selects the ids of PDF_File rows whose ``local_file_path`` is null and
    that are not flagged ``not_at_fcc``. For each one it fetches ``raw_url``
    with ``read_url`` and writes the bytes to a file under
    ``SCRAPER_LOCAL_DOC_DIR``, then stores the file name in
    ``local_file_path``.

    When the fetch raises ``urllib2.HTTPError`` the row is flagged
    ``not_at_fcc`` with ``missing_as_of_date`` set to the current time.
    A failure while writing the file or saving the row is reported and the
    loop moves on to the next PDF. Progress is printed every third file.
    """
    pdfs_to_backup = PDF_File.objects.filter(
        local_file_path__isnull=True).exclude(not_at_fcc=True).values('id')
    num_to_process = len(pdfs_to_backup)
    print("Processing %s files" % num_to_process)

    count = 0
    for this_pdf_id in pdfs_to_backup:
        # Re-read the row and re-check its state before downloading.
        this_pdf = PDF_File.objects.get(pk=this_pdf_id['id'])
        if this_pdf.local_file_path or this_pdf.not_at_fcc:
            print("already entered!")
            continue

        count += 1
        pdf_url = this_pdf.raw_url

        # Turn the URL path into a single flat file name: drop the leading
        # slashes and replace every remaining "/" with "%%".
        tempfile_name = urllib2.unquote(urlparse(pdf_url).path)
        tempfile_name = tempfile_name.lstrip('/')
        tempfile_name_fixed = tempfile_name.replace("/", "%%")

        if count % 3 == 0:
            print("Processed %s" % count)

        try:
            page = read_url(pdf_url)
        except urllib2.HTTPError:
            print("Couldn't get file %s" % (pdf_url))
            this_pdf.not_at_fcc = True
            this_pdf.missing_as_of_date = datetime.now()
            this_pdf.save()
            continue

        tempfile_full = SCRAPER_LOCAL_DOC_DIR + "/" + tempfile_name_fixed
        try:
            # The with-block closes the handle even when write() fails, and
            # the path is only recorded once the file is fully written.
            with open(tempfile_full, "wb") as out_file:
                out_file.write(page)
            this_pdf.local_file_path = tempfile_name_fixed
            this_pdf.save()
        except Exception:
            # sometimes the file names are too long for the system, which sucks
            # (Exception rather than a bare except, so Ctrl-C still stops the run.)
            print("Something went wrong with %s" % (tempfile_full))

        # Pause before moving on to the next download.
        sleep(0.5)
def handle(self, *args, **options):
    """
    Download a local copy of every PDF_File that does not have one yet.

    Selects the ids of PDF_File rows whose ``local_file_path`` is null and
    that are not flagged ``not_at_fcc``. For each one it fetches ``raw_url``
    with ``read_url`` and writes the bytes to a file under
    ``SCRAPER_LOCAL_DOC_DIR``, then stores the file name in
    ``local_file_path``.

    When the fetch raises ``urllib2.HTTPError`` the row is flagged
    ``not_at_fcc`` with ``missing_as_of_date`` set to the current time.
    A failure while writing the file or saving the row is reported and the
    loop moves on to the next PDF. Progress is printed every third file.
    """
    pdfs_to_backup = PDF_File.objects.filter(
        local_file_path__isnull=True).exclude(not_at_fcc=True).values('id')
    num_to_process = len(pdfs_to_backup)
    print("Processing %s files" % num_to_process)

    count = 0
    for this_pdf_id in pdfs_to_backup:
        # Re-read the row and re-check its state before downloading.
        this_pdf = PDF_File.objects.get(pk=this_pdf_id['id'])
        if this_pdf.local_file_path or this_pdf.not_at_fcc:
            print("already entered!")
            continue

        count += 1
        pdf_url = this_pdf.raw_url

        # Turn the URL path into a single flat file name: drop the leading
        # slashes and replace every remaining "/" with "%%".
        tempfile_name = urllib2.unquote(urlparse(pdf_url).path)
        tempfile_name = tempfile_name.lstrip('/')
        tempfile_name_fixed = tempfile_name.replace("/", "%%")

        if count % 3 == 0:
            print("Processed %s" % count)

        try:
            page = read_url(pdf_url)
        except urllib2.HTTPError:
            print("Couldn't get file %s" % (pdf_url))
            this_pdf.not_at_fcc = True
            this_pdf.missing_as_of_date = datetime.now()
            this_pdf.save()
            continue

        tempfile_full = SCRAPER_LOCAL_DOC_DIR + "/" + tempfile_name_fixed
        try:
            # The with-block closes the handle even when write() fails, and
            # the path is only recorded once the file is fully written.
            with open(tempfile_full, "wb") as out_file:
                out_file.write(page)
            this_pdf.local_file_path = tempfile_name_fixed
            this_pdf.save()
        except Exception:
            # sometimes the file names are too long for the system, which sucks
            # (Exception rather than a bare except, so Ctrl-C still stops the run.)
            print("Something went wrong with %s" % (tempfile_full))

        # Pause before moving on to the next download.
        sleep(0.5)
def make_ad_buy_from_pdf_file(pdf_file):
    """
    Download an FCC PDF, store it as a Document and attach a PoliticalBuy.

    Steps, in order:

    1. Fetch ``pdf_file.raw_url`` with ``read_url`` and write the bytes to a
       file under ``SCRAPER_LOCAL_DOC_DIR``.
    2. Create a public ``Document`` owned by the first ``User`` returned by
       ``User.objects.all()``, save the downloaded file onto it and call
       ``connect_dc_doc()``.
    3. Create a ``PoliticalBuy`` pointing at that document and at
       ``pdf_file``, adding the folder's broadcaster when there is one.
    4. Set ``pdf_file.in_document_cloud`` and save it.

    Returns True once every step has completed. No exception is handled
    here; anything raised along the way propagates to the caller.
    """
    pdf_url = pdf_file.raw_url
    auser = User.objects.all()[0]

    # Turn the URL path into a single flat file name: drop the leading
    # slashes and replace every remaining "/" with "%%".
    tempfile_name = urllib2.unquote(urlparse(pdf_url).path)
    tempfile_name = tempfile_name.lstrip('/')
    tempfile_name_fixed = tempfile_name.replace("/", "%%")
    print("temp name is %s" % (tempfile_name_fixed))
    tempfile_full = SCRAPER_LOCAL_DOC_DIR + "/" + tempfile_name_fixed

    page = read_url(pdf_url)
    print("read the pdf")

    with open(tempfile_full, "wb") as out_file:
        out_file.write(page)
    print("wrote the pdf")

    # Re-open in binary mode (the content is a PDF) and keep the handle open
    # until the document has been stored; the with-block guarantees it is
    # closed afterwards, including on error.
    with open(tempfile_full, "rb") as pdf_handle:
        djangofile = File(pdf_handle)
        print("creating doc")
        d = Document(
            title=tempfile_name,
            description="From the FCC's political files",
            user=auser,
            access_level='public')
        d.file.save('new', djangofile)
        print("saved via local")
        d.connect_dc_doc()
        d.save()
        print("save 2")

    pol_buy = PoliticalBuy(documentcloud_doc=d)
    pol_buy.is_FCC_doc = True
    pol_buy.related_FCC_file = pdf_file
    # Saved once before the broadcaster is added to it.
    pol_buy.save(auser)
    if pdf_file.folder.broadcaster:
        pol_buy.broadcasters.add(pdf_file.folder.broadcaster)
    pol_buy.save(auser)

    # Record that this file has been uploaded.
    pdf_file.in_document_cloud = True
    pdf_file.save()
    return True
def make_ad_buy_from_pdf_file(pdf_file):
    """
    Download an FCC PDF, store it as a Document and attach a PoliticalBuy.

    Steps, in order:

    1. Fetch ``pdf_file.raw_url`` with ``read_url`` and write the bytes to a
       file under ``SCRAPER_LOCAL_DOC_DIR``.
    2. Create a public ``Document`` owned by the first ``User`` returned by
       ``User.objects.all()``, save the downloaded file onto it and call
       ``connect_dc_doc()``.
    3. Create a ``PoliticalBuy`` pointing at that document and at
       ``pdf_file``, adding the folder's broadcaster when there is one.
    4. Set ``pdf_file.in_document_cloud`` and save it.

    Returns True once every step has completed. No exception is handled
    here; anything raised along the way propagates to the caller.
    """
    pdf_url = pdf_file.raw_url
    auser = User.objects.all()[0]

    # Turn the URL path into a single flat file name: drop the leading
    # slashes and replace every remaining "/" with "%%".
    tempfile_name = urllib2.unquote(urlparse(pdf_url).path)
    tempfile_name = tempfile_name.lstrip('/')
    tempfile_name_fixed = tempfile_name.replace("/", "%%")
    print("temp name is %s" % (tempfile_name_fixed))
    tempfile_full = SCRAPER_LOCAL_DOC_DIR + "/" + tempfile_name_fixed

    page = read_url(pdf_url)
    print("read the pdf")

    with open(tempfile_full, "wb") as out_file:
        out_file.write(page)
    print("wrote the pdf")

    # Re-open in binary mode (the content is a PDF) and keep the handle open
    # until the document has been stored; the with-block guarantees it is
    # closed afterwards, including on error.
    with open(tempfile_full, "rb") as pdf_handle:
        djangofile = File(pdf_handle)
        print("creating doc")
        d = Document(
            title=tempfile_name,
            description="From the FCC's political files",
            user=auser,
            access_level='public')
        d.file.save('new', djangofile)
        print("saved via local")
        d.connect_dc_doc()
        d.save()
        print("save 2")

    pol_buy = PoliticalBuy(documentcloud_doc=d)
    pol_buy.is_FCC_doc = True
    pol_buy.related_FCC_file = pdf_file
    # Saved once before the broadcaster is added to it.
    pol_buy.save(auser)
    if pdf_file.folder.broadcaster:
        pol_buy.broadcasters.add(pdf_file.folder.broadcaster)
    pol_buy.save(auser)

    # Record that this file has been uploaded.
    pdf_file.in_document_cloud = True
    pdf_file.save()
    return True