Example #1
def get_data_about_callsign(callsign):
    """ Hit the FCC api to fill in details about a station. """
    # sometimes callsigns include -TV...

    json_api = "https://data.fcc.gov/mediabureau/v01/tv/facility/search/%s.json" % callsign
    try:
        response = read_url(json_api)
    except urllib2.HTTPError:
        print "No data for callsign %s. The FCC API could be down." % (
            callsign)
        return None

    response_read = json.loads(response)
    station_list = None
    try:
        station_list = response_read['results']['searchList'][0][
            'facilityList']
        if len(station_list) > 1:
            print "Multiple stations found for callsign=%s" % (callsign)
            return None

    except (KeyError, IndexError):
        # the dict lookups can raise KeyError, the [0] can raise IndexError
        print "Couldn't find any information about %s from the FCC's api" % (
            callsign)
        return None

    try:
        return station_list[0]
    except IndexError:
        print "Empty station list %s" % (callsign)
        return None
    def handle(self, *args, **options):
        pdfs_to_backup = PDF_File.objects.filter(
            local_file_path__isnull=True).exclude(not_at_fcc=True).values('id')

        num_to_process = len(pdfs_to_backup)
        print "Processing %s files" % num_to_process
        count = 0

        for this_pdf_id in pdfs_to_backup:
            this_pdf = PDF_File.objects.get(pk=this_pdf_id['id'])

            if this_pdf.local_file_path or this_pdf.not_at_fcc:
                print "already entered!"
                continue

            count += 1
            pdf_url = this_pdf.raw_url

            tempfile_name = urllib2.unquote(urlparse(pdf_url).path)
            tempfile_name = tempfile_name.lstrip('/')
            tempfile_name_fixed = tempfile_name.replace("/", "%%")
            if count % 3 == 0:
                print "Processed %s" % count

            try:
                page = read_url(pdf_url)
            except urllib2.HTTPError:
                print "Couldn't get file %s" % (pdf_url)
                this_pdf.not_at_fcc = True
                this_pdf.missing_as_of_date = datetime.now()
                this_pdf.save()
                continue

            #print "read the pdf"

            tempfile_full = SCRAPER_LOCAL_DOC_DIR + "/" + tempfile_name_fixed
            try:
                tempfile = open(tempfile_full, "wb")
                tempfile.write(page)
                tempfile.close()
                this_pdf.local_file_path = tempfile_name_fixed
                this_pdf.save()
                #print "wrote the pdf to %s" % (tempfile_full)
            except (IOError, OSError):
                # sometimes the file names are too long for the file system
                print "Something went wrong with %s" % (tempfile_full)

            sleep(0.5)
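
The temp-file naming in handle flattens each URL path into a single file name: percent-decode the path, strip the leading slash, and replace the remaining slashes with a "%%" marker so the whole path fits in one flat directory (SCRAPER_LOCAL_DOC_DIR is presumably a settings constant naming that directory). A quick illustration with a made-up URL:

from urlparse import urlparse
import urllib2

pdf_url = "https://stations.fcc.gov/political/2012/WXYZ/ad%20order.pdf"  # made-up URL
tempfile_name = urllib2.unquote(urlparse(pdf_url).path).lstrip('/')
# -> "political/2012/WXYZ/ad order.pdf"
tempfile_name_fixed = tempfile_name.replace("/", "%%")
# -> "political%%2012%%WXYZ%%ad order.pdf"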
Example #5
def make_ad_buy_from_pdf_file(pdf_file):

    pdf_url = pdf_file.raw_url
    # attribute the Document to an arbitrary existing user
    auser = User.objects.all()[0]
    tempfile_name = urllib2.unquote(urlparse(pdf_url).path)
    tempfile_name = tempfile_name.lstrip('/')
    tempfile_name_fixed = tempfile_name.replace("/", "%%")
    print "temp name is %s" % (tempfile_name_fixed)
    tempfile_full = SCRAPER_LOCAL_DOC_DIR + "/" + tempfile_name_fixed
    page = read_url(pdf_url)
    print "read the pdf"
    tempfile = open(tempfile_full, "wb")
    tempfile.write(page)
    tempfile.close()
    print "wrote the pdf"

    # reopen the local copy in binary mode (it's a PDF) and wrap it for Django
    local_file = open(tempfile_full, "rb")
    djangofile = File(local_file)

    print "creating doc"
    d = Document(title=tempfile_name,
                 description="From the FCC's political files",
                 user=auser,
                 access_level='public')

    d.file.save('new', djangofile)
    local_file.close()
    print "saved via local"
    d.connect_dc_doc()
    d.save()

    print "save 2"

    pol_buy = PoliticalBuy(documentcloud_doc=d)
    pol_buy.is_FCC_doc = True
    pol_buy.related_FCC_file = pdf_file
    pol_buy.save(auser)

    if pdf_file.folder.broadcaster:
        pol_buy.broadcasters.add(pdf_file.folder.broadcaster)
        pol_buy.save(auser)

    #
    # Record that this file has been uploaded.
    pdf_file.in_document_cloud = True
    pdf_file.save()
    return True
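
The call site for make_ad_buy_from_pdf_file is not shown. A hypothetical driver loop, reusing the PDF_File fields that appear elsewhere in these examples (in_document_cloud, not_at_fcc, raw_url):

def upload_pending_pdfs():
    # Hypothetical driver; the filter assumes in_document_cloud defaults to False.
    pending = PDF_File.objects.filter(
        in_document_cloud=False).exclude(not_at_fcc=True)
    for pdf_file in pending:
        try:
            make_ad_buy_from_pdf_file(pdf_file)
        except urllib2.HTTPError:
            print "Couldn't fetch %s" % (pdf_file.raw_url)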