# Download one past-results PDF (unless we already have it) and register it
# as a source doc. URLs containing 'county' are skipped entirely.
#
# Names read from the surrounding script: url, f_path, requests_session,
# past_results_page, get_or_create_source_doc.
if 'county' not in url.lower():
    # set up the file name: the last path segment of the URL (the *.pdf name)
    # is the election name; the local file name is that name without spaces.
    # NOTE(review): re.search() returns None when the URL path has no
    # '/<dir>/<name>.pdf' shape, which raises AttributeError here.
    elec_name = re.search(r'\/.+\/(.+\.pdf)', urlparse(url).path).group(1).strip()
    file_name = f_path + elec_name.replace(' ', '')

    # check to see if we already have the file
    if path.isfile(file_name):
        print(" Already downloaded {}.".format(file_name))
    else:
        print(" Downloading {}.".format(file_name))
        # request the pdf (pause first -- presumably to throttle requests)
        sleep(3)
        response = requests_session.get(url)
        # save the file; response.content is raw bytes, so the file must be
        # opened in binary mode or the PDF can be corrupted (newline
        # translation) or the write can fail outright (Python 3).
        with open(file_name, 'wb') as f:
            f.write(response.content)

    # whether we have the file or not, try creating a source_doc
    doc = get_or_create_source_doc(
        source='SoS',
        name=elec_name,
        file_name=file_name,
        url=url,
        parent=past_results_page,
    )

print('fin.')
if '.txt' in i: print 'Getting data from {}'.format(i) # set up an election for each file election = Election( date = None , races = [] ) # this particular election was not available in pdf # had to copy the text from the 2001 Blue Book if i == 'AllRacesSpecialMarch2000SD5.txt': election.source_doc = get_or_create_source_doc( name = 'All Races Special March 2000 SD5' , file_name = f_path + i , url = 'http://s1.sos.mo.gov/cmsimages/bluebook/2001-2002/0711-0717.pdf#p715' , parent = None ) else: source_doc_file = f_path + i election.source_doc = Source_Doc.get(Source_Doc.file_name == source_doc_file.replace('txt', 'pdf')) # determine which type of election it is (based on file name) for elec_type in Election_Type.select(): if elec_type.name in i: election.election_type = elec_type # open the file with io.open(f_path + i, mode = 'r', encoding='UTF-8') as f: # declare a line reader so that we can reference line numbers (i.e., index position)
for bill in sb_q: # define the file path file_path = 'past_content/S/' + bill.session.name.replace(' ', '_') + '/co_sponsors/' print 'Getting co-sponsors for {0.bill_type.id} {0.number} from {0.session.year}'.format(bill) print bill.source_doc.url # define the file the_file = file_path+str(bill.bill_type.id)+'_'+ str(bill.number)+'_co_sponsors.htm' # get or create a source_doc record for the co-sponsor page source_doc = get_or_create_source_doc( file_name=the_file , name = '{0.bill_type.id} {0.number} co-sponsors'.format(bill) , session = bill.session , url = bill.co_sponsor_link , parent = bill.source_doc , chamber = 'S' ) content = None # load the content from the co-sponsor page while content == None: try: content = get_content(source_doc, requests_session) except requests.exceptions.ConnectionError as e: print e print ' Connection failed. Retrying...' requests_session = requests.session() except Exception as e:
# Pages for the current session that need a source doc record:
# the member roster and the bill list of each chamber.
current_session_links = [
    'http://house.mo.gov/member.aspx',
    'http://house.mo.gov/billlist.aspx',
    'http://www.senate.mo.gov/16info/SenateRoster.htm',
    'http://www.senate.mo.gov/16info/BTS_Web/BillList.aspx?SessionType=R',
]

for link in current_session_links:
    # every test below is case-insensitive, so lowercase the link once
    link_lc = link.lower()

    doc_data = {
        'parent': None,
        'url': link,
        'session': current_session,
    }

    # chamber code comes from the host name in the link
    if 'house' in link_lc:
        doc_data['chamber'] = 'H'
    elif 'senate' in link_lc:
        doc_data['chamber'] = 'S'

    # document name: '<chamber> Roster' for member pages, '<chamber> bills'
    # for bill lists
    if 'member' in link_lc or 'roster' in link_lc:
        doc_data['name'] = '{} Roster'.format(doc_data['chamber'])
    elif 'bill' in link_lc:
        doc_data['name'] = '{} bills'.format(doc_data['chamber'])

    # local file: <chamber directory>/<name with underscores>.html
    if doc_data['chamber'] == 'H':
        doc_data['file_name'] = '{0}/{1}.html'.format(
            h_dir, doc_data['name'].replace(' ', '_'))
    elif doc_data['chamber'] == 'S':
        doc_data['file_name'] = '{0}/{1}.html'.format(
            s_dir, doc_data['name'].replace(' ', '_'))

    get_or_create_source_doc(**doc_data)

print('fin.')
for bill in sb_q: # define the file path file_path = "past_content/S/" + bill.session.name.replace(" ", "_") + "/co_sponsors/" print "Getting co-sponsors for {0.bill_type.id} {0.number} from {0.session.year}".format(bill) print bill.source_doc.url # define the file the_file = file_path + str(bill.bill_type.id) + "_" + str(bill.number) + "_co_sponsors.htm" # get or create a source_doc record for the co-sponsor page source_doc = get_or_create_source_doc( file_name=the_file, name="{0.bill_type.id} {0.number} co-sponsors".format(bill), session=bill.session, url=bill.co_sponsor_link, parent=bill.source_doc, chamber="S", ) content = None # load the content from the co-sponsor page while content == None: try: content = get_content(source_doc, requests_session) except requests.exceptions.ConnectionError as e: print e print " Connection failed. Retrying..." requests_session = requests.session() except Exception as e:
# Build an Election from one <option> element (`opt`) of the results page.
# The option text is split on ' - ': the first piece is searched for an
# election type name and the last piece is used as the election date.
spl_txt = opt.text.split(' - ')

# set up a new election
election = Election(
    name=opt.text.strip(),
    election_date=spl_txt[-1].strip(),
    opt_value=opt['value'],
    races=[],
)

# get or create the source doc for the election; the local file name is the
# election name with dashes and commas removed and spaces turned into
# underscores
election.source_doc = get_or_create_source_doc(
    source='SoS',
    name=election.name,
    file_name='source_docs/SoS/election_results/html/{}.html'.format(
        election.name.replace('-', '').replace(',', '').replace(' ', '_').replace('__', '_')
    ),
    url='http://enrarchives.sos.mo.gov/enrnet/Default.aspx?eid={}'.format(election.opt_value),
    parent=None,
)

# check each election type...
for elec_type in Election_Type.select():
    if elec_type.name in spl_txt[0]:
        # then set this attribute (no break: the last matching type wins)
        election.election_type = elec_type

# if it's a general election...
# NOTE(review): assumes at least one election type matched above -- confirm
# what Election.election_type holds when none did.
if election.election_type.name == 'General':
    # assume it's for the assembly starting next year: take the first
    # four-digit number in the date text and add one.
    # Raw string so that \d is not treated as an (invalid) string escape.
    election.assembly = Assembly.get(
        start_year=int(re.search(r'\d{4}', election.election_date).group()) + 1
    )

# if it's a special election...
, 'http://www.senate.mo.gov/16info/SenateRoster.htm' , 'http://www.senate.mo.gov/16info/BTS_Web/BillList.aspx?SessionType=R' ] for link in current_session_links: doc_data = { 'parent': None , 'url': link , 'session': current_session } if 'house' in link.lower(): doc_data['chamber'] = 'H' elif 'senate' in link.lower(): doc_data['chamber'] = 'S' if 'member' in link.lower() or 'roster' in link.lower(): doc_data['name'] = '{} Roster'.format(doc_data['chamber']) elif 'bill' in link.lower(): doc_data['name'] = '{} bills'.format(doc_data['chamber']) if doc_data['chamber'] == 'H': doc_data['file_name'] = '{0}/{1}.html'.format(h_dir, doc_data['name'].replace(' ', '_')) elif doc_data['chamber'] == 'S': doc_data['file_name'] = '{0}/{1}.html'.format(s_dir, doc_data['name'].replace(' ', '_')) get_or_create_source_doc(**doc_data) print 'fin.'