def _ftp_listing_lines(url):
    """Return the entries of the FTP listing at *url*, one string per line."""
    # str() of the fetched listing is split on the literal four characters
    # backslash-r-backslash-n, i.e. the code works on the repr of the data --
    # presumably keep_trying_ftpopen returns bytes; TODO confirm.
    # The fragment after the last line break is not a listing entry, so it is
    # dropped.
    return str(keep_trying_ftpopen(url)).split('\\r\\n')[:-1]


def bill_search_history_crawler(session):
    """Download bill-history XML and bill text for every bill in a session.

    Walks the bill-history FTP tree for the senate and the house, e.g.
    'ftp://ftp.legis.state.tx.us/bills/84R/billhistory/house_bills/HB00001_HB00099/',
    and writes every bill-history XML file that is not yet present under
    ``save_path``. For each newly saved XML file the bill text is obtained
    from ``scrape_bill_text`` and written next to it unless that file already
    exists.

    Args:
        session: Session identifier interpolated into the FTP URL (the
            example URL above uses '84R').
    """
    bill_hist_url_unformatted = ('ftp://ftp.legis.state.tx.us/bills/'
                                 '{0}/billhistory/{1}_bills/')
    # Compiled once; group(0) is the file name, group(1) the bill number.
    filename_pattern = re.compile(r'[S,H]B (\d+)\.xml$')

    for chamber in ('senate', 'house'):
        # Abbreviate chamber ('S' / 'H') for the chamber_origin field below.
        ch_abbr = chamber[0].upper()

        bill_hist_chamber_url = bill_hist_url_unformatted.format(session, chamber)
        # The last 15 characters of each listing line are taken as the
        # sub-directory name (e.g. 'HB00001_HB00099').
        bill_hist_folder_urls = [
            bill_hist_chamber_url + entry[-15:] + '/'
            for entry in _ftp_listing_lines(bill_hist_chamber_url)
        ]

        for bill_hist_folder_url in bill_hist_folder_urls:
            for line in _ftp_listing_lines(bill_hist_folder_url):
                match = filename_pattern.search(line)
                if match is None:
                    # Not a bill-history XML entry; skipping it keeps one
                    # unexpected listing line from aborting the whole crawl.
                    continue

                filename = match.group(0)
                if isfile(save_path + filename):
                    continue

                # Get XML data
                xml_data = keep_trying_ftpopen(bill_hist_folder_url + filename)
                # Explicit encoding so the write does not depend on the
                # platform's default locale.
                with open(save_path + filename, 'w',
                          encoding='utf-8') as bill_xml_file:
                    bill_xml_file.write(xml_data.decode("utf-8"))

                # Get bill_text
                bill_query = {
                    'session': session,
                    'chamber_origin': ch_abbr,
                    'number': int(match.group(1)),
                }
                bill_text, bill_text_filename = scrape_bill_text(bill_query)
                if not isfile(save_path + bill_text_filename):
                    with open(save_path + bill_text_filename, 'w',
                              encoding='utf-8') as bill_text_file:
                        bill_text_file.write(bill_text.decode("utf-8"))
def create(request):
    """Look up (or scrape and store) the bill described by the POSTed form.

    The POST data is validated with ``BillForm``. A bill matching the
    submitted chamber_origin and number is taken from the database when one
    exists; otherwise its text is fetched with ``scrape_bill_text``, passed
    through ``htmltext`` and saved as a new ``Bill``.

    Returns:
        * a redirect to '/' when the form is invalid or no bill text could
          be scraped;
        * the bill serialized in the requested format when the POST data
          contains a 'format' key;
        * otherwise a redirect to the bill's page, '/bills/<chamber>B<number>/'.
    """
    # Imported locally, as in the rest of this view.
    from annotation_app.forms import BillForm

    form = BillForm(request.POST)
    if not form.is_valid():
        return HttpResponseRedirect('/')

    data = form.cleaned_data
    # NOTE(review): the lookup does not filter on data['session'] -- confirm
    # that chamber_origin + number is meant to identify a bill on its own.
    matches = Bill.objects.filter(chamber_origin=data['chamber_origin'],
                                  number=data['number'])

    if matches:
        # If the bill is in the database, it is the first element in QuerySet.
        bill = matches[0]
    else:
        # If bill is not in the database, pull it from TLO website.
        from annotation_app.helpers.bill_scrapers import scrape_bill_text
        # NOTE(review): bill_search_history_crawler unpacks this function's
        # result as a (text, filename) pair, whereas here it is treated as
        # the text itself -- confirm which contract is current.
        bill_text = scrape_bill_text(data)
        if bill_text is None:
            return HttpResponseRedirect('/')

        from annotation_app.helpers.htmllogic import htmltext
        bill = Bill()
        bill.session = data['session']
        bill.chamber_origin = data['chamber_origin']
        bill.number = data['number']
        bill.text = htmltext(bill_text)
        bill.save()

    if 'format' in request.POST:
        # NOTE(review): the serializer name comes straight from the request;
        # an unknown format is not handled here.
        return HttpResponse(
            serializers.serialize(request.POST['format'], [bill]))
    return HttpResponseRedirect(
        '/bills/%sB%d/' % (bill.chamber_origin, bill.number))