def _ftp_listing_lines(raw_listing):
  """Split a raw FTP directory listing into its non-empty lines.

  The payload is decoded before splitting so entries are separated on the real
  line terminators (instead of on the escaped text of a bytes repr), and the
  last entry is kept even when the listing has no trailing newline.

  Args:
    raw_listing: the payload fetched for a directory URL; bytes are decoded
      as UTF-8, anything else is converted with str(). A falsy value (None,
      empty payload) produces an empty list.

  Returns:
    A list of listing lines with trailing whitespace removed.
  """
  if not raw_listing:
    return []
  if isinstance(raw_listing, bytes):
    text = raw_listing.decode('utf-8', 'replace')
  else:
    text = str(raw_listing)
  return [line.rstrip() for line in text.splitlines() if line.strip()]


def bill_search_history_crawler(session):
  """Download bill-history XML and bill text for every bill of a session.

  For both chambers, walks the bill-history folders on the TLO FTP server,
  saves each bill-history XML file under save_path (unless it is already
  there), then scrapes the matching bill text and saves it under the filename
  reported by scrape_bill_text (again, unless it is already there).

  Note that the bill text is scraped for every bill, even when its file is
  already saved, because the target filename is only known after scraping.

  Args:
    session: session identifier inserted into the FTP path, e.g. '84R'.

  Returns:
    None. The results are the files written under save_path.
  """
  # example full URL:
  # 'ftp://ftp.legis.state.tx.us/bills/84R/billhistory/house_bills/HB00001_HB00099/'
  bill_hist_url_unformatted = ('ftp://ftp.legis.state.tx.us/bills/'
    '{0}/billhistory/{1}_bills/')

  # A bill-history listing line ends with the file name (e.g. 'HB 1.xml');
  # group 0 is that file name and group 1 is the bill number.
  filename_pattern = re.compile(r'[SH]B (\d+)\.xml$')

  for chamber in ('senate', 'house'):
    # Abbreviate chamber for chamber_origin data later
    ch_abbr = chamber[0].upper()

    # Get list of directory entries in the bill_history chamber folder
    bill_hist_chamber_url = bill_hist_url_unformatted.format(session, chamber)
    folder_lines = _ftp_listing_lines(
      keep_trying_ftpopen(bill_hist_chamber_url))

    # Grab bill_history data from each directory within bill_history path
    for folder_line in folder_lines:
      # The last 15 characters of a listing line are taken as the folder
      # name (same length as 'HB00001_HB00099' in the example URL above).
      bill_hist_folder_url = bill_hist_chamber_url + folder_line[-15:] + '/'

      # Get bill_history filenames
      file_lines = _ftp_listing_lines(
        keep_trying_ftpopen(bill_hist_folder_url))
      for line in file_lines:
        filename_match = filename_pattern.search(line)
        if filename_match is None:
          # Not a bill-history XML entry; skip it instead of crashing.
          continue
        filename = filename_match.group(0)

        if not isfile(save_path + filename):
          # Get XML data
          bill_xml = keep_trying_ftpopen(bill_hist_folder_url + filename)
          # Explicit encoding: the payload is UTF-8 and must not be re-encoded
          # with the platform default.
          with open(save_path + filename, 'w',
              encoding='utf-8') as bill_xml_file:
            bill_xml_file.write(bill_xml.decode('utf-8'))

        # Get bill_text
        bill_query = {
          'session': session,
          'chamber_origin': ch_abbr,
          'number': int(filename_match.group(1))
        }
        scraped = scrape_bill_text(bill_query)
        if scraped is None:
          # NOTE(review): create() treats a None return from scrape_bill_text
          # as "bill text unavailable"; skip here rather than fail on
          # unpacking. Confirm against scrape_bill_text's actual contract.
          continue
        bill_text, bill_text_filename = scraped

        if not isfile(save_path + bill_text_filename):
          with open(save_path + bill_text_filename, 'w',
              encoding='utf-8') as bill_text_file:
            bill_text_file.write(bill_text.decode('utf-8'))
def create(request):
  """Find or scrape the bill described by the POSTed form, then respond.

  The POST data is validated with BillForm. An existing Bill with the same
  chamber_origin and number is reused; otherwise the bill text is scraped,
  converted with htmltext, and saved as a new Bill.

  Args:
    request: the incoming request; request.POST carries the BillForm fields
      and, optionally, a 'format' key naming a serializer format.

  Returns:
    - a redirect to '/' when the form is invalid or no bill text was scraped;
    - the serialized bill when 'format' is present in the POST data;
    - otherwise a redirect to '/bills/<chamber_origin>B<number>/'.
  """
  from annotation_app.forms import BillForm

  form = BillForm(request.POST)
  if not form.is_valid():
    return HttpResponseRedirect('/')

  data = form.cleaned_data
  # NOTE(review): the lookup ignores data['session'], so a bill with the same
  # chamber and number from another session would be reused. Confirm whether
  # that is intended before changing it; the redirect URL below carries no
  # session either.
  matches = Bill.objects.filter(chamber_origin=data['chamber_origin'],
    number=data['number'])

  # If the bill is in the database, it is the first element in the QuerySet
  if matches:
    bill = matches[0]
  # If the bill is not in the database, pull it from the TLO website
  else:
    from annotation_app.helpers.bill_scrapers import scrape_bill_text

    scraped = scrape_bill_text(data)
    # The crawler in this file unpacks scrape_bill_text's result as
    # (text, filename); accept that shape as well as a bare text value.
    bill_text = scraped[0] if isinstance(scraped, tuple) else scraped
    if bill_text is None:
      return HttpResponseRedirect('/')

    from annotation_app.helpers.htmllogic import htmltext

    bill = Bill()
    bill.session = data['session']
    bill.chamber_origin = data['chamber_origin']
    bill.number = data['number']
    bill.text = htmltext(bill_text)
    bill.save()

  if 'format' in request.POST:
    # NOTE(review): the format name comes straight from the client; an
    # unknown name presumably makes serialize() raise. Consider validating it.
    return HttpResponse(serializers.serialize(request.POST['format'],
      [bill]))

  return HttpResponseRedirect('/bills/%sB%d/' % (bill.chamber_origin,
    bill.number))