def reset_state():
    # wipe any partial progress so the next run starts from scratch
    turbotlib.save_var("page", 1)
    turbotlib.save_var("check_count", None)
    try:
        os.remove('%s/records.dump' % turbotlib.data_dir())
    except OSError:
        pass
    try:
        os.remove('%s/individuals.db' % turbotlib.data_dir())
    except OSError:
        pass
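# `retrieve` (used throughout this scraper) is defined elsewhere. The sketch
# below is only an illustration of what such a GET/POST helper might look
# like with requests; the name, headers and lack of retries are assumptions,
# not the real implementation.
def _example_retrieve(url, method, body):
    if method == "POST":
        # echo the url-encoded ASP.NET form body back to the server
        return requests.post(url, data=body,
                             headers={'Content-Type': 'application/x-www-form-urlencoded'})
    return requests.get(url)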
def process_page(url, page_number, discard_data=False):
    global last_view_state
    global last_validation
    global last_view_generator
    records = []
    req = retrieve(url, "POST", generate_body(page_number))
    # capture the ASP.NET state fields so the next POST can echo them back
    last_view_state = urllib.quote(get_asp_resp_var(req.text, "__VIEWSTATE"))
    last_validation = urllib.quote(get_asp_resp_var(req.text, "__EVENTVALIDATION"))
    last_view_generator = urllib.quote(get_asp_resp_var(req.text, "__VIEWSTATEGENERATOR"))
    if discard_data:
        return req.text
    table = get_result_table(req.text)
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) == 2:
            a = tds[0].find('a')
            firm_name = tds[0].text.strip()
            firm_information = process_details(url, a['href'], firm_name)
            details = firm_information['entries']
            primary = {'firm': firm_name,
                       'all_jurisdictions': tds[1].text.strip(),
                       'sample_date': datetime.datetime.now().isoformat(),
                       'source_url': url_start,
                       'historical_names': firm_information['historical_names']}
            if len(details) > 0:
                for detail in details:
                    categories = detail.pop('categories', [])
                    if len(categories) > 0:
                        for category in categories:
                            records.append(json.dumps(dict(primary.items() + detail.items() + category.items())))
                    else:
                        records.append(json.dumps(dict(primary.items() + detail.items())))
            else:
                records.append(json.dumps(primary))
    with open('%s/records.dump' % turbotlib.data_dir(), "a") as dump:
        for record in records:
            print record
            # one JSON record per line so the resume logic can re-read the dump
            dump.write(record + "\n")
    return req.text
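# `get_asp_resp_var` (defined elsewhere) pulls the ASP.NET hidden form fields
# (__VIEWSTATE, __EVENTVALIDATION, __VIEWSTATEGENERATOR) out of a response so
# they can be echoed back in the next POST. A minimal sketch, assuming each
# field is a hidden <input> with a matching id; illustrative only, not the
# real helper.
def _example_get_hidden_field(html, field_id):
    document = BeautifulSoup(html)
    field = document.find('input', id=field_id)
    return field['value'] if field is not None else ''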
def download_discount_housing():
    """Download the discount house information and convert it to HTML."""
    download(DISCOUNT_HOUSES_DOC_LINK, DISCOUNT_HOUSE_LOCATION)
    subprocess.call(['libreoffice', '--headless', '--convert-to', 'html',
                     DISCOUNT_HOUSE_LOCATION, '--outdir', turbotlib.data_dir()],
                    stdout=open(os.devnull, 'wb'))
def download_community_banks():
    """Download the community bank information and convert it to HTML."""
    download(COMMUNITY_BANK_DOC_LINK, COMMUNIY_BANK_LOCATION)
    subprocess.call(['libreoffice', '--headless', '--convert-to', 'html',
                     COMMUNIY_BANK_LOCATION, '--outdir', turbotlib.data_dir()],
                    stdout=open(os.devnull, 'wb'))
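# The `download` helper used by the two converters above is defined elsewhere
# in this scraper. A minimal sketch, assuming a plain GET with the body
# written straight to disk; the name and behavior are illustrative.
def _example_download(url, location):
    response = requests.get(url)
    with open(location, 'wb') as out:
        out.write(response.content)
    return location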
def process_pages(url):
    # Attempt to resume if we can
    try:
        page_number = turbotlib.get_var("page")
        record_count = turbotlib.get_var("check_count")
    except KeyError:
        page_number = 1
        record_count = None
    if page_number > 1:
        turbotlib.log("Resuming run from page {0}".format(page_number))
        # re-emit the records collected before the interruption
        with open('%s/records.dump' % turbotlib.data_dir(), "r") as dump:
            for record in dump:
                print record.strip()
    # iterate over the whole or remaining data set
    while record_count is None or (page_number * 100 - 100) < record_count:
        turbotlib.log("Requesting rows %d - %d" % ((page_number * 100 - 100),
                                                   (page_number * 100)))
        # Strange behavior on the server: the first call returns page 1 results,
        # but the requested page must be > 1 to avoid a null response.
        # This is harmless and subsequent calls work as expected.
        response_text = process_page(url, 2 if page_number == 1 else page_number)
        # Ensure the number of records hasn't changed during the run
        check_count = get_record_count(response_text)
        turbotlib.save_var("check_count", check_count)
        if record_count is not None and record_count != check_count:
            reset_state()
            raise Exception("The data set changed during parsing, we need a re-run.")
        else:
            record_count = check_count
        if not record_count > 0:
            raise Exception("The data set is empty.")
        page_number += 1
        turbotlib.save_var("page", page_number)
    turbotlib.log("Run finished!")
    reset_state()
def extract_community_bank_data():
    """Extract the data from the downloaded HTML."""
    with open("%s/comunity_banks.html" % turbotlib.data_dir()) as infile:
        html_content = infile.read()
    content = BeautifulSoup(html_content)
    table = content.find("table")
    for j, row in enumerate(table("tr")):
        # skip the headings
        if j == 0:
            continue
        data = {}
        for i, cell in enumerate(row("td")):
            strings = [string for string in cell.strings]
            value = clean(strings[1])
            if i == 0:
                data['name'] = value
            elif i == 1:
                data['address'] = value
        data['type'] = 'Community Bank'
        data['sample_date'] = SAMPLE_DATE
        data['source_url'] = COMMUNITY_BANK_DOC_LINK
        print json.dumps(data)
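# `clean` is defined elsewhere in this scraper. A sketch of the kind of
# normalization it presumably applies to strings pulled from the converted
# HTML (dropping non-breaking spaces, collapsing whitespace); this is an
# assumption, not the real helper.
def _example_clean(value):
    value = value.replace(u'\xa0', ' ')  # non-breaking spaces from the LibreOffice output
    return re.sub(r'\s+', ' ', value).strip()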
http://missions.opencorporates.com/missions/768
"""
import requests
import xlrd
import json
import datetime
import turbotlib
import subprocess
import re
import os
from bs4 import BeautifulSoup

DOCUMENT_LINK = "http://www.bsl.gov.sl/Directory_of_Financial_&_" \
    + "Non-Bank_Financial_Institutions/" \
    + "COMMERCIAL_BANKS_&_ADDRESSES.xls"
SHEET_LOCATION = "%s/sheet.xls" % turbotlib.data_dir()
SOURCES = ["http://www.bsl.gov.sl/ntl_lottery.html",
           "http://www.bsl.gov.sl/housing_fin.html",
           "http://www.bsl.gov.sl/insurance_cos.html",
           "http://www.bsl.gov.sl/savings_loans.html",
           "http://www.bsl.gov.sl/finance_houses.html"]
COMMUNITY_BANK_DOC_LINK = "http://www.bsl.gov.sl/Directory_of_Financial_&_Non-Bank_Financial_Institutions/COMMUNITY_BANKS_&_ADDRESSES.doc"
COMMUNIY_BANK_LOCATION = "%s/comunity_banks.doc" % turbotlib.data_dir()
DISCOUNT_HOUSES_DOC_LINK = "http://www.bsl.gov.sl/Directory_of_Financial_&_Non-Bank_Financial_Institutions/DISCOUNT_HOUSES_MFIs_&_MORTGAGE_&_SAVINGS_COMPANIES_&_ADDRESSES.doc"
DISCOUNT_HOUSE_LOCATION = "%s/discount_houses.doc" % turbotlib.data_dir()
SAMPLE_DATE = datetime.date.today().isoformat()
                formatting = True
                source_url = base_url + link['href']
                excel_file = requests.get(source_url).content
                excel_results = parse_excel(excel_file, formatting,
                                            source_url, source['category'])
                for result in excel_results:
                    if ('name' in result):
                        print(json.dumps(result))
        # same sort of idea but for word files
        if (source['file'] == "word"):
            if ('.doc' in link['href']):
                # download the file to the turbot working directory
                source_url = base_url + link['href']
                file_name = link['href'].split("/")[-1]
                word_location = turbotlib.data_dir() + "/" + file_name
                word_file = download(source_url, word_location)
                # find out what the html file will be called
                file_extension_start = word_location.rfind(".")
                file_basename = word_location[:file_extension_start]
                html_location = file_basename + ".html"
                # convert the file to html using libreoffice
                with open(os.devnull, "w") as fnull:
                    subprocess.call(['libreoffice', '--headless', '--convert-to',
                                     'html', word_location, '--outdir',
                                     turbotlib.data_dir()],
                                    stdout=fnull)
                # log (rather than print) the conversion check so stdout stays JSON-only
                turbotlib.log("Converted file exists: %s" % os.path.isfile(html_location))
                html_file = open(html_location, "r")
                html_document = BeautifulSoup(html_file)
                # now process the document and print it out
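# `parse_excel` is defined elsewhere in this scraper. A rough sketch of how a
# sheet can be walked with xlrd; the column layout assumed here (name in the
# first column) and the returned keys are illustrative, not the real parser.
def _example_parse_excel(contents, source_url, category):
    book = xlrd.open_workbook(file_contents=contents)
    sheet = book.sheet_by_index(0)
    results = []
    for row_number in range(1, sheet.nrows):  # skip the heading row
        name = sheet.cell_value(row_number, 0)
        if name:
            results.append({'name': name,
                            'category': category,
                            'source_url': source_url,
                            'sample_date': SAMPLE_DATE})
    return results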
turbotlib.log("Run finished!") reset_state() def dict_factory(cursor, row): d = {} for idx, col in enumerate(cursor.description): d[col[0]] = row[idx] return d # ---------------------------------------------------------------------------------------------------------------------- turbotlib.log("Starting run...") # create individuals cache usersDB = sqlite3.connect('%s/individuals.db' % turbotlib.data_dir()) usersDB.row_factory = dict_factory usersDB.execute("CREATE TABLE IF NOT EXISTS individuals(jurisdiction, name, firm, terms, contact, categories)") usersDB.commit() turbotlib.log("Getting initial view state...") init_req = retrieve(url_start, "GET", "") document = BeautifulSoup(init_req.text) last_view_state = urllib.quote(document.find(id='__VIEWSTATE')['value']) last_validation = urllib.quote(document.find(id='__EVENTVALIDATION')['value']) last_view_generator = urllib.quote(document.find(id='__VIEWSTATEGENERATOR')['value']) process_page(url_start, 1, True) # first request returns junk data, discard it process_pages(url_start) usersDB.close()