Example #1
def reset_state():
    turbotlib.save_var("page", 1)
    turbotlib.save_var("check_count", None)

    try:
        os.remove('%s/records.dump' % turbotlib.data_dir())
    except OSError:
        pass

    try:
        os.remove('%s/individuals.db' % turbotlib.data_dir())
    except OSError:
        pass
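
reset_state() clears the checkpoint variables that the resume logic in process_pages() (Example #7) reads back with turbotlib.get_var(). A minimal sketch of that pairing, assuming get_var raises KeyError when nothing has been saved yet (as the except clause in Example #7 suggests):

import turbotlib

# Save checkpoint state so a later run can resume where this one stopped.
turbotlib.save_var("page", 1)

# On a later run, read it back; a missing variable raises KeyError,
# which is why the resume code falls back to defaults.
try:
    page = turbotlib.get_var("page")
except KeyError:
    page = 1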
Example #3
def process_page(url, page_number, discard_data=False):
    global last_view_state
    global last_validation
    global last_view_generator

    records = []

    req = retrieve(url, "POST", generate_body(page_number))
    last_view_state = urllib.quote(get_asp_resp_var(req.text, "__VIEWSTATE"))
    last_validation = urllib.quote(
        get_asp_resp_var(req.text, "__EVENTVALIDATION"))
    last_view_generator = urllib.quote(
        get_asp_resp_var(req.text, "__VIEWSTATEGENERATOR"))

    if discard_data:
        return req.text

    table = get_result_table(req.text)
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')

        if len(tds) == 2:
            a = tds[0].find('a')
            firm_name = tds[0].text.strip()
            firm_information = process_details(url, a['href'], firm_name)
            details = firm_information['entries']

            primary = {
                'firm': firm_name,
                'all_jurisdictions': tds[1].text.strip(),
                'sample_date': datetime.datetime.now().isoformat(),
                'source_url': url_start,
                'historical_names': firm_information['historical_names']
            }

            if len(details) > 0:
                for detail in details:
                    categories = detail.pop('categories', [])

                    if len(categories) > 0:
                        for category in categories:
                            records.append(
                                json.dumps(
                                    dict(primary.items() + detail.items() +
                                         category.items())))
                    else:
                        records.append(
                            json.dumps(dict(primary.items() + detail.items())))

            else:
                records.append(json.dumps(primary))

    with open('%s/records.dump' % turbotlib.data_dir(), "a") as dump:
        for record in records:
            print record
            # newline-delimited, so the resume path in process_pages() can
            # read the dump back line by line
            dump.write(record + "\n")

    return req.text
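
The nested loops above flatten each record by merging dicts with dict(primary.items() + detail.items() + category.items()), which relies on Python 2's .items() returning lists. A small illustration with made-up values (the right-hand pairs win on key collision; on Python 3 the usual spelling is {**primary, **detail}):

primary = {'firm': 'Example Firm', 'sample_date': '2015-01-01'}  # illustrative values
detail = {'name': 'Jane Doe', 'sample_date': '2015-02-01'}

merged = dict(primary.items() + detail.items())  # Python 2 only
print merged['sample_date']  # '2015-02-01' -- the later pair wins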
Example #4
def download_discount_housing():
    """ download the community banK information and convert it """
    download(DISCOUNT_HOUSES_DOC_LINK, DISCOUNT_HOUSE_LOCATION)
    subprocess.call(['libreoffice',
                     '--headless',
                     '--convert-to',
                     'html',
                     DISCOUNT_HOUSE_LOCATION,
                     '--outdir',
                     turbotlib.data_dir()], stdout=open(os.devnull, 'wb')
                   )
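
The download() helper used here is not defined in any of these snippets. A hypothetical sketch of what it presumably does, using requests (which Example #11 imports); the return value is an assumption:

import requests

def download(url, destination):
    # Hypothetical stand-in for the download() helper used above:
    # fetch the document and write it to the given local path.
    response = requests.get(url)
    with open(destination, 'wb') as outfile:
        outfile.write(response.content)
    return destination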
Example #5
def download_community_banks():
    """ download the community banK information and convert it """
    download(COMMUNITY_BANK_DOC_LINK, COMMUNIY_BANK_LOCATION)
    subprocess.call(['libreoffice',
                     '--headless',
                     '--convert-to',
                     'html',
                     COMMUNIY_BANK_LOCATION,
                     '--outdir',
                     turbotlib.data_dir()], stdout=open(os.devnull, 'wb')
                   )
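
One note on stdout=open(os.devnull, 'wb'): the handle opened inline is never closed. Example #12 wraps the same LibreOffice call in a with block; the pattern could be applied here as well, for instance:

with open(os.devnull, 'wb') as devnull:
    subprocess.call(['libreoffice', '--headless', '--convert-to', 'html',
                     COMMUNIY_BANK_LOCATION, '--outdir', turbotlib.data_dir()],
                    stdout=devnull)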
Example #7
def process_pages(url):

    # Attempt to resume if we can
    try:
        page_number = turbotlib.get_var("page")
        record_count = turbotlib.get_var("check_count")
    except KeyError:
        page_number = 1
        record_count = None

    if page_number > 1:
        turbotlib.log("Resuming run from page {0}".format(page_number))

        with open('%s/records.dump' % turbotlib.data_dir(), "r") as dump:
            # re-emit the records already collected on previous pages
            for record in dump:
                print record.rstrip("\n")

    # iterate over whole or remaining data set
    while record_count is None or (page_number * 100 - 100) < record_count:
        turbotlib.log("Requesting rows %d - %d" % ((page_number * 100 - 100),
                                                   (page_number * 100)))

        # Strange behavior on server: the first call returns page 1 results,
        # but the page parameter must be > 1 to avoid a null response.
        # Subsequent calls work as expected, so this is not a problem.
        response_text = process_page(url,
                                     2 if page_number == 1 else page_number)

        # Ensure the number of records hasn't changed during the run
        check_count = get_record_count(response_text)
        turbotlib.save_var("check_count", check_count)
        if record_count is not None and record_count != check_count:
            reset_state()
            raise Exception(
                "The data set changed during parsing; a re-run is needed.")
        else:
            record_count = check_count

        if not record_count > 0:
            raise Exception("The data set is empty.")

        page_number += 1
        turbotlib.save_var("page", page_number)

    turbotlib.log("Run finished!")
    reset_state()
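
The loop requests rows in windows of 100: page n covers rows n*100 - 100 up to n*100, and it stops once the window start reaches the saved record count. A small worked illustration with a hypothetical count of 250 records:

record_count = 250  # hypothetical total, as reported by get_record_count()
page_number = 1
while (page_number * 100 - 100) < record_count:
    print "Requesting rows %d - %d" % (page_number * 100 - 100, page_number * 100)
    page_number += 1
# Requests rows 0 - 100, 100 - 200 and 200 - 300: three pages for 250 records.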
Example #9
def extract_community_bank_data():
    """ Extracts the data from the downloaded html"""
    with open("%s/comunity_banks.html" % turbotlib.data_dir()) as infile:
        html_content = infile.read()
        content = BeautifulSoup(html_content)
        table = content.find("table")
    for j, row in enumerate(table("tr")):
        # skip the headings
        if j == 0:
            continue
        data = {}
        for i, cell in enumerate(row("td")):
            strings = [string for string in cell.strings]
            value = clean(strings[1])
            if i == 0:
                data['name'] = value
            elif i == 1:
                data['address'] = value
        data['type'] = 'Community Bank'
        data['sample_date'] = SAMPLE_DATE
        data['source_url'] = COMMUNITY_BANK_DOC_LINK
        print json.dumps(data)
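
The clean() helper is referenced but not defined in these snippets; judging by the strings BeautifulSoup yields here, it presumably normalises whitespace. A hypothetical sketch:

def clean(value):
    # Hypothetical stand-in for the clean() helper used above:
    # collapse runs of whitespace (including non-breaking spaces) and strip the ends.
    return " ".join(value.replace(u"\xa0", " ").split())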
Example #10
    reset_state()


def dict_factory(cursor, row):
    d = {}
    for idx, col in enumerate(cursor.description):
        d[col[0]] = row[idx]
    return d


# ----------------------------------------------------------------------------------------------------------------------

turbotlib.log("Starting run...")

# create individuals cache
usersDB = sqlite3.connect('%s/individuals.db' % turbotlib.data_dir())
usersDB.row_factory = dict_factory
usersDB.execute(
    "CREATE TABLE IF NOT EXISTS individuals(jurisdiction, name, firm, terms, contact, categories)"
)
usersDB.commit()

turbotlib.log("Getting initial view state...")
init_req = retrieve(url_start, "GET", "")
document = BeautifulSoup(init_req.text)

last_view_state = urllib.quote(document.find(id='__VIEWSTATE')['value'])
last_validation = urllib.quote(document.find(id='__EVENTVALIDATION')['value'])
last_view_generator = urllib.quote(
    document.find(id='__VIEWSTATEGENERATOR')['value'])

process_page(url_start, 1, True)  # first request returns junk data, discard it
process_pages(url_start)
usersDB.close()
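
With row_factory set to dict_factory, rows come back from SQLite as plain dicts keyed by column name instead of positional tuples. A self-contained usage sketch of the dict_factory defined above, using an in-memory database and made-up values:

import sqlite3

db = sqlite3.connect(":memory:")
db.row_factory = dict_factory
db.execute("CREATE TABLE individuals(jurisdiction, name, firm, terms, contact, categories)")
db.execute("INSERT INTO individuals VALUES (?, ?, ?, ?, ?, ?)",
           ("ON", "Jane Doe", "Example Firm", "", "", "[]"))  # illustrative row
row = db.execute("SELECT * FROM individuals").fetchone()
print row["name"]  # 'Jane Doe' -- dict access instead of row[1]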
Example #11
http://missions.opencorporates.com/missions/768
"""
import requests
import xlrd
import json
import datetime
import turbotlib
import subprocess
import re
import os
from bs4 import BeautifulSoup

DOCUMENT_LINK = "http://www.bsl.gov.sl/Directory_of_Financial_&_" \
                + "Non-Bank_Financial_Institutions/" \
                + "COMMERCIAL_BANKS_&_ADDRESSES.xls"
SHEET_LOCATION = "%s/sheet.xls" % turbotlib.data_dir()
SOURCES = ["http://www.bsl.gov.sl/ntl_lottery.html",
           "http://www.bsl.gov.sl/housing_fin.html",
           "http://www.bsl.gov.sl/insurance_cos.html",
           "http://www.bsl.gov.sl/savings_loans.html",
           "http://www.bsl.gov.sl/finance_houses.html"
          ]
COMMUNITY_BANK_DOC_LINK = "http://www.bsl.gov.sl/Directory_of_Financial_&_Non-Bank_Financial_Institutions/COMMUNITY_BANKS_&_ADDRESSES.doc"
COMMUNIY_BANK_LOCATION = "%s/comunity_banks.doc" % turbotlib.data_dir()

DISCOUNT_HOUSES_DOC_LINK = "http://www.bsl.gov.sl/Directory_of_Financial_&_Non-Bank_Financial_Institutions/DISCOUNT_HOUSES_MFIs_&_MORTGAGE_&_SAVINGS_COMPANIES_&_ADDRESSES.doc"
DISCOUNT_HOUSE_LOCATION = "%s/discount_houses.doc" % turbotlib.data_dir()

SAMPLE_DATE = datetime.date.today().isoformat()

def download_community_banks():
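
parse_excel(), used in Example #12, is not shown in these snippets. A minimal sketch of reading a downloaded sheet with xlrd (imported above); the function name and behaviour here are assumptions:

import xlrd

def read_rows(excel_bytes):
    # Hypothetical reader: Example #12 passes the raw response content,
    # so open the workbook from bytes rather than from a file path.
    book = xlrd.open_workbook(file_contents=excel_bytes)
    sheet = book.sheet_by_index(0)
    for row_index in range(sheet.nrows):
        yield [cell.value for cell in sheet.row(row_index)]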
Example #12
					formatting = True

				source_url = base_url + link['href']
				excel_file = requests.get(source_url).content
				excel_results = parse_excel(excel_file, formatting, source_url, source['category'])
				for result in excel_results:
					if ('name' in result):
						print(json.dumps(result))

		#same sort of idea but for word files
		if (source['file'] == "word"):
			if ('.doc' in link['href']):
				#download the file to the turbot working directory
				source_url = base_url + link['href']
				file_name = link['href'].split("/")[-1]
				word_location = turbotlib.data_dir() + "/" + file_name
				word_file = download(source_url, word_location)

				#find out what the html file will be called
				file_extension_start = word_location.rfind(".")
				file_basename = word_location[:file_extension_start]
				html_location = file_basename + ".html"
				
				#convert the file to html using libreoffice
				with open(os.devnull, "w") as fnull:
					subprocess.call(['libreoffice', '--headless', '--convert-to', 'html', word_location, '--outdir', turbotlib.data_dir()], stdout=fnull)
				print(os.path.isfile(html_location))
				html_file = open(html_location, "r")
				html_document = BeautifulSoup(html_file)
				
				#now process the document and print it out
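
To make the html_location derivation above concrete, for a hypothetical link ending in COMMUNITY_BANKS.doc the snippet computes:

word_location = turbotlib.data_dir() + "/COMMUNITY_BANKS.doc"  # hypothetical file name
file_extension_start = word_location.rfind(".")                # index of the final dot
file_basename = word_location[:file_extension_start]           # ".../COMMUNITY_BANKS"
html_location = file_basename + ".html"                        # ".../COMMUNITY_BANKS.html"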
    turbotlib.log("Run finished!")
    reset_state()


def dict_factory(cursor, row):
    d = {}
    for idx, col in enumerate(cursor.description):
        d[col[0]] = row[idx]
    return d

# ----------------------------------------------------------------------------------------------------------------------

turbotlib.log("Starting run...")

# create individuals cache
usersDB = sqlite3.connect('%s/individuals.db' % turbotlib.data_dir())
usersDB.row_factory = dict_factory
usersDB.execute("CREATE TABLE IF NOT EXISTS individuals(jurisdiction, name, firm, terms, contact, categories)")
usersDB.commit()

turbotlib.log("Getting initial view state...")
init_req      = retrieve(url_start, "GET", "")
document = BeautifulSoup(init_req.text)

last_view_state     = urllib.quote(document.find(id='__VIEWSTATE')['value'])
last_validation     = urllib.quote(document.find(id='__EVENTVALIDATION')['value'])
last_view_generator = urllib.quote(document.find(id='__VIEWSTATEGENERATOR')['value'])

process_page(url_start, 1, True) # first request returns junk data, discard it
process_pages(url_start)
usersDB.close()