Example #1
def find_daily_links(soup, last_seen_ref):
    '''
        Get the list of daily links; the current and previous months should be available.
        Older links are available in zip archives, one per month.
    '''
    turbotlib.log('Getting daily links')
    re_target_pattern = re.compile(r'.*FAC(?P<date>[0-9]{4}).HTML', re.IGNORECASE)
    re_year_pattern = re.compile(r'[a-z]+ ([0-9]{4})$', re.IGNORECASE)
    day_links = []
    for table in soup.find_all('table', {'class': 'telerik-reTable-1'}):
        table_header = table.find_next('td')
        try:
            # Blah, a nasty hack to get the year since it looks like they reuse URLs yearly
            year = re.match(re_year_pattern, table_header.string).groups()[0]
        except AttributeError:
            # This must be the day of week table
            continue
        for link in table.find_all('a'):
            href = link.get('href')
            interesting_link = re.match(re_target_pattern, href)
            if interesting_link:
                date_ref = year + interesting_link.groups()[0]
                if date_ref > last_seen_ref:
                    day_links.append((date_ref, href))

    turbotlib.log('There are %s links to check' % len(day_links))
    return day_links
Example #2
def retrieve(url, method, data, attempt=1):
    response = None
    connection_exception = False
    headers = {"X-MicrosoftAjax": "Delta=true",
               "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
               "Accept": "*/*",
               "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 "
                             "(KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
               "Cache-Control": "no-cache",
               "Pragma": "no-cache"}

    try:
        req = requests.Request(method, url, data=data, headers=headers)
        prepared = req.prepare()
        response = session.send(prepared)

    except requests.exceptions.RequestException:
        connection_exception = True

    if (connection_exception or response.status_code != requests.codes.ok) and attempt <= 5:
        turbotlib.log("There was a failure reaching or understanding the host, waiting and retrying...")

        if response is not None and response.text is not None:
            turbotlib.log("Failure was: " + response.text)

        time.sleep(attempt * 5)
        return retrieve(url, method, data, attempt + 1)

    return response
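
# Usage sketch (not from the original scraper): retrieve() depends on a module-level
# requests.Session named 'session', and sleeps attempt * 5 seconds between retries,
# i.e. 5, 10, 15, 20 and 25 seconds (75 seconds of waiting in the worst case).
#
#   session = requests.Session()
#   resp = retrieve(some_url, "POST", body)   # some_url and body are placeholders
#   if resp is not None and resp.status_code == requests.codes.ok:
#       handle(resp.text)                     # handle() is hypothetical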
Example #3
def get_soup(url, session=None):
    turbotlib.log('Fetching %s' % url)
    if not session:
        session = requests.Session()
    response = session.get(url)
    html = response.content
    return BeautifulSoup(html)
Example #4
def get_soup(url, session=None):
    if not session:
        session = requests.Session()
    turbotlib.log('Getting soup for %s' % url)
    response = session.get(url)
    html = response.content
    return BeautifulSoup(html)
Example #5
def get_registered_individuals(url, control_href, view_state, firm_jurisdiction, firm_name):
    return_array = []
    turbotlib.log("Retrieving individuals for current or historical firm: " + firm_name + " in: " + firm_jurisdiction)

    control_id = urllib.quote(control_href.replace("javascript:__doPostBack('", '').replace("','')", ''))
    individuals_page_req = retrieve(url, "POST", generate_body_control(control_id, view_state))

    if "Your search returned no records, please try searching again" in individuals_page_req.text:
        return []

    num_individuals = get_record_count(individuals_page_req.text)
    processed_individuals = 0
    last_processed_individuals = 0
    ind_page = 1

    while True:
        individuals_view_state = {'view'      : urllib.quote(get_asp_resp_var(individuals_page_req.text, "__VIEWSTATE")),
                                  'validation': urllib.quote(get_asp_resp_var(individuals_page_req.text, "__EVENTVALIDATION")),
                                  'generator' : urllib.quote(get_asp_resp_var(individuals_page_req.text, "__VIEWSTATEGENERATOR"))}

        individual_links = BeautifulSoup(individuals_page_req.text).select('tr > td > a')
        for link in individual_links:
            try:
                if "lbtnIndDetail" not in link['href']:
                    continue
            except:
                continue

            processed_individuals += 1

            name = link.text.strip()
            individual_dict = get_individual(name, firm_jurisdiction, firm_name)

            if individual_dict is None:
                get_and_store_individuals_for_firm(link, url, individuals_view_state, name)

                individual_dict = get_individual(name, firm_jurisdiction, firm_name)
                if individual_dict is not None:
                    return_array.append(individual_dict)

            else:
                return_array.append(individual_dict)

        if processed_individuals < num_individuals:
            if last_processed_individuals == processed_individuals:
                turbotlib.log('Warning: broke out of possible infinite loop trying to retrieve all individuals for firm.')
                break

            ind_page += 1
            control_id = urllib.quote('ctl00$bodyContent$lbtnPager{0}'.format(ind_page))
            individuals_page_req = retrieve(url, "POST", generate_body_control(control_id, individuals_view_state))

            last_processed_individuals = processed_individuals
        else:
            break

    return return_array
Example #6
def unwrap(response, identifier):
	try:
		r1 = getChunk(response, "<fragment><![CDATA[", identifier)
	except:
		turbotlib.log("Didn't find fragment")
		return None
	try: 
		r2 = getChunk(r1, "]]></fragment>",identifier)
		return r2
	except:
		turbotlib.log("Didn't find fragment")
		return None
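
# Minimal sketch of the same idea (an assumption, since getChunk() is not shown here):
# slice out the text between the CDATA markers directly; the identifier argument of
# unwrap() is ignored in this simplified version.
def unwrap_simple(response):
	start = response.find("<fragment><![CDATA[")
	end = response.find("]]></fragment>")
	if start == -1 or end == -1:
		return None
	return response[start + len("<fragment><![CDATA["):end]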
Example #7
def date_formatter(date):
    if date is None or len(date) == 0:
        return None

    try:
        time = datetime.strptime(date, "%B %d, %Y").isoformat()[:-9]
        if len(time) <= 1:
            turbotlib.log("Failure parsing date: " + date)
            return None

        return time
    except:
        turbotlib.log("Failure parsing date: " + date)
        return None
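
# Quick check (illustration only): strptime("%B %d, %Y") followed by isoformat()[:-9]
# leaves just the date portion of the ISO timestamp.
assert date_formatter("January 5, 2015") == "2015-01-05"
assert date_formatter("") is None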
Example #8
def run_scraper():
    "Initialize and run"

    links = CompanyLinks(
        "%s%s%s" % ("https://www.og.decc.gov.uk/", "eng/fox/decc/PED301X/",
                    "companyBlocksNav"))
    for i, j in enumerate(links.get_clean_links()):
        info = CompanyInfo(j)
        try:
            turbotlib.log("progress: %s" % i)
            for record in info.scrape():
                print json.dumps(record)
        except AttributeError as e:
            ## Uncomment the following to see empty entries
            ## that cause failures
            #print "Fails at %s as %s" %(i, e)
            pass
Example #9
def main():
    """ Scrape licensed mortgage lenders data from extranet.dfi.in.gov """
    turbotlib.log("Starting run...")  # Optional debug logging
    source_url = 'http://extranet.dfi.in.gov/dfidb/mortgage.aspx'
    r = requests.get(source_url)

    etree = lxml.html.fromstring(r.content)
    column_names = get_column_names(etree)
    assert len(column_names) == 9, 'Number of columns has changed on site'

    sample_date = datetime.datetime.now().isoformat()

    for column_data in yield_row_data(etree):
        collected_data = dict(zip(column_names, column_data))
        collected_data['source_url'] = source_url
        collected_data['sample_date'] = sample_date 
        print json.dumps(collected_data)

    turbotlib.log("Run finished")
Example #10
def run_scraper():
    "Initialize and run"

    links = CompanyLinks(
        "%s%s%s" %(
            "https://www.og.decc.gov.uk/",
            "eng/fox/decc/PED301X/",
            "companyBlocksNav"
        )
    )
    for i, j in enumerate(links.get_clean_links()):
        info = CompanyInfo(j)
        try:
            turbotlib.log("progress: %s" % i)
            for record in info.scrape():
                print json.dumps(record)
        except AttributeError as e:
            ## Uncomment the following to see empty entries
            ## that cause failures
            #print "Fails at %s as %s" %(i, e)
            pass
Example #11
def retrieve(url, method, data, attempt=1):
    response = None
    connection_exception = False
    headers = {
        "X-MicrosoftAjax": "Delta=true",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Accept": "*/*",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache"
    }

    try:
        req = requests.Request(method, url, data=data, headers=headers)
        prepared = req.prepare()
        response = session.send(prepared)

    except requests.exceptions.RequestException:
        connection_exception = True

    if (connection_exception
            or response.status_code != requests.codes.ok) and attempt <= 5:
        turbotlib.log(
            "There was a failure reaching or understanding the host, waiting and retrying..."
        )

        if response is not None and response.text is not None:
            turbotlib.log("Failure was: " + response.text)

        time.sleep(attempt * 5)
        return retrieve(url, method, data, attempt + 1)

    return response
Example #12
def parse_table(url):
    turbotlib.log("Parse " + url)

    # Load the page
    doc = BeautifulSoup(requests.get(HOST + url).content)
    
    # Find the right table for the data and the one for the links
    target_main = None
    for main in doc.find_all('td', class_='maincontent'):
        if len(main.find_all('table')):
            target_main = main
    target_tables = target_main.find_all('table')
    target_table = target_tables[0]
    links_table = target_tables[1]
    
    # Parse the data, skip header and footer
    for tr in target_table.find_all('tr')[1:-3]:
        tds = tr.find_all('td')
        record = {
            'Reg_and_C_of_R_Number': clean_up(tds[0].text),
            'Date_of_Issue_of_C_of_R': clean_up(tds[1].text),
            'Name_of_Owner_and_Address' : clean_up(tds[2].text),
            'type_and_serial_number' : clean_up(tds[2].text),
            'year_of_manufacture' : clean_up(tds[3].text),
            'all_up_mass_LBS_KGSS' : clean_up(tds[4].text),
            'category' : clean_up(tds[5].text),
            'engine_type' : clean_up(tds[6].text),
            'certificate_expiry_date' : clean_up(tds[7].text),
            'sample_date': datetime.datetime.now().isoformat(),
            'source_url': TARGET      
        }
        print (json.dumps(record))

    # Find links
    for td in links_table.find_all('td'):
        if clean_up(td.text) == 'Next':
            next_page = td.find('a', href=True)['href']
            parse_table(next_page)
Example #13
def parse_table(url):
    turbotlib.log("Parse " + url)

    # Load the page
    doc = BeautifulSoup(requests.get(HOST + url).content)

    # Find the right table for the data and the one for the links
    target_main = None
    for main in doc.find_all('td', class_='maincontent'):
        if len(main.find_all('table')):
            target_main = main
    target_tables = target_main.find_all('table')
    target_table = target_tables[0]
    links_table = target_tables[1]

    # Parse the data, skip header and footer
    for tr in target_table.find_all('tr')[1:-3]:
        tds = tr.find_all('td')
        record = {
            'Reg_and_C_of_R_Number': clean_up(tds[0].text),
            'Date_of_Issue_of_C_of_R': clean_up(tds[1].text),
            'Name_of_Owner_and_Address': clean_up(tds[2].text),
            'type_and_serial_number': clean_up(tds[2].text),
            'year_of_manufacture': clean_up(tds[3].text),
            'all_up_mass_LBS_KGSS': clean_up(tds[4].text),
            'category': clean_up(tds[5].text),
            'engine_type': clean_up(tds[6].text),
            'certificate_expiry_date': clean_up(tds[7].text),
            'sample_date': datetime.datetime.now().isoformat(),
            'source_url': TARGET
        }
        print(json.dumps(record))

    # Find links
    for td in links_table.find_all('td'):
        if clean_up(td.text) == 'Next':
            next_page = td.find('a', href=True)['href']
            parse_table(next_page)
Example #14
def process_pages(url):

    # Attempt to resume if we can
    try:
        page_number = turbotlib.get_var("page")
        record_count = turbotlib.get_var("check_count")
    except KeyError:
        page_number = 1
        record_count = None

    if page_number > 1:
        turbotlib.log("Resuming run from page {0}".format(page_number))

        with open('%s/records.dump' % turbotlib.data_dir(), "r") as dump:
            for record in dump:
                print record

    # iterate over whole or remaining data set
    while record_count is None or (page_number * 100 - 100) < record_count:
        turbotlib.log("Requesting rows %d - %d" % ((page_number * 100 - 100),
                                                   (page_number * 100)))

        # Strange behavior on server: first call returns page 1 results but page must be > 1 to not get null resp
        # However, not a problem and subsequent calls work as expected.
        response_text = process_page(url,
                                     2 if page_number == 1 else page_number)

        # Ensure the number of records hasn't changed during the run
        check_count = get_record_count(response_text)
        turbotlib.save_var("check_count", check_count)
        if record_count is not None and record_count != check_count:
            reset_state()
            raise Exception(
                "The data set changed during parsing, we need a re-run.")
        else:
            record_count = check_count

        if not record_count > 0:
            raise Exception("The data set is empty.")

        page_number += 1
        turbotlib.save_var("page", page_number)

    turbotlib.log("Run finished!")
    reset_state()
Example #15
def process_pages(url):

    # Attempt to resume if we can
    try:
        page_number = turbotlib.get_var("page")
        record_count = turbotlib.get_var("check_count")
    except KeyError:
        page_number = 1
        record_count = None

    if page_number > 1:
        turbotlib.log("Resuming run from page {0}".format(page_number))

        with open('%s/records.dump' % turbotlib.data_dir(), "r") as dump:
            for record in dump:
                print record

    # iterate over whole or remaining data set
    while record_count is None or (page_number * 100 - 100) < record_count:
        turbotlib.log("Requesting rows %d - %d" % ((page_number * 100 - 100), (page_number * 100)))

        # Strange behavior on server: first call returns page 1 results but page must be > 1 to not get null resp
        # However, not a problem and subsequent calls work as expected.
        response_text = process_page(url, 2 if page_number == 1 else page_number)

        # Ensure the number of records hasn't changed during the run
        check_count = get_record_count(response_text)
        turbotlib.save_var("check_count", check_count)
        if record_count is not None and record_count != check_count:
            reset_state()
            raise Exception("The data set changed during parsing, we need a re-run.")
        else:
            record_count = check_count

        if not record_count > 0:
            raise Exception("The data set is empty.")

        page_number += 1
        turbotlib.save_var("page", page_number)

    turbotlib.log("Run finished!")
    reset_state()
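
# Aside (illustration, not part of the original scraper): rows are fetched in blocks
# of 100, so page_number * 100 - 100 is the zero-based offset of the first row on
# that page.
assert [(p * 100 - 100, p * 100) for p in (1, 2, 3)] == [(0, 100), (100, 200), (200, 300)]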
Example #16
	{'type': 'Exchange broker', 'value': 'corredores'}
]

#FUNCTIONS
#retrieve a document at a given URL as parsed html tree
def get_doc(source_url, extra_parameters={}):
	post_value = {"Pagina": "1"} #need to override post values on certain pages to avoid automatic redirection
	post_value.update(extra_parameters)
	response = requests.post(source_url, post_value)
	html = response.content
	doc = BeautifulSoup(html)
	return doc

#get going
sample_date = unicode(date.today())
turbotlib.log("Starting run on " + sample_date) # Optional debug logging

#****

#SECTION TWO: FINANCIAL INSTITUTIONS - COLLATE LIST OF ENTITIES AND THEN PROCESS DETAILS OF EACH ENTITY
turbotlib.log("")
turbotlib.log("**** FINANCIAL INSTITUTIONS ****")
turbotlib.log("")

#scrape list of financial institutions to look at
financial_institutions = [] #list to store the ones we find
for list_url in institution_urls:
	turbotlib.log("Loading list of " + list_url['type_of_institution'] + "s")
	# try:
	financial_institution_list_page = get_doc(list_url['url'])
	financial_institution_list = financial_institution_list_page.find("table", attrs={"class": "Tabla_Borde"})
Example #17
# -*- coding: utf-8 -*-

import json
import datetime
import turbotlib
import requests
import lxml.html

BASE_URL = "http://license.reg.state.ma.us/public/licque.asp?query=business&color=&board="
SEARCH_URL = "http://license.reg.state.ma.us/public/pubLicRange.asp?profession=%s&busname=_&buscity=&querytype=business"
URL_BASE = "http://license.reg.state.ma.us/public/"

turbotlib.log("Starting run...")

# Start a requests session
s = requests.session()


def get_business_types(url):
    """Gets the available business types in the search form.
    Returns a list"""
    response = s.get(url)
    root = lxml.html.fromstring(response.text)
    options = root.xpath("//select[@name='profession']/option")
    return [option.text.strip() for option in options]


def parse_business_licenses(html):
    root = lxml.html.fromstring(html)
    trs = root.xpath("//table[@id='tableresults']/tbody/tr")
    for tr in trs:
Example #18
# -*- coding: utf-8 -*-

import json
import datetime
import turbotlib
import requests
import re
from bs4 import BeautifulSoup

turbotlib.log("Starting run...")  # Optional debug logging

HOST = "http://www.tcaa.go.tz/"
TARGET = "aircraft_register.php"


def clean_up(string):
    return re.sub(' +', ' ', string.replace('\r\n', ' ')).strip()
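
# Quick check (illustration only): clean_up() turns Windows line breaks into spaces,
# collapses repeated spaces and trims the ends.
assert clean_up("  BOEING \r\n 737-200  ") == "BOEING 737-200"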


def parse_table(url):
    turbotlib.log("Parse " + url)

    # Load the page
    doc = BeautifulSoup(requests.get(HOST + url).content)

    # Find the right table for the data and the one for the links
    target_main = None
    for main in doc.find_all('td', class_='maincontent'):
        if len(main.find_all('table')):
            target_main = main
    target_tables = target_main.find_all('table')
Example #19
# -*- coding: utf-8 -*-

import codecs
import datetime
import json
import requests
import turbotlib

from bs4 import BeautifulSoup

turbotlib.log('Starting run...')

def get_soup(url, session=None):
    if not session:
        session = requests.Session()
    turbotlib.log('Getting soup for %s' % url)
    response = session.get(url)
    html = response.content
    return BeautifulSoup(html)

session = requests.Session()
sample_date = str(datetime.date.today())
base_url ='http://www.knf.gov.pl'

# This is the first page in about 20~ paginated tables
# Obscure the url from search engines since code may end up on github
target_url = codecs.decode('uggc://jjj.xas.tbi.cy/cbqzvbgl/svaqVaQrgnvy.npgvba?pglcr=Onaxv+fc%P3%O3%P5%82qmvrypmr&nwnk=gehr&enaqbz=0.20882477751001716&co.fgneg=0', 'rot_13')
while target_url:
    soup = get_soup(target_url, session)
    for tr in soup.find_all('tr')[1:]:
        data = {
Example #20
				td_index += 1

			if (len(item) > 0):
				items.append(item)
	except:
		pass

	return items

#urls to use
base_href = "http://www.cnvmr.ro/asf/registru/"
front_url = base_href + "lista.php?listasect=1&lng=2"

#get going
sample_date = str(date.today())
turbotlib.log("Starting run on " + sample_date) # Optional debug logging

#Step 1: extract list of categories from front page
try:
	categories = [] #store the list as we find them
	front_page = get_doc(front_url)
	category_list = front_page.find("table", id="listaEntitati")
	category_rows = category_list.find_all("tr")

	current_category = None #maintain link to current category

	for row in category_rows:
		td_list = row.find_all("td")

		#deal only with non-empty rows
		if (len(td_list) > 0):
Example #21
 				#record anything with a non-blank value
 				td_text = td.text.strip().replace("\n", " ").replace("\t", "")
 				if (len(td_text) > 0):
 					td_index = td_list.index(td)
 					header = headers[td_index]
 					record[header] = td_text.strip().replace("\n", " ").replace("\t", "")

 			#check we found something
 			if (len(record) > 3):
 				word_records.append(record)
 	
 	return word_records

#get going
sample_date = str(date.today())
turbotlib.log("Starting run on " + sample_date) # Optional debug logging

#first load the excel files
for source in sources:
	turbotlib.log("Loading file " + str(sources.index(source) + 1) + "/" + str(len(sources)))
	source_page = get_doc(source['url'])
	source_links = source_page.find_all("a")
	for link in source_links:
		#find the link on the page which leads to the right file
		if (source['file'] == "excel"):
			if ('.xls' in link['href']):
				#can't use formatting information for more recent file formats in xlrd
				if (('.xlsx' in link['href']) or ('.xlsb' in link['href'])):
					formatting = False
				else:
					formatting = True
Example #22
def parse_entity(entity_id):
	turbotlib.log("Parsing entity " + entity_id)
	try:
		entity_page = get_doc(detail_url + entity_id)

		#create object to store output
		output = {
			'sample_date': sample_date,
			'source_url': detail_url + entity_id,
			'source': "Securities Commission, Malaysia"
		}
		added_info = False

		#now get general info
		name = entity_page.find(id="StdPageLayout1_lblLicenceHolder").text.strip()
		if (len(name) > 0):
			output['name'] = name
			added_info = True

		licence_number = entity_page.find(id="StdPageLayout1_lblLicenceNo").text.strip()
		if (len(licence_number) > 0):
			output['licence_number'] = licence_number
			added_info = True

		regulated_activity_list = entity_page.find(id="StdPageLayout1_lblRegulatedAct")
		regulated_activity = []
		for item in regulated_activity_list.stripped_strings:
			activity = item.replace("&nbsp&nbsp-", "").strip()
			regulated_activity.append(activity)
		if (len(regulated_activity) > 0):
			output['regulated_activities'] = regulated_activity
			added_info = True

		start_date = entity_page.find(id="StdPageLayout1_lblLicenceSince").text.strip()
		if (len(start_date) > 0):
			output['start_date'] = start_date
			added_info = True

		anniversary_date = entity_page.find(id="StdPageLayout1_lblAnniversaryDate").text.strip()
		if (len(anniversary_date) > 0):
			output['anniversary_date'] = anniversary_date
			added_info = True

		status = entity_page.find(id="StdPageLayout1_lblStatus").text.strip()
		if (len(status) > 0):
			output['status'] = status
			added_info = True

		licensed_reps = entity_page.find(id="StdPageLayout1_lblNoOfLicenceRep").text.strip()
		if (len(licensed_reps) > 0):
			output['number_of_licensed_representatives'] = licensed_reps
			added_info = True

		#then go through specific tabs. first up, is licences
		licences = []
		licence_table = entity_page.find(id="tabs-1").table
		licence_rows = licence_table.find_all("tr")
		for tr in licence_rows[1:]:
			td_list = tr.find_all("td")
			licence = {} #make an object then see if we need to add it to the list
			#first off, licence number
			number = td_list[0].text.strip()
			if (len(number) > 0):
				licence['number'] = number
			#second cell: list of activities
			activities = []
			for item in td_list[1].stripped_strings:
				activity_string = item.replace("&nbsp&nbsp-", "").replace(u"•", "").strip()
				if (len(item) > 0):
					activities.append(activity_string)
			if (len(activities) > 0):
				licence['activities'] = activities
			#third cell: anniversary date
			anniversary_date = td_list[2].text.strip()
			if (len(anniversary_date) > 0):
				licence['anniversary_date'] = anniversary_date
			#fourth cell: status
			status = td_list[3].text.strip()
			if (len(status) > 0):
				licence['status'] = status

			#now add to the result
			if (len(licence) > 0):
				licences.append(licence)

		#now append to list
		if (len(licences) > 0):
			output['licences'] = licences
			added_info = True

		#second tab: Associate persons
		associates = []
		associates_table = entity_page.find(id="tabs-3").table
		associates_rows = associates_table.find_all("tr")
		for tr in associates_rows[1:]:
			associate = {}
			td_list = tr.find_all("td")
			#first cell = name, second cell = designation, third cell = sub-designation
			name = td_list[0].text.strip()
			if (len(name) > 0):
				associate['name'] = name
			designation = td_list[1].text.strip()
			if (len(designation) > 0):
				associate['designation'] = designation
			sub_designation = td_list[2].text.strip()
			if (len(sub_designation) > 0):
				associate['subdesignation'] = sub_designation
			if (len(associate) > 0):
				associates.append(associate)
		if (len(associates) > 0):
			output['associate_persons'] = associates
			added_info = True

		#third tab: business address
		address_table = entity_page.find(id="tabs-4").table
		address_rows = address_table.find_all("tr")
		for tr in address_rows[1:]:
			td_list = tr.find_all("td")
			label = td_list[0].text.strip().lower().replace(" ", "_")

			#address is different - need to extract it line-by-line
			if (label == "address"):
				address_lines = []
				for line in td_list[1].stripped_strings:
					address_lines.append(line)
				address = ", ".join(address_lines)
				if (len(address) > 0):
					output['address'] = address
					added_info = True

			else:
				value = td_list[1].text.strip()
				if ((len(value) > 0) and (len(label) > 0)):
					output[label] = value
					added_info = True

		#fourth tab: name changes
		name_change_table = entity_page.find(id="tabs-5").table
		name_change_rows = name_change_table.find_all("tr")
		name_changes = []
		for tr in name_change_rows[1:]:
			td_list = tr.find_all("td")
			if (len(td_list) == 2):
				effective_date = td_list[0].text.strip()
				previous_name = td_list[1].text.strip()
				if ((len(previous_name) > 0) and (len(effective_date) > 0)):
					name_change = {
						'previous_name': previous_name,
						'effective_date': effective_date
					}
					name_changes.append(name_change)
		if (len(name_changes) > 0):
			output['previous_names'] = name_changes
			added_info = True

		#fifth tab: licensed reps
		reps_table = entity_page.find(id="tabs-6").table
		reps_rows = reps_table.find_all("tr")
		reps = []
		for tr in reps_rows[1:]:
			rep = {}
			td_list = tr.find_all("td")
			name = td_list[0].text.strip()
			if (len(name) > 0):
				rep['name'] = name
			licence_number = td_list[1].text.strip()
			if (len(licence_number) > 0):
				rep['licence_number'] = licence_number
			#extra info in hyperlink for name
			if (td_list[0].a != None):
				href = td_list[0].a['href']
				licence_id_start = href.find("=")
				licence_id_end = href.find("&")
				licence_id = href[licence_id_start + 1: licence_id_end]
				if (len(licence_id) > 0):
					rep['licence_id'] = licence_id
					rep['detail_url'] = "http://ers.seccom.com.my/public/LicenceGeneralInfo.aspx?LicenceID=" + licence_id
			if (len(rep) > 0):
				reps.append(rep)
		if (len(reps) > 0):
			output['licensed_representatives'] = reps
			added_info = True

		#got to the end, save the results
		if (added_info):
			print(json.dumps(output))

	except:
		pass
Example #23
def openPage(browser, visit, controls, identifier, parse):
	browser.form.set_all_readonly(False)
	for cname in controls:
		c = controls[cname]
		if c is None:
			for control in browser.form.controls:
				if control.name == cname:
					browser.form.controls.remove(control)
	for cname in controls:
		c = controls[cname]
		if c is not None: 
			exists = True
			try:
				find_ctrl = browser.form.find_control(cname)				
			except mechanize._form.ControlNotFoundError:
				exists = False
			if exists is True and cname.find('bnConnectionTemplate:r1:0:s11:selectedStatuses') == -1:
				#find_ctrl.readonly = False
				try:
					#browser.form[cname] = c
					find_ctrl.value = c
				except Exception as e:
					#print e
					turbotlib.log("did not find form input " + cname)
			else:
				if cname.find('bnConnectionTemplate:r1:0:s11:selectedStatuses') != -1:
					browser.form.new_control('hidden','bnConnectionTemplate:r1:0:s11:selectedStatuses', {'value': c, 'checked': True})
				else:
					browser.form.new_control('hidden',cname, {'value': c})
	browser.form.fixup()
	turbotlib.log("Starting Request...")
	try:
		if visit is True:
			response = browser.open(browser.form.click(), timeout = 60)
			#print urllib.unquote(browser.request.get_data()).decode('utf8')
		else:
			response = browser.open_novisit(browser.form.click(), timeout = 60)
		response_content = response.read()
		#print response_content
	except:
		turbotlib.log("Bad Request. Starting over " + bankType + " Category")
		return False
	response.close()
	turbotlib.log("Response Received...")
	if parse is True:
		html = unwrap(response_content, identifier)
		if html is not None:
			result = parseResult(html)
			return result
		else:
			#print response_content
			return None
	return None
Example #24
# -*- coding: utf-8 -*-

import turbotlib

from scrape_banks import scrape_banks
from scrape_foreign import scrape_foreign
from scrape_imf import scrape_imf
from scrape_revoked import scrape_revoked


turbotlib.log("Starting run")

turbotlib.log("\nScraping banks :")
scrape_banks()

turbotlib.log("\nScraping IMFs :")
scrape_imf()

turbotlib.log("\nScraping foreign banks :")
scrape_foreign()

turbotlib.log("\nScraping revoked banks :")
scrape_revoked()

Example #25
				
				#make an object and add to the list
				result = {
					'name': item_name,
					'idx': item_idx
				}
				results.append(result)
			except:
				continue
	return results



#get going
sample_date = str(date.today())
turbotlib.log("Starting run on " + sample_date) # Optional debug logging

#Base URLs we will want to use
category_url = "http://www.fi.se/Folder-EN/Startpage/Register/Company-register/Company-register-Company-per-category/?typ='" #need to finish with category and "'"
company_url = "http://www.fi.se/Folder-EN/Startpage/Register/Company-register/Company-register-Details/?idx=" #need to finish with idx number
company_se_url = "http://www.fi.se/Register/Foretagsregistret/Foretagsregistret-Detaljerad-information/?idx=" #need to finish with idx number
overseas_permissions_url = "http://www.fi.se/Register/Foretagsregistret/Foretagsregistret-Gransoverskridande-handel/?idx=" #need to finish with idx number

#keep track of progress
count = 1

#These are the categories of institution - use these to populate category urls and find the relevant entries
categories = [
	"BANK++", #Banking companies (limited liability company)
	"FILB++", #Foreign branches of Swedish chartered banks
	"MBANK+", #Members-bank
Example #26
	{'url': "http://www.garfin.org/agents.html", 'category': "Insurance agent"},
	{'url': "http://www.garfin.org/salespersons.html", 'category': "Insurance salesperson"},
	{'url': "http://www.garfin.org/moneyservices.html", 'category': "Money services"}
]

#FUNCTIONS
#retrieve a document at a given URL as parsed html tree
def get_doc(source_url):
	response = requests.get(source_url)
	html = response.content
	doc = BeautifulSoup(html)
	return doc

#get going
sample_date = str(date.today())
turbotlib.log("Starting run on " + sample_date) # Optional debug logging

#go through the pages one by one
for source in source_urls:
	#monitor progress
	count = source_urls.index(source) + 1
	turbotlib.log("Parsing category " + str(count) + "/" + str(len(source_urls)))

	#load page
	source_page = get_doc(source['url'])
	entities = source_page.find(attrs={"class": "deptContent"}).ol.find_all("li")
	
	#now go through the names one by one
	for entity in entities:
		#make object to store result temporarily
		output = {
Example #27
	{'url': "http://www.nbs.rs/static/nbs_site/gen/english/60/60b4_en.htm", 'category': "Agency or outlet whose operating licence has been revoked", 'basehref': None},
	{'url': "http://www.nbs.rs/static/nbs_site/gen/english/60/tagencije.htm", 'category': "Legal entity in charge of insurance agency and brokerage pursuant to special law", 'basehref': None}
]

#translation
countries = {
	'MOSKVA': "Russia",
	'PODGORICA': "Montenegro",
	'SOUTH DAKOTA': "South Dakota",
	'FRANKFURT/MAIN': "Germany",
	'SKOPJE': "Republic of Macedonia"
}

#get going
sample_date = str(date.today())
turbotlib.log("Starting run on " + sample_date) # Optional debug logging

#Step 1: load page for each category to identify who needs looking at
for entity_list in entity_lists:
	#monitor progress
	list_count = entity_lists.index(entity_list) + 1
	turbotlib.log("Starting category " + str(list_count) + "/" + str(len(entity_lists)))

	#load page
	list_page = get_doc(entity_list['url'])
	list_table = list_page.table
	
	#first off, if this category doesn't have links on its front page, there's nothing more to do
	if (entity_list['basehref'] == None):
		#deal with one weird template first
		if (entity_list['category'] == "Legal entity in charge of insurance agency and brokerage pursuant to special law"):
Example #28
    return name.strip()


def parse_governors(cell):
    governors = dict()
    for p in cell("p"):
        title_and_name = detag(p).split(':', 1)
        print(title_and_name)
        governors[title_and_name[0].strip()] = title_and_name[1].strip()
    return governors
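
# Illustration (hypothetical cell text, not from the page): each <p> holds a
# "Title: Name" pair, which split(':', 1) plus the strip() calls above turn into
# a dictionary entry.
assert "Chairman: John Smith".split(':', 1) == ['Chairman', ' John Smith']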


# The printer-friendly version of the page is much easier to parse
source_url = "http://nbt.tj/en/banking_system/credit_org.php?print=Y"
sample_date = str(date.today())
turbotlib.log("Starting scrape...")  # Optional debug logging
response = requests.get(source_url)
html = response.content
doc = BeautifulSoup(html, "lxml")
tables = [table for table in doc.table.table("table")]
rows = [tr for table in tables for tr in table("tr")][1:]  # skip the header

institution_type = "Bank"
for row in rows:
    cells = row("td")

    if len(cells) == 2:
        institution_type = cells[1].find(BOLD_RE).string
        continue

    # cells[0] just contains a line number, skipping
Example #29
# -*- coding: utf-8 -*-

import codecs
import datetime
import json
import re
import requests
import turbotlib


turbotlib.log('Starting run...')

# Obscure the url from search engines since code may end up on github
source_url = codecs.decode('uggc://jjj.nre.pn/qngn/pbajryy/PbaJryy.gkg', 'rot_13')
response = requests.get(source_url, timeout=20)

# This is a plain text file with no delimiters; luckily it has a semi-header
# row made of runs of equals signs, one per column, which marks where each column starts
column_starts = []
column_names = (
    'Well Location',
    'Licence Number',
    'Licensee Code and Name',
    'Confidential Type',
    'Conf. Below Frmtn',
    'Conf. Release Date',
)

if response.status_code == 200:
    turbotlib.log('Valid response received...')
    for line in response.iter_lines():
Example #30
# -*- coding: utf-8 -*-

import json
import datetime
import re
import turbotlib

from bs4 import BeautifulSoup
import requests

turbotlib.log("Starting run...")


class Entry(object):
    def __init__(self):
        self.sample_date = str(datetime.date.today())
        self.source_url = ''
        self.name = ''
        self.tel = ''
        self.fax = ''
        self.telex = ''
        self.swift = ''
        self.box = ''
        self.reuters = ''
        self.url = ''
        self.ceo_tel = ''
        self.ceo_fax = ''
        self.country = ''


patterns = (
Example #31
			if (len(rep) > 0):
				reps.append(rep)
		if (len(reps) > 0):
			output['licensed_representatives'] = reps
			added_info = True

		#got to the end, save the results
		if (added_info):
			print(json.dumps(output))

	except:
		pass

#get going
sample_date = str(date.today())
turbotlib.log("Starting run on " + sample_date) # Optional debug logging

#Step 1: make a list of letter pairs to iterate through, and work out which entities are in the database
entities = [] #store results so we don't do them twice
letter_pairs = [] #store what we're iterating over
for first_letter in string.lowercase:
	for second_letter in string.lowercase:
		letter_pair = first_letter + second_letter
		turbotlib.log("Now searching for pair '" + letter_pair + "'")
		try:
			search_page = get_doc(search_url + letter_pair)
			rows = search_page.find_all(attrs={"class": "stdrow"}) + search_page.find_all(attrs={"class": "stdrow-1"})

			for row in rows:
				td_list = row.find_all("td")
				for td in td_list:
Example #32
def parse_page(layout, config=None):
    xset, yset = set(), set()
    tlines = []
    objstack = list(reversed(layout._objs))
    while objstack:
        b = objstack.pop()
        if type(b) in [LTFigure, LTTextBox, LTTextLine, LTTextBoxHorizontal]:
            objstack.extend(reversed(
                b._objs))  # put contents of aggregate object into stack
        elif type(b) == LTTextLineHorizontal:
            tlines.append(b)
        elif type(b) in [LTLine]:
            if b.x0 == b.x1:
                xset.add(b.x0)
            elif b.y0 == b.y1:
                yset.add(b.y0)
            else:
                print "sloped line", b
        elif type(b) in [LTRect]:
            if b.x1 - b.x0 < 2.0:
                xset.add(b.y0)
            else:
                yset.add(b.x0)
        elif type(b) == LTImage:
            continue
        else:
            turbotlib.log('Unrecognized type: %s' % type(b))

    xlist = sorted(list(xset))
    ylist = sorted(list(yset))

    # initialize the output array of table text boxes
    boxes = [[[] for xl in xlist] for yl in ylist]

    for lt in tlines:
        y = (lt.y0 + lt.y1) / 2
        iy = Wposition(ylist, y)
        previx = None
        for lct in lt:
            if type(lct) == LTAnno:
                continue  # a junk element in LTTextLineHorizontal
            x = (lct.x0 + lct.x1) / 2
            ix = Wposition(xlist, x)
            if previx != ix:
                boxes[iy][ix].append([])  # begin new chain of characters
                previx = ix
            boxes[iy][ix][-1].append(lct.get_text())
    for iy in range(len(ylist)):
        for ix in range(len(xlist)):
            boxes[iy][ix] = ["".join(s) for s in boxes[iy][ix]]

    if 'remove' in config:
        del boxes[config['remove']:]

    headers = ["".join(lh.strip() for lh in h).strip() for h in boxes.pop()]
    try:
        assert headers == config['headers']
    except AssertionError:
        turbotlib.log('Headers: %s' % headers)
        turbotlib.log('Headers (config): %s' % config['headers'])

    # merge entries where needed
    if config['merge']:
        name_column_index = headers.index(config['name_column_name'])
        unique_column_index = headers.index(config['unique_column_name'])
        for i, entry in enumerate(boxes):
            if headers[name_column_index + 1] == '' and entry[name_column_index + 1]:
                boxes[i][name_column_index][1:1] = boxes[i][name_column_index + 1]

        for i, entry in enumerate(boxes):
            if (len(entry[unique_column_index]) == 0
                    or entry[unique_column_index][0].strip() == '') and boxes[i + 1]:
                # if headers[name_column_index+1] == '' and boxes[i+1][name_column_index+1]:
                #      boxes[i+1][name_column_index].extend(boxes[i+1][name_column_index+1])
                if len(entry[name_column_index]) > 0 and entry[name_column_index][0] != config['total_title']:
                    boxes[i + 1][name_column_index].extend(entry[name_column_index])
                for idx in config['merge_indexes']:
                    if len(entry[idx]) > 0:
                        boxes[i + 1][idx].extend(entry[idx])

    box_list = []
    for row in boxes:
        if (row[0] != ''):
            box_list.append(dict(zip(headers, ["".join(s) for s in row])))

    return box_list
Example #33
	turbotlib.log("Response Received...")
	if parse is True:
		html = unwrap(response_content, identifier)
		if html is not None:
			result = parseResult(html)
			return result
		else:
			#print response_content
			return None
	return None


###########################


turbotlib.log("Starting run...") # Optional debug logging

country = "Australia"


# First, get the list of categories from the form search dropdown
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = { 
'User-Agent' : user_agent,
'Accept' : "*/*"
}
start_url = "https://connectonline.asic.gov.au/RegistrySearch/faces/landing/ProfessionalRegisters.jspx"
url_domain = 'https://connectonline.asic.gov.au'

browser = mechanize.Browser()
browser.set_handle_robots(False)   # ignore robots
Example #34
# -*- coding: utf-8 -*-

import json
import datetime
import turbotlib

turbotlib.log("Starting run...") # Optional debug logging

for n in range(0,20):
    data = {"number": n,
            "message": "Hello %s" % n,
            "sample_date": datetime.datetime.now().isoformat(),
            "source_url": "http://somewhere.com/%s" % n}
    # The Turbot specification simply requires us to output lines of JSON
    print json.dumps(data)
Example #35
def parse_excel(excel_content, formatting, source_url, category):
	#load in the document
	document = xlrd.open_workbook(file_contents=excel_content, formatting_info=formatting)
	excel_records = []

	for sheet_num in xrange(0, document.nsheets):
		sheet = document.sheet_by_index(sheet_num)
		turbotlib.log("Processing sheet " + str(sheet_num + 1) + "/" + str(document.nsheets))

		# skip sheet if top-left is blank
		check_cells = []
		for row in xrange(0, min(10, sheet.nrows)):
			for col in xrange(0, min(10, sheet.ncols)):
				check_cell = unicode(sheet.cell_value(row, col)).strip()
				if (len(check_cell) > 0):
					check_cells.append(check_cell)
		if (len(check_cells) == 0):
			continue

		#find the start of the headers - where there is something in column A and column B
		header_start_row = -1
		for row in xrange(0,sheet.nrows):
			if (len(sheet.cell_value(row, 0)) > 0):
				if (len(sheet.cell_value(row, 1)) > 0):
					header_start_row = row
					break

		#try to find out the end of the headers using formatting information. however, this won't work for xlsx or xlsb files due to xlrd's limitations
		header_end_row = -1
		if (formatting):
			#find the end of the headers - first attempt is where the background colour changes
			header_xf_index = sheet.cell_xf_index(header_start_row, 0)
			header_xf = document.xf_list[header_xf_index]
			header_background = header_xf.background.background_colour_index
			#go through all subsequent rows until we find one with a different background colour
			for row in xrange(header_start_row, sheet.nrows):
				row_xf_index = sheet.cell_xf_index(row, 0)
				row_xf = document.xf_list[row_xf_index]
				row_background = row_xf.background.background_colour_index
				if (row_background != header_background):
					header_end_row = row - 1
					break
			
			#second attempt - if that didn't work, find out where it changed from bold to unbold
			if (header_end_row == -1):
				for row in xrange(header_start_row, sheet.nrows):
					row_xf_index = sheet.cell_xf_index(row, 0)
					row_xf = document.xf_list[row_xf_index]
					row_font_index = row_xf.font_index
					row_font = document.font_list[row_font_index]
					if (row_font.weight == 400): #standard weight is 400 for normal, 700 for bold
						header_end_row = row - 1
						break
		
		#otherwise we have to infer it from other clues - the first use of the number '1' in column A
		else:
			for row in xrange(header_start_row, sheet.nrows):
				if (sheet.cell_value(row, 0) == 1):
					header_end_row = row - 1

		#now work out how many columns we have in the final headers row - look for column with no values in the header rows
		header_end_col = -1
		for col in xrange(0, sheet.ncols):
			#check if all header rows in this column are blank (which they will be for merged cells, except for the first column)
			cell_contents = []
			for row in xrange(header_start_row, header_end_row + 1):
				cell_content = unicode(sheet.cell_value(row, col)).strip()
				#if we have formatting information and a blank cell, check for merged cells
				if (formatting and (len(cell_content) == 0)):
					for merge_range in sheet.merged_cells:
						r_low, r_high, c_low, c_high = merge_range
						#check if our cell is in the middle of a merged row
						if ((row >= r_low) and (row <= r_high)):
							if ((col >= c_low) and (col <= c_high)):
								cell_content = unicode(sheet.cell_value(r_low, c_low)).strip()
				#add our result to the list
				cell_contents.append(cell_content)

			#check if all are empty/blank
			empty_count = 0
			for cell in cell_contents:
				if (len(cell) == 0):
					empty_count += 1
			if (empty_count == len(cell_contents)):
				header_end_col = col - 1
				break	

		#fallback - if didn't find an end, then use ncols
		if (header_end_col == -1):
			header_end_col = sheet.ncols - 1

		#now we know where the headers are - time to find the end of the data
		data_start_row = header_end_row + 1 #starts one after the headers, unsurprisingly
		data_end_row = -1
		for row in xrange(data_start_row, sheet.nrows):
			#go through all columns - first row where they're all blank means you've reached the end
			row_contents = []
			for col in xrange(0, header_end_col):
				cell_content = unicode(sheet.cell_value(row, col)).strip()
				row_contents.append(cell_content)
			empty_count = 0
			for cell in row_contents:
				if (len(cell) == 0):
					empty_count += 1
			if (empty_count == len(row_contents)):
				data_end_row = row - 1
				break

		if (data_end_row == -1):
			data_end_row = sheet.nrows - 1

		#extract the headers - taking account of merged cells
		headers = []
		for col in xrange(0, header_end_col):
			#combine all headers in a column into one string
			header_cells = []
			for row in xrange(header_start_row, header_end_row + 1):
				check_col = col
				header_string = ""
				while ((len(header_string) == 0) and (check_col > 0)):
					header_string = unicode(sheet.cell_value(row, check_col)).strip()
					check_col -= 1 #go back a column to get the value of a merged cell
				if (len(header_string) > 0):
					header_cells.append(header_string)

			#cope with merged cells at the end - just take the main value
			if ((len(header_cells) > 0) and (header_cells[0] == header_cells[-1])):
				header = header_cells[0]
			else:
				if (category == "Banking operation"):
					end = -1
				else:
					end = len(header_cells)
				header = " - ".join(header_cells[:end])
			headers.append(header)

		#now get the data
		for row in xrange(data_start_row, data_end_row + 1):
			#one record per row - with metadata
			record = {
				'sample_date': sample_date,
				'source_url': source_url,
				'source_sheet': sheet.name,
				'category': category
			}
			#load in value for each column
			for col in xrange(0, header_end_col):
				label = headers[col].replace("\n", " ").replace("\t", "").replace("  ", "")
				if (len(label) == 0):
					label = "id"
				if (label == "Name"):
					label = "name"
				value = unicode(sheet.cell_value(row, col)).strip().replace("\n", " ").replace("\t", "").replace("  ", "")
				if (len(value) > 0):
					record[label] = value
			excel_records.append(record)

	#spit it out at the end
	return excel_records
Example #36
	{'url': "http://asfro.ro/em/ra/registru_en.php?reg=as", 'country': "Romania", 'category': "Section A - Insurance undertakings"},
	{'url': "http://asfro.ro/em/ra/registru_en.php?reg=ab", 'country': "Romania", 'category': "Section A - Intermediaries"},
	{'url': "http://asfro.ro/em/ra/registru_en.php?reg=bs", 'country': "Romania", 'category': "Section B - Insurance undertakings"},
	{'url': "http://asfro.ro/em/ra/registru_en.php?reg=bb", 'country': "Romania", 'category': "Section B - Intermediaries"},
	{'url': "http://asfro.ro/em/cs/cautare_en.php?tip=s&mod=d", 'country': "Overseas", 'category': "Insurance undertakings and intermediaries from EEA"}
]

#different pages needed depending on registry used
detail_urls = {
	'Romania': "http://asfro.ro/em/ra/detalii_en.php?cod=",
	'Overseas': "http://asfro.ro/em/cs/detalii_en.php?cod="
}

#get going
sample_date = str(date.today())
turbotlib.log("Starting run on " + sample_date) # Optional debug logging

#Step 1: load index page and see how many pages there are in this category
for category in entity_lists:
	try:
		turbotlib.log("Parsing category " + category['category'])
		category_page = get_doc(category['url'])
		
		#identify the relevant links and then see which has the highest number
		link_list = category_page.find_all("a")
		highest_link = 1
		for link in link_list:
			if (link['href'][0] == "?"): #these ones start with a question mark
				link_text = link.string.strip()
				if (link_text.isnumeric()):
					page_number = int(link_text)
Example #37
    turbotlib.log('There are %s links to check' % len(day_links))
    return day_links


def get_soup(url, session=None):
    turbotlib.log('Fetching %s' % url)
    if not session:
        session = requests.Session()
    response = session.get(url)
    html = response.content
    return BeautifulSoup(html)


try:
    last_seen_ref = turbotlib.get_var('last_seen_ref')
    turbotlib.log('last_seen_ref: ' + last_seen_ref)
except KeyError:
    turbotlib.log('Unknown last_seen_ref, start from the beginning')
    last_seen_ref = '00000000'


source_url = codecs.decode('uggc://jjj.nre.pn/qngn-naq-choyvpngvbaf/npgvivgl-naq-qngn/fg97', 'rot_13')
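
# Aside (not part of the original scraper): ROT13 is its own inverse, so the same
# codec both obscures a plain URL and recovers it again.
assert codecs.decode('uggc://rknzcyr.pbz', 'rot_13') == 'http://example.com'
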
session = requests.Session()
source_soup = get_soup(source_url, session)
daily_links = find_daily_links(source_soup, last_seen_ref)

for date_ref, url in daily_links:
    data = {
        'sample_date': str(datetime.date.today()),
        'source_url': url,
        'date': datetime.datetime.strptime(date_ref, '%Y%m%d').strftime('%Y-%m-%d')  # this is the date the data was released not when it was scraped
Example #38
# converts date to yyyy-mm-dd or returns original string
def parse_date(value):
    if "/" in value:
        value_parts = value.split("/")
        if len(value_parts) == 3:
            return value_parts[2] + "-" + value_parts[0].zfill(2) + "-" + value_parts[1].zfill(2)
        else:
            return value
    else:
        return value
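
# Quick check (illustration only): a US-style m/d/yyyy value is zero-padded and
# reordered to yyyy-mm-dd, while anything else is returned unchanged.
assert parse_date("1/5/2015") == "2015-01-05"
assert parse_date("5 January 2015") == "5 January 2015"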


# get going
sample_date = str(date.today())
turbotlib.log("Starting run on " + sample_date)  # Optional debug logging

# load the main details from the csv file
csv_doc = requests.get(csv_url).content.splitlines()
reader = csv.DictReader(csv_doc)
for row in reader:
    # create an object to store the details
    output = {"source_url": csv_url, "sample_date": sample_date, "source": "Central Bank of Nigeria"}

    # add new data to the object
    for field, value in row.items():
        if field is not None:
            if len(str(value)) > 0:
                if value != "1/1/1900":
                    # convert key to standard jsonish type
                    key_name = str(field).lower().replace(" ", "_")
Example #39
        turbotlib.save_var("page", page_number)

    turbotlib.log("Run finished!")
    reset_state()


def dict_factory(cursor, row):
    d = {}
    for idx, col in enumerate(cursor.description):
        d[col[0]] = row[idx]
    return d
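
# Illustration (hypothetical in-memory database, not part of the scraper below): once
# installed as row_factory, query results come back as dicts keyed by column name.
_demo = sqlite3.connect(':memory:')
_demo.row_factory = dict_factory
assert _demo.execute('SELECT 1 AS one').fetchone() == {'one': 1}
_demo.close()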


# ----------------------------------------------------------------------------------------------------------------------

turbotlib.log("Starting run...")

# create individuals cache
usersDB = sqlite3.connect('%s/individuals.db' % turbotlib.data_dir())
usersDB.row_factory = dict_factory
usersDB.execute(
    "CREATE TABLE IF NOT EXISTS individuals(jurisdiction, name, firm, terms, contact, categories)"
)
usersDB.commit()

turbotlib.log("Getting initial view state...")
init_req = retrieve(url_start, "GET", "")
document = BeautifulSoup(init_req.text)

last_view_state = urllib.quote(document.find(id='__VIEWSTATE')['value'])
last_validation = urllib.quote(document.find(id='__EVENTVALIDATION')['value'])
Example #40
def get_registered_individuals(url, control_href, view_state,
                               firm_jurisdiction, firm_name):
    return_array = []
    turbotlib.log("Retrieving individuals for current or historical firm: " +
                  firm_name + " in: " + firm_jurisdiction)

    control_id = urllib.quote(
        control_href.replace("javascript:__doPostBack('",
                             '').replace("','')", ''))
    individuals_page_req = retrieve(
        url, "POST", generate_body_control(control_id, view_state))

    if "Your search returned no records, please try searching again" in individuals_page_req.text:
        return []

    num_individuals = get_record_count(individuals_page_req.text)
    processed_individuals = 0
    last_processed_individuals = 0
    ind_page = 1

    while True:
        individuals_view_state = {
            'view': urllib.quote(get_asp_resp_var(individuals_page_req.text, "__VIEWSTATE")),
            'validation': urllib.quote(get_asp_resp_var(individuals_page_req.text, "__EVENTVALIDATION")),
            'generator': urllib.quote(get_asp_resp_var(individuals_page_req.text, "__VIEWSTATEGENERATOR"))
        }

        individual_links = BeautifulSoup(
            individuals_page_req.text).select('tr > td > a')
        for link in individual_links:
            try:
                if "lbtnIndDetail" not in link['href']:
                    continue
            except:
                continue

            processed_individuals += 1

            name = link.text.strip()
            individual_dict = get_individual(name, firm_jurisdiction,
                                             firm_name)

            if individual_dict is None:
                get_and_store_individuals_for_firm(link, url,
                                                   individuals_view_state,
                                                   name)

                individual_dict = get_individual(name, firm_jurisdiction,
                                                 firm_name)
                if individual_dict is not None:
                    return_array.append(individual_dict)

            else:
                return_array.append(individual_dict)

        if processed_individuals < num_individuals:
            if last_processed_individuals == processed_individuals:
                turbotlib.log(
                    'Warning: broke out of possible infinite loop trying to retrieve all individuals for firm.'
                )
                break

            ind_page += 1
            control_id = urllib.quote(
                'ctl00$bodyContent$lbtnPager{0}'.format(ind_page))
            individuals_page_req = retrieve(
                url, "POST",
                generate_body_control(control_id, individuals_view_state))

            last_processed_individuals = processed_individuals
        else:
            break

    return return_array
Example #41
import datetime
import turbotlib
import requests
import urlparse
import re
from BeautifulSoup import BeautifulSoup

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter, PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure, LTImage, LTTextLineHorizontal, LTTextBoxHorizontal, LTChar, LTRect, LTLine, LTAnno, LTCurve
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO

from pprint import pprint

turbotlib.log("Starting run...")  # Optional debug logging

URL_WITH_PDF_LINKS = 'http://www.ocif.gobierno.pr/concesionariosbusqueda_eng.htm'

# Basic idea of the pdf parser is from https://blog.scraperwiki.com/2012/06/pdf-table-extraction-of-a-table/

config = {
    u'documents/cons/IA.pdf': {
        'enabled':
        False,
        'unique_column_name':
        'LIC.NUM.',
        'name_column_name':
        u'NAME',
        'merge':
        False,
Example #42
	if (len(naughty_object_list) > 0):
		print("")
		print_objects(naughty_object_list)

#print all the unique values for a given field
def print_field(objects, field):
	field_values = []
	for item in objects:
		if (field in item):
			if (item[field] not in field_values):
				field_values.append(item[field])
	for value in field_values:
		print(value)

#START DOING STUFF
turbotlib.log("Starting run on " + sample_date + "...") # Optional debug logging

#SOURCE URLS

#list of links to navigation pages with a list of regulated entities. Stored as a list of these: [url, description]
detailURLs = [
	["http://www.nbs.sk/en/financial-market-supervision/banking-sector-supervision/List-of-credit-institutions/commercial-and-savings-banks", "Commercial and Savings Banks in Slovakia"],
	["http://www.nbs.sk/en/financial-market-supervision/banking-sector-supervision/List-of-credit-institutions/branch-offices-of-foreign-banks/banks", "Branch offices of Foreign Banks in Slovakia"],
	["http://www.nbs.sk/en/financial-market-supervision/banking-sector-supervision/List-of-credit-institutions/branch-offices-of-foreign-banks/credit-cooperatives", "Branch offices of Foreign Credit Cooperatives in Slovakia"],
	["http://www.nbs.sk/en/financial-market-supervision/banking-sector-supervision/List-of-credit-institutions/branch-offices-of-slovak-banks-operating-abroad", "Branch offices of Slovak Banks operating aboard"],
	["http://www.nbs.sk/en/financial-market-supervision/banking-sector-supervision/List-of-credit-institutions/slovak-banks-providers-of-services-on-the-cross-border-basis-abroad", "Slovak banks providing services on the cross-border basis abroad"],
	["http://www.nbs.sk/en/financial-market-supervision/banking-sector-supervision/List-of-credit-institutions/banks-in-special-proceedings", "Banks in special proceedings"]
]

#list of links to pages which contain details of more than one regulated entity. Stored as a list of these [url, description, whether the entity name is in an h4 above the table]
multipleDetailURLs = [