def _generate_entities(): """for each scrapable page, yield an entity""" for url in _get_scrape_urls(): doc = BeautifulSoup(helpers.fetch_string(url), "html.parser") lis = doc.find_all("li", class_="card card--horizontal") for li in lis: name = get_name(li.find("span", class_="card__name").get_text()) try: mydate = get_date( li.find("span", class_="card__date").get_text()) except AttributeError: mydate = '' try: place = li.find("span", class_="card__place").get_text() except AttributeError: place = "" picture_url = urljoin(_site_url, li.find("img")["src"]) fields = [ { "name": "Date", "value": mydate }, { "name": "Place", "value": place }, { "tag": "picture_url", "value": picture_url }, ] try: entity_url = urljoin(_entity_base_url, li.find("a", class_="card__box")["href"]) doc2 = BeautifulSoup(helpers.fetch_string(entity_url), "html.parser") div = doc2.find("div", class_="article__text") ps = div.find_all("p") header = ps[0].get_text().strip() text = ' '.join([p.get_text().strip() for p in ps[1:]]) fields.append({"name": header, "value": text}) except TypeError: entity_url = '' fields.append({"tag": "url", "value": entity_url}) yield { "_meta": { "id": helpers.make_id(name), "entity_type": "person" }, "fields": fields, "name": name }
def _generate_entities(): """for each scrapable page, yield an entity""" for url in _get_scrape_urls(): doc = BeautifulSoup(helpers.fetch_string(url, cache_hours=6), "html.parser") div = doc.find('div', id="print_content") u = div.find("u").get_text().strip() name = u.split(':')[-1] year = u.split(':')[0][-4:] p = div.find_all("p")[1] text = p.get_text().strip() fields = [ {"tag": "url", "value": url}, {"tag": "text", "value": text}, {"tag": "year", "value": year} ] yield { "_meta": { "id": helpers.make_id(name), "entity_type": "company" }, "fields": fields, "name": name }
def _custom_opener(url, linux=False):
    if linux:
        return BeautifulSoup(helpers.fetch_string(url, cache_hours=6))
    else:
        from urllib2 import urlopen
        return BeautifulSoup(urlopen(url).read())
def _generate_entities():
    for url in _get_scrape_urls():
        doc = BeautifulSoup(helpers.fetch_string(url, cache_hours=6))
        # setup entity
        entity = {
            "_meta": {"id": re.sub(".*AppealId=", "", url), "entity_type": "person"},
            "types": ["warning"],
            "fields": []
        }
        # load details into fields
        for dt in doc.find("dl", class_="details").find_all("dt"):
            dd = dt.find_next_sibling("dd")
            if "suspect name" in dt.get_text().lower():
                entity["name"] = " ".join([w for w in dd.get_text().split() if re.match("^[a-zA-Z]+$", w)])
            else:
                entity["fields"].append({"name": dt.get_text().strip(), "value": dd.get_text().strip()})
        # load "full text" section
        for h3 in doc.find("div", class_="summary").find_all("h3"):
            p = h3.find_next_sibling("h3")
            if p is not None and "Full Text" in h3.get_text():
                entity["fields"].append({"name": "Summary", "value": p.get_text()})
        # give back entity
        if entity.get("name", "").strip().lower() not in ["", "unknown"]:
            yield entity
def _generate_entities(): """for each scrapable page, yield an entity""" for url in _get_scrape_urls(): doc = BeautifulSoup(helpers.fetch_string(url, cache_hours=6), "html.parser") div = doc.find('div', id="print_content") u = div.find("u").get_text().strip() name = u.split(':')[-1] year = u.split(':')[0][-4:] p = div.find_all("p")[1] text = p.get_text().strip() fields = [{ "tag": "url", "value": url }, { "tag": "text", "value": text }, { "tag": "year", "value": year }] yield { "_meta": { "id": helpers.make_id(name), "entity_type": "company" }, "fields": fields, "name": name }
def _custom_opener(url, linux=False):
    if linux:
        return BeautifulSoup(helpers.fetch_string(url, cache_hours=6))
    else:
        from urllib2 import urlopen
        return BeautifulSoup(urlopen(url))
def get_tables(url):
    objects = {'objects': []}
    main_page = BeautifulSoup(helpers.fetch_string(url, cache_hours=6))
    # main_page = BeautifulSoup(urlopen(url))
    tables = main_page.find_all('table')

    def __fill_helper(_tag):
        table_object['instance'].append({CUSTOM_TAG: _bs_to_utf(_tag), 'people': []})

    for table in tables:
        table_object = {'instance': []}
        rows = table.find_all('tr')
        for row in rows:
            tds = row.find_all('td')
            len_tds = len(tds)
            if len_tds:
                if len_tds > 1:
                    p_name = _bs_to_utf(tds[1])
                    p_name = re.sub("^\s+", "", p_name.split(".")[-1].strip())
                    person_info = {POSITION: _bs_to_utf(tds[0]), 'person_name': p_name}
                    if tds[0].find('a'):
                        person_info.update({'person_url': _site_ulr + tds[0].find('a')['href']})
                    if CUSTOM_TAG in table_object['instance'][-1]:
                        table_object['instance'][-1]['people'].append(person_info)
                    else:
                        __fill_helper(tds[0])
                else:
                    __fill_helper(row)
        objects['objects'].append(table_object)
    return objects
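# Hedged sketch (assumption): _bs_to_utf() is called throughout the table
# scrapers in this file but is not included in these excerpts. Judging by its
# call sites (it receives a BeautifulSoup tag and its result is stored as a
# plain field value or split like a string), a minimal Python 2 version could
# look like this; the real helper may differ.
def _bs_to_utf(tag):
    # extract the tag's visible text and return it as a UTF-8 byte string
    return tag.get_text().strip().encode('utf-8')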
def _generate_entities(): """for each scrapable page, yield an entity""" for url in _get_scrape_urls(): doc = BeautifulSoup(helpers.fetch_string(url), "html.parser") lis = doc.find_all("li", class_="card card--horizontal") for li in lis: name = get_name(li.find("span", class_="card__name").get_text()) try: mydate = get_date(li.find("span", class_="card__date").get_text()) except AttributeError: mydate = '' try: place = li.find("span", class_="card__place").get_text() except AttributeError: place = "" picture_url = urljoin(_site_url, li.find("img")["src"]) fields = [ {"name": "Date", "value": mydate}, {"name": "Place", "value": place}, {"tag": "picture_url", "value": picture_url}, ] try: entity_url = urljoin(_entity_base_url, li.find("a", class_="card__box")["href"]) doc2 = BeautifulSoup(helpers.fetch_string(entity_url), "html.parser") div = doc2.find("div", class_="article__text") ps = div.find_all("p") header = ps[0].get_text().strip() text = ' '.join([p.get_text().strip() for p in ps[1:]]) fields.append({"name": header, "value": text}) except TypeError: entity_url = '' fields.append({"tag": "url", "value": entity_url}) yield { "_meta": { "id": helpers.make_id(name), "entity_type": "person" }, "fields": fields, "name": name }
def _generate_entities(): """for each scrapable page, yield an entity""" for url in _get_scrape_urls(): doc = BeautifulSoup(helpers.fetch_string(url), "html.parser") table = doc.find('table', class_='views-table').find('tbody') trs = table.find_all('tr') for tr in trs: td = tr.find_all('td') href = td[0].find_all('a')[1]['href'] name = td[1].get_text().strip() matter_type = td[2].get_text().strip() matter_type = " ".join([word.capitalize() for word in matter_type.split()]) date_failed = td[3].get_text().strip() date_failed = "{}-{}-{}".format(date_failed[:4], date_failed[4:6], date_failed[6:]) fields = [{"name": "Matter Type", "value": matter_type}, {"name": "Docket Number", "value": href}, {"name": "Date Failed", "value": date_failed}] names = _get_name(name): if len(names) > 1: name = names[0] aka = [] for aka_name in names: aka.append({'name': aka_name}) else: name = names[0] my_id = helpers.make_id(new_name) if len(my_id) > 99: my_id = my_id[:99] if any(word in name for word in company): entity_type = "company" else: entity_type = "person" if aka: yield { "_meta": { "id": my_id, "entity_type": entity_type }, "fields": fields, "aka": aka, "name": name, } else: yield { "_meta": { "id": my_id, "entity_type": entity_type }, "fields": fields, "name": name, }
def _generate_entities(url, name, office, years_active=None, parish=None):
    """for each scrapable page, yield an entity"""
    doc = BeautifulSoup(helpers.fetch_string(url), "html.parser")
    img = urljoin(_site_url, doc.find("img", class_="ms-rteImage-2")['src'])
    h3 = doc.find_all('h3', class_="ms-rteElement-H3")
    div = h3[0].find_next_sibling()
    current = 'Current Posts ' + div.get_text().strip()
    while div.name != 'h3':
        div = div.find_next_sibling()
        if div:
            current += ' ' + div.get_text().strip()
        else:
            break
    current = current.replace('dateMember', 'date Member')
    fields = [
        {"tag": "url", "value": url},
        {"tag": "Current Posts", "value": current},
        {"tag": "picture_url", "value": img},
        {"tag": "Office", "value": office},
    ]
    if years_active:
        fields.append({"tag": "Years Active", "value": years_active})
    if parish:
        fields.append({"tag": "Parish", "value": parish})
    try:
        p = h3[1].find_next_sibling()
        career = p.get_text().strip()
        while p.name == 'p':
            p = p.find_next_sibling()
            career += ' ' + p.get_text().strip()
        fields.append({"tag": "Parliamentary Career", "value": career})
    except IndexError:
        pass
    ps = doc.find_all("p", class_="ms-rteElement-P")
    for p in ps:
        p = p.get_text().strip()
        if 'Born' in p:
            fields.append({"tag": "date_of_birth", "value": get_date(p.split(':')[-1].strip())})
        elif 'Parents' in p:
            fields.append({"tag": "Parents", "value": p.split(':')[-1].strip()})
    return {
        "_meta": {"id": helpers.make_id(name), "entity_type": "person"},
        "fields": fields,
        "name": name
    }
def _generate_entities(): """for each scrapable page, yield an entity""" for url in _get_scrape_urls(): doc = BeautifulSoup(helpers.fetch_string(url, cache_hours=6), "html.parser") section = doc.find('section', id="pageContent") h1 = section.find("h1").get_text().strip() if '(' in h1: h1_without_bracket = h1.split('(')[0] + h1.split(')')[-1] h1_without_bracket = h1_without_bracket.strip() else: h1_without_bracket = h1 names1 = h1_without_bracket.split(',') names2 = [] for name in names1: for new_name in divide_name(name, ' & '): names2.append(new_name) new_names = [] for name in names2: for new_name in divide_name(name, ' and '): new_names.append(new_name) text = section.find("p").get_text().strip() fields = [ {"tag": "url", "value": url}, {"name": "text", "value": text} ] custom_fields = section.find_all("h2") for custom_field in custom_fields: field_name = custom_field.get_text().strip() if field_name == 'Defendants': values1 = section.find_all('div', class_="chargeDefendant") values2 = section.find_all('div', class_="chargeCharge") values = zip(values1, values2) field_value = ' '.join([value[0].get_text().strip() + ' ' + value[1].get_text().strip() for value in values]) else: field_value = custom_field.find_next_sibling('p').get_text().strip() fields.append({"tag": field_name, "value": field_value}) for name in new_names: name = name.strip() yield { "_meta": { "id": helpers.make_id(name), "entity_type": "company" }, "fields": fields, "name": name }
def get_rows(url):
    objects = []
    main_page = BeautifulSoup(helpers.fetch_string(url, cache_hours=6))
    rows = main_page.find('table').find_all('tr')
    for row in rows[1:]:
        name, _url = row.find_all('td')[:2]
        obj = {
            'name': _bs_to_utf(name),
            'picture_url': _site_ulr + _url.find('img')['src'] if _url.find('img') else _bs_to_utf(_url)
        }
        objects.append(obj)
    return objects
def _custom_opener(url, linux=True):
    if linux:
        return BeautifulSoup(helpers.fetch_string(url, cache_hours=6))
    else:
        from urllib2 import urlopen
        try:
            return BeautifulSoup(urlopen(url).read())
        except Exception as e:
            print e
def _custom_opener(url, linux=False):
    if linux:
        return BeautifulSoup(helpers.fetch_string(url, cache_hours=6))
    else:
        from urllib2 import urlopen
        try:
            return BeautifulSoup(urlopen(url).read())
        except Exception as e:
            print e
def _get_scrape_urls():
    for page in itertools.count(1):
        doc = BeautifulSoup(helpers.fetch_string(_base_url.format(page), cache_hours=6))
        # find all matching tags, bail if no more
        found_urls = False
        for a in doc.find_all("a"):
            if a.has_attr("href") and "most-wanted-detail" in a["href"]:
                yield urlparse.urljoin(_base_url, a["href"])
                found_urls = True
        if not found_urls:
            break
def _generate_entities(): """for each scrapable page, yield an entity""" run = True page = 0 while run: url = _base_url + str(page) page += 1 doc = BeautifulSoup(helpers.fetch_string(url), "html.parser") div = doc.find('div', id="resultsSearchBox") all_h3 = div.find_all("h3", id='') if not all_h3: run = False return for h3 in all_h3: a = h3.find('a') href = urljoin(_site_url, a['href']) name = a.get_text().split(':')[1].strip() sub = h3.find_next_sibling('sub') spans = sub.find_all('span') if spans: published = get_date(spans[0].get_text().strip()) modified = get_date(spans[1].get_text().strip()) else: sub = sub.get_text().strip() published = get_date(sub[11:21]) modified = get_date(sub[-10:]) if any(company in name.lower() for company in companies): entity_type = "company" else: entity_type = "person" fields = [{ "tag": "url", "value": href }, { "tag": "Published", "value": published }, { "tag": "Last Modified", "value": modified }] yield { "_meta": { "id": helpers.make_id(name), "entity_type": entity_type }, "fields": fields, "name": name }
def _get_scrape_urls(): """find all scrapable links on main page""" for url in _base_url, _base_url2: doc = BeautifulSoup(helpers.fetch_string(url, cache_hours=6), "html.parser") uls = doc.find_all("ul", class_="contentListing") for ul in uls: href = ul.find_all("a") for link in href: if link: yield link['href']
def _get_parties(url):
    party_objects = []
    main_page = BeautifulSoup(helpers.fetch_string(url, cache_hours=6))
    table_party = main_page.find('table', {'class': 'telbogTable'}).find_all('tr')[1:-1]
    for row in table_party:
        party_url = _site_ulr + row.find('a')['href']
        party_id = row.find('a')['href'].split('=')[1].replace('{', '').replace('}', '')
        party_name = row.find('a').text.encode('utf8')
        party_objects.append({'party_url': party_url, 'party_id': party_id, 'party_name': party_name})
    return party_objects
def get_rows(url):
    objects = []
    main_page = BeautifulSoup(helpers.fetch_string(url, cache_hours=6))
    rows = main_page.find('table').find_all('tr')
    for row in rows[1:]:
        name, _url = row.find_all('td')[:2]
        obj = {
            'name': _bs_to_utf(name),
            'picture_url': _site_ulr + _url.find('img')['src'] if _url.find('img') else _bs_to_utf(_url)
        }
        objects.append(obj)
    return objects
def _get_scrape_urls(): """find all scrapable links on main page""" url = _base_url doc = BeautifulSoup(helpers.fetch_string(url, cache_hours=6), "html.parser") div = doc.find("div", id="print_content") uls = div.find_all('ul') for ul in uls: href = ul.find_all("a") for link in href: link = link['href'] link = urljoin(_site_url, link) yield link
def _generate_entities(data):
    """for each scrapable page, yield an entity"""
    i = 0
    while i < len(data):
        release_date = datetime.strptime(data[i].text, '%m/%d/%Y')
        release_date = release_date.strftime('%Y-%m-%d')
        name = data[i + 1].text
        url = data[i + 1].find_element_by_tag_name('a').get_attribute("href")
        href = data[i + 2].find_element_by_tag_name('a').get_attribute("href")
        related = []
        if href:
            doc = BeautifulSoup(helpers.fetch_string(href), "html.parser")
            tds = doc.find_all("td", class_='ms-vb')
            for td in tds:
                try:
                    related.append(td.find('a')['href'])
                except AttributeError:
                    pass
        related_documents = ' '.join(related)
        fields = [
            {"name": "Release date", "value": release_date},
            {"tag": "url", "value": url},
            {"name": "Related documents", "value": related_documents}
        ]
        i += 3
        my_id = helpers.make_id(name)
        if len(my_id) > 99:
            my_id = my_id[:99]
        entity = {
            "_meta": {"id": my_id, "entity_type": "company"},
            "fields": fields,
            "name": name,
        }
        helpers.emit(entity)
def custom_opener(url, linux=True):
    """
    When developing on Windows call this with linux=False, but change it back
    to linux=True before the final contribution.
    :param url: input url
    :param linux: switch between linux or windows
    """
    from bs4 import BeautifulSoup
    from helpers import fetch_string
    if linux:
        return BeautifulSoup(fetch_string(url, cache_hours=6))
    else:
        from urllib2 import urlopen
        try:
            return BeautifulSoup(urlopen(url).read())
        except Exception as e:
            print e
def get_tables(url):
    objects = {'objects': []}
    main_page = BeautifulSoup(helpers.fetch_string(url, cache_hours=6))
    # main_page = BeautifulSoup(urlopen(url))
    tables = main_page.find_all('table')

    def __fill_helper(_tag):
        table_object['instance'].append({CUSTOM_TAG: _bs_to_utf(_tag), 'people': []})

    for table in tables:
        table_object = {'instance': []}
        rows = table.find_all('tr')
        for row in rows:
            tds = row.find_all('td')
            len_tds = len(tds)
            if len_tds:
                if len_tds > 1:
                    p_name = _bs_to_utf(tds[1])
                    p_name = re.sub("^\s+", "", p_name.split(".")[-1].strip())
                    person_info = {POSITION: _bs_to_utf(tds[0]), 'person_name': p_name}
                    if tds[0].find('a'):
                        person_info.update({'person_url': _site_ulr + tds[0].find('a')['href']})
                    if CUSTOM_TAG in table_object['instance'][-1]:
                        table_object['instance'][-1]['people'].append(person_info)
                    else:
                        __fill_helper(tds[0])
                else:
                    __fill_helper(row)
        objects['objects'].append(table_object)
    return objects
def _generate_entities(data):
    """for each scrapable page, yield an entity"""
    i = 0
    while i < len(data):
        release_date = datetime.strptime(data[i].text, '%m/%d/%Y')
        release_date = release_date.strftime('%Y-%m-%d')
        name = data[i + 1].text
        url = data[i + 1].find_element_by_tag_name('a').get_attribute("href")
        href = data[i + 2].find_element_by_tag_name('a').get_attribute("href")
        related = []
        if href:
            doc = BeautifulSoup(helpers.fetch_string(href), "html.parser")
            tds = doc.find_all("td", class_='ms-vb')
            for td in tds:
                try:
                    related.append(td.find('a')['href'])
                except AttributeError:
                    pass
        related_documents = ' '.join(related)
        fields = [
            {"name": "Release date", "value": release_date},
            {"tag": "url", "value": url},
            {"name": "Related documents", "value": related_documents}
        ]
        i += 3
        my_id = helpers.make_id(name)
        if len(my_id) > 99:
            my_id = my_id[:99]
        entity = {
            "_meta": {"id": my_id, "entity_type": "company"},
            "fields": fields,
            "name": name,
        }
        helpers.emit(entity)
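# Hedged sketch (assumption): _generate_entities(data) above consumes Selenium
# WebElements (it reads .text and calls find_element_by_tag_name), so its
# caller presumably collects three consecutive table cells per entry from a
# rendered page. The selector below is a placeholder, not taken from the
# source, and the driver is whatever old-style Selenium driver the scraper
# already uses (e.g. webdriver.Firefox()).
def _collect_cells(driver, listing_url):
    # return the cells _generate_entities(data) iterates over:
    # release date, name, related-documents link, repeated per row
    driver.get(listing_url)
    return driver.find_elements_by_css_selector("td.ms-vb2")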
def custom_opener(url):
    """
    Open url with helpers.fetch_string on Linux/Unix hosts; otherwise (e.g.
    when developing on Windows) fall back to urllib2.
    :param url: input url
    """
    import platform
    from helpers import fetch_string
    _OS_LINUX = "linux" in platform.system().lower() or 'unix' in platform.system().lower()
    if _OS_LINUX:
        return BeautifulSoup(fetch_string(url, cache_hours=6))
    else:
        from urllib2 import urlopen
        try:
            return BeautifulSoup(urlopen(url, timeout=20).read())
        except Exception as e:
            print e
def _get_parties(url):
    party_objects = []
    main_page = BeautifulSoup(helpers.fetch_string(url, cache_hours=6))
    table_party = main_page.find('table', {'class': 'telbogTable'}).find_all('tr')[1:-1]
    for row in table_party:
        party_url = _site_ulr + row.find('a')['href']
        party_id = row.find('a')['href'].split('=')[1].replace('{', '').replace('}', '')
        party_name = row.find('a').text.encode('utf8')
        party_objects.append({'party_url': party_url, 'party_id': party_id, 'party_name': party_name})
    return party_objects
def _generate_entities(url, name, office, years_active=None, parish=None):
    """for each scrapable page, yield an entity"""
    doc = BeautifulSoup(helpers.fetch_string(url), "html.parser")
    img = urljoin(_site_url, doc.find("img", class_="ms-rteImage-2")['src'])
    h3 = doc.find_all('h3', class_="ms-rteElement-H3")
    div = h3[0].find_next_sibling()
    current = 'Current Posts ' + div.get_text().strip()
    while div.name != 'h3':
        div = div.find_next_sibling()
        if div:
            current += ' ' + div.get_text().strip()
        else:
            break
    current = current.replace('dateMember', 'date Member')
    fields = [
        {"tag": "url", "value": url},
        {"tag": "Current Posts", "value": current},
        {"tag": "picture_url", "value": img},
        {"tag": "Office", "value": office},
    ]
    if years_active:
        fields.append({"tag": "Years Active", "value": years_active})
    if parish:
        fields.append({"tag": "Parish", "value": parish})
    try:
        p = h3[1].find_next_sibling()
        career = p.get_text().strip()
        while p.name == 'p':
            p = p.find_next_sibling()
            career += ' ' + p.get_text().strip()
        fields.append({"tag": "Parliamentary Career", "value": career})
    except IndexError:
        pass
    ps = doc.find_all("p", class_="ms-rteElement-P")
    for p in ps:
        p = p.get_text().strip()
        if 'Born' in p:
            fields.append({"tag": "date_of_birth", "value": get_date(p.split(':')[-1].strip())})
        elif 'Parents' in p:
            fields.append({"tag": "Parents", "value": p.split(':')[-1].strip()})
    return {
        "_meta": {"id": helpers.make_id(name), "entity_type": "person"},
        "fields": fields,
        "name": name
    }
def _get_scrape_urls(): """find all scrapable links on main page""" doc = BeautifulSoup(helpers.fetch_string(_base_url), "html.parser") council = doc.find('div', id='div_938d72bb-8154-4b6a-bd71-8144ca6bf1a0') house = doc.find('div', id='div_47793ec9-3449-46a3-9095-f2eb8c475846') council_lis = council.find_all("div", class_="link-item") house_lis = house.find_all("li", class_="dfwp-item") for li in council_lis: person = li.find("a") link = person["href"] name = _get_name(person.get_text()) office = "Legislative Council" entity = _generate_entities(link, name, office) yield entity for li in house_lis: try: parish = li.find("div", class_="groupheader").get_text().strip() except AttributeError: continue all_div = li.find_all("div", class_="link-item") for div in all_div: person = div.find("a") link = person["href"] name = _get_name(person.get_text()) office = "House of Keys" years_active = None entity = _generate_entities(link, name, office, years_active, parish) yield entity doc = BeautifulSoup(helpers.fetch_string(_base_url2), "html.parser") div = doc.find('div', id='div_a1526572-2de9-494b-a410-6fdc17d3b84e') trs = div.find_all('tr', class_='ms-itmhover') for tr in trs: try: td = tr.find_all('td') name = _get_name(td[1].get_text()) office = "House of Keys" link = urljoin(_site_url, td[3].find('a')['href']) years_active = td[2].get_text().strip() try: date = int(years_active.split()[-1]) if date < 1990: continue except ValueError: pass if '.pdf' in link: fields = [{ "tag": "url", "value": link }, { "tag": "Years Active", "value": years_active }, { "tag": "Office", "value": office }] yield { "_meta": { "id": helpers.make_id(name), "entity_type": "person" }, "fields": fields, "name": name } continue entity = _generate_entities(link, name, office, years_active) yield entity except TypeError: pass
def _get_scrape_urls(): """find all scrapable links on main page""" doc = BeautifulSoup(helpers.fetch_string(_base_url), "html.parser") council = doc.find('div', id='div_938d72bb-8154-4b6a-bd71-8144ca6bf1a0') house = doc.find('div', id='div_47793ec9-3449-46a3-9095-f2eb8c475846') council_lis = council.find_all("div", class_="link-item") house_lis = house.find_all("li", class_="dfwp-item") for li in council_lis: person = li.find("a") link = person["href"] name = _get_name(person.get_text()) office = "Legislative Council" entity = _generate_entities(link, name, office) yield entity for li in house_lis: try: parish = li.find("div", class_="groupheader").get_text().strip() except AttributeError: continue all_div = li.find_all("div", class_="link-item") for div in all_div: person = div.find("a") link = person["href"] name = _get_name(person.get_text()) office = "House of Keys" years_active = None entity = _generate_entities(link, name, office, years_active, parish) yield entity doc = BeautifulSoup(helpers.fetch_string(_base_url2), "html.parser") div = doc.find('div', id='div_a1526572-2de9-494b-a410-6fdc17d3b84e') trs = div.find_all('tr', class_='ms-itmhover') for tr in trs: try: td = tr.find_all('td') name = _get_name(td[1].get_text()) office = "House of Keys" link = urljoin(_site_url, td[3].find('a')['href']) years_active = td[2].get_text().strip() try: date = int(years_active.split()[-1]) if date < 1990: continue except ValueError: pass if '.pdf' in link: fields = [ {"tag": "url", "value": link}, {"tag": "Years Active", "value": years_active}, {"tag": "Office", "value": office} ] yield { "_meta": { "id": helpers.make_id(name), "entity_type": "person" }, "fields": fields, "name": name } continue entity = _generate_entities(link, name, office, years_active) yield entity except TypeError: pass
def _generate_entities(): """for each scrapable page, yield an entity""" doc = BeautifulSoup(helpers.fetch_string(_base_url[0]), "html.parser") form = doc.find('form', {'name': 'criminalqueryeng_p2'}) tables = form.find_all('table', {'bgcolor': '#84BD00'}) tr = tables[0].find_all('tr') i = 1 while i < len(tr): td = tr[i].find_all('td') name = _get_name(td[2].get_text().strip()) date_filing = _get_date(td[1].get_text().strip()) try: url = td[6].find('a')['href'] except TypeError: url = '' summarized_facts = td[5].get_text().strip() fields = [{"name": "Summarized Facts", "value": summarized_facts}, {"name": "Press Release", "value": url}, {"name": "Date of Complaint Filing", "value": date_filing}] yield { "_meta": { "id": helpers.make_id(name), "entity_type": "company" }, "fields": fields, "name": name, } i += 2 tr = tables[1].find_all('tr') i = 1 while i < len(tr): td = tr[i].find_all('td') name = _get_name(td[4].get_text().strip()) date_filing = _get_date(td[1].get_text().strip()) try: url = td[8].find('a')['href'] except TypeError: url = '' summarized_facts = td[7].get_text().strip() baht = td[9].get_text().strip() section = td[5].get_text().strip() law = td[6].get_text().strip() nomer = td[3].get_text().strip() fields = [{"name": "Summarized Facts", "value": summarized_facts}, {"name": "Press Release", "value": url}, {"name": "Date of Complaint Filing", "value": date_filing}, {"name": "Amount of Fines (Baht)", "value": baht}, {"name": "Section", "value": section}, {"name": "Relevant Law", "value": law}, {"name": "Order Number", "value": nomer}, ] yield { "_meta": { "id": helpers.make_id(name), "entity_type": "company" }, "fields": fields, "name": name, } i += 2 doc = BeautifulSoup(helpers.fetch_string(_base_url[1]), "html.parser") tr = doc.find_all('tr') i = 0 while i < len(tr): try: td=tr[i].find_all('td') name = _get_name(td[1].get_text().strip()) type_personal = td[2].get_text().strip() try: url = td[3].find('a')['href'] except TypeError: url = '' summarized_facts = td[4].get_text().strip() administrative_orders = td[5].get_text().strip() effective_date = td[6].get_text().strip() fields = [{"name": "Type of Personal", "value": type_personal}, {"name": "Press Release", "value": url}, {"name": "Date of Complaint Filing", "value": date_filing}, {"name": "Administrative Orders", "value": administrative_orders}, {"name": "Summarized Facts", "value": summarized_facts}, {"name": "Effective Date", "value": effective_date}, ] my_id = helpers.make_id(name) if len(my_id) > 99: my_id = my_id[:99] yield { "_meta": { "id": my_id, "entity_type": "company" }, "fields": fields, "name": name, } i += 2 except: i += 1 doc = BeautifulSoup(helpers.fetch_string(_base_url[2]), "html.parser") tr = doc.find_all('tr') i = 0 while i < len(tr): try: td=tr[i].find_all('td') name = _get_name(td[3].get_text().strip()) sanction = _get_date(td[1].get_text().strip()) summarized_facts = td[7].get_text().strip() nomer = td[2].get_text().strip() types_business = td[4].get_text().strip() relevant_law = td[5].get_text().strip() section = td[6].get_text().strip() baht = td[10].get_text().strip() fields = [{"name": "Date of Imposing the Administrative Sanction", "value": sanction}, {"name": "Types of Business", "value": types_business}, {"name": "Summarized Facts", "value": summarized_facts}, {"name": "Order Number", "value": nomer}, {"name": "Relevant Law", "value": relevant_law}, {"name": "Section", "value": section}, {"name": "Amount of Fines (Baht)", "value": baht}, ] my_id = helpers.make_id(name) if len(my_id) > 99: 
my_id = my_id[:99] yield { "_meta": { "id": my_id, "entity_type": "company" }, "fields": fields, "name": name, } i += 2 except: i += 1
def _get_people(party_obj):
    people_obj = []
    for party in party_obj:
        modified_url = party['party_url'] + PAGINATION
        modified_url = quote(modified_url, safe=SAFE_QUOTE)
        page_object = BeautifulSoup(helpers.fetch_string(modified_url, cache_hours=6))
        table_party = page_object.find('table', {'class': 'telbogTable'}).find_all('tr')[1:]
        for person in table_party:
            person_url = _site_ulr + person['onclick'].replace('document.location=(\'', '').replace('\')', '')
            person_id = person_url.split('/')[-1].strip('.aspx')
            all_td = person.find_all('td')
            person_name = ' '.join([_.text for _ in all_td[:2]])
            position = all_td[3].text
            phone = all_td[4].text.split(':')[-1].strip()
            try:
                profile_pic = _site_ulr + '/' + all_td[-1].find('img')['src']
            except TypeError:
                profile_pic = None
            people_entity = _create_entity()
            people_entity['_meta']['entity_type'] = 'person'
            people_entity['name'] = person_name
            conc_names = person_name + position + person_id
            people_entity['_meta']['id'] = _create_id(conc_names)
            fields = [
                {'tag': 'political_party', 'value': party['party_name']},
                {'tag': 'url', 'value': person_url},
                {'tag': 'position', 'value': position},
                {'tag': 'phone_number', 'value': phone.strip('View biography')},
                {'tag': 'country', 'value': 'Denmark'},
                {'tag': 'person_name', 'value': person_name}
            ]
            if profile_pic:
                fields.append({
                    'tag': 'picture_url',
                    'value': profile_pic.replace('~/media/', 'Members/~/media/').replace('84', '133').replace('x84', 'x133')
                })
            open_person_url = BeautifulSoup(helpers.fetch_string(quote(person_url, safe=SAFE_QUOTE), cache_hours=6))
            bio = open_person_url.find('div', {'class': 'tabContent clearfix'})
            first_block = bio.find('p').text
            regexp_born_in_place = re.compile('born (.+),')
            regexp_born = re.compile(r'born (.+).')
            try:
                born_string = regexp_born_in_place.search(first_block).group(0).split(',')[0].strip('born ')
            except AttributeError:
                born_string = regexp_born.search(first_block).group(0).split(',')[0].split('.')[0].strip('born ')
            if 'in' in born_string or ' at ' in born_string or ' on ' in born_string:
                try:
                    date, place = born_string.split(' in ')
                except ValueError:
                    try:
                        date, place = born_string.split(' at ')
                    except ValueError:
                        date, place = born_string.split(' on ')
                fields.append({'tag': 'date_of_birth', 'value': str(parser.parse(date)).split(' ')[0]})
                fields.append({'tag': 'place_of_birth', 'value': place})
            else:
                try:
                    fields.append({'tag': 'date_of_birth', 'value': str(parser.parse(born_string)).split(' ')[0]})
                except ValueError:
                    fields.append({'tag': 'date_of_birth', 'value': str(parser.parse(born_string.split('.')[0])).split(' ')[0]})
            people_entity['fields'] = fields
            people_obj.append(people_entity)
    return people_obj
def _get_people(party_obj):
    people_obj = []
    for party in party_obj:
        modified_url = party['party_url'] + PAGINATION
        modified_url = quote(modified_url, safe=SAFE_QUOTE)
        page_object = BeautifulSoup(helpers.fetch_string(modified_url, cache_hours=6))
        table_party = page_object.find('table', {'class': 'telbogTable'}).find_all('tr')[1:]
        for person in table_party:
            person_url = _site_ulr + person['onclick'].replace('document.location=(\'', '').replace('\')', '')
            person_id = person_url.split('/')[-1].strip('.aspx')
            all_td = person.find_all('td')
            person_name = ' '.join([_.text for _ in all_td[:2]])
            position = all_td[3].text
            phone = all_td[4].text.split(':')[-1].strip()
            try:
                profile_pic = _site_ulr + '/' + all_td[-1].find('img')['src']
            except TypeError:
                profile_pic = None
            people_entity = _create_entity()
            people_entity['_meta']['entity_type'] = 'person'
            people_entity['name'] = person_name
            conc_names = person_name + position + person_id
            people_entity['_meta']['id'] = _create_id(conc_names)
            fields = [
                {'tag': 'political_party', 'value': party['party_name']},
                {'tag': 'url', 'value': person_url},
                {'tag': 'position', 'value': position},
                {'tag': 'phone_number', 'value': phone.strip('View biography')},
                {'tag': 'country', 'value': 'Denmark'},
                {'tag': 'person_name', 'value': person_name}
            ]
            if profile_pic:
                fields.append({
                    'tag': 'picture_url',
                    'value': profile_pic.replace('~/media/', 'Members/~/media/').replace('84', '133').replace('x84', 'x133')
                })
            open_person_url = BeautifulSoup(helpers.fetch_string(quote(person_url, safe=SAFE_QUOTE), cache_hours=6))
            bio = open_person_url.find('div', {'class': 'tabContent clearfix'})
            first_block = bio.find('p').text
            regexp_born_in_place = re.compile('born (.+),')
            regexp_born = re.compile(r'born (.+).')
            try:
                born_string = regexp_born_in_place.search(first_block).group(0).split(',')[0].strip('born ')
            except AttributeError:
                born_string = regexp_born.search(first_block).group(0).split(',')[0].split('.')[0].strip('born ')
            if 'in' in born_string or ' at ' in born_string or ' on ' in born_string:
                try:
                    date, place = born_string.split(' in ')
                except ValueError:
                    try:
                        date, place = born_string.split(' at ')
                    except ValueError:
                        date, place = born_string.split(' on ')
                fields.append({'tag': 'date_of_birth', 'value': str(parser.parse(date)).split(' ')[0]})
                fields.append({'tag': 'place_of_birth', 'value': place})
            else:
                try:
                    fields.append({'tag': 'date_of_birth', 'value': str(parser.parse(born_string)).split(' ')[0]})
                except ValueError:
                    fields.append({'tag': 'date_of_birth', 'value': str(parser.parse(born_string.split('.')[0])).split(' ')[0]})
            people_entity['fields'] = fields
            people_obj.append(people_entity)
    return people_obj
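# Hedged sketch (assumption): _create_entity() and _create_id() are called in
# _get_people() above but are not part of these excerpts. A minimal pair that
# is consistent with how the results are filled in (an empty entity skeleton
# plus a deterministic id derived from the concatenated names) could be:
import hashlib

def _create_entity():
    return {'_meta': {'id': None, 'entity_type': None}, 'name': None, 'fields': []}

def _create_id(seed):
    # deterministic id from the seed string; the real helper may use a
    # different hashing or slugging scheme
    return hashlib.md5(seed.encode('utf-8')).hexdigest()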
def _generate_entities(): """for each scrapable page, yield an entity""" for url in _get_scrape_urls(): doc = BeautifulSoup(helpers.fetch_string(url, cache_hours=6), "html.parser") section = doc.find('section', id="pageContent") h1 = section.find("h1").get_text().strip() if '(' in h1: h1_without_bracket = h1.split('(')[0] + h1.split(')')[-1] h1_without_bracket = h1_without_bracket.strip() else: h1_without_bracket = h1 names1 = h1_without_bracket.split(',') names2 = [] for name in names1: for new_name in divide_name(name, ' & '): names2.append(new_name) new_names = [] for name in names2: for new_name in divide_name(name, ' and '): new_names.append(new_name) text = section.find("p").get_text().strip() fields = [{ "tag": "url", "value": url }, { "name": "text", "value": text }] custom_fields = section.find_all("h2") for custom_field in custom_fields: field_name = custom_field.get_text().strip() if field_name == 'Defendants': values1 = section.find_all('div', class_="chargeDefendant") values2 = section.find_all('div', class_="chargeCharge") values = zip(values1, values2) field_value = ' '.join([ value[0].get_text().strip() + ' ' + value[1].get_text().strip() for value in values ]) else: field_value = custom_field.find_next_sibling( 'p').get_text().strip() fields.append({"tag": field_name, "value": field_value}) for name in new_names: name = name.strip() yield { "_meta": { "id": helpers.make_id(name), "entity_type": "company" }, "fields": fields, "name": name }