def test_transform_feature(self):
    """rows.transform must apply the callback to each row of every
    source table, keep the declared field set and drop the rows for
    which the callback returns None."""

    def add_meta(row, table):
        # Rows with a missing or too-small percent value are discarded.
        if row.percent_column is None or row.percent_column < 0.1269:
            return None
        data = row._asdict()
        data['meta'] = ', '.join(
            '{} => {}'.format(key, value)
            for key, value in table._meta.items()
        )
        return data

    expected_fields = utils.table.fields.copy()
    expected_fields['meta'] = rows.fields.TextField

    sources = [utils.table] * 3
    result = rows.transform(expected_fields, add_meta, *sources)
    self.assertEqual(result.fields, expected_fields)

    # Re-run the callback by hand to build the expected surviving rows.
    kept = [add_meta(row, utils.table) for row in utils.table] * 3
    kept = [row for row in kept if row is not None]
    self.assertEqual(len(result), len(kept))
    for expected_row, row in zip(kept, result):
        self.assertEqual(expected_row, dict(row._asdict()))
def search_router_database(query):
    """Search the remote router database and return the matching rows.

    Posts *query* to the search endpoint, parses the HTML table it
    returns, restricts the fields to the known FIELD_NAMES (plus an
    integer ``id``) and runs every row through ``transform_row``.
    """
    payload = {
        'action': 'routerList',
        'criteria': query,
        'site': 'drupal',
    }
    response = requests.post(URL_ROUTER_SEARCH, data=payload)
    table = rows.import_from_html(
        BytesIO(response.content),
        encoding=response.encoding,
        properties=True,
    )

    # Keep only the fields we know about, in a stable order.
    fields = OrderedDict([('id', rows.fields.IntegerField)])
    for name in FIELD_NAMES:
        if name in table.fields:
            fields[name] = table.fields[name]

    return rows.transform(fields, transform_row, table)
def router_images(router_id):
    """Return the firmware-image listing for *router_id* as a rows table.

    Fetches the detail page, parses its second HTML table (cell HTML
    preserved so the anchor tags survive) and normalizes every row into
    plain text fields plus an absolute download URL.
    """
    html = _router_detail(router_id)
    table = rows.import_from_html(BytesIO(html), index=1, preserve_html=True)

    fields = OrderedDict([
        ('date', rows.fields.DateField),
        ('filename', rows.fields.TextField),
        ('url', rows.fields.TextField),
        ('size', rows.fields.TextField),
        ('description', rows.fields.TextField),
    ])

    def clean(row, table):
        # The "filename" cell is an anchor tag; pull out its href/text.
        anchor = tag_to_dict(row.filename)
        full_url = url_join(URL_ROUTER_SEARCH, url_quote(anchor['href']))
        return {
            'date': extract_text(row.date),
            'description': extract_text(row.description),
            'filename': anchor['text'],
            'size': extract_text(row.size),
            'url': full_url,
        }

    return rows.transform(fields, clean, table)
def test_transform_feature(self):
    """rows.transform must apply the callback to each row of every
    source table and drop the rows for which it returns None."""

    def transformation_function(row, table):
        # Guard against None first: comparing None with a float raises
        # TypeError on Python 3, and percent_column may be empty (the
        # sibling version of this test uses the same guard).
        if row.percent_column is None or row.percent_column < 0.1269:
            return None  # discard this row
        new = row._asdict()
        new['meta'] = ', '.join([
            '{} => {}'.format(key, value)
            for key, value in table._meta.items()
        ])
        return new

    fields = utils.table.fields.copy()
    fields.update({'meta': rows.fields.UnicodeField})
    tables = [utils.table] * 3
    result = rows.transform(fields, transformation_function, *tables)
    self.assertEqual(result.fields, fields)

    # Build the expected surviving rows by applying the callback by hand.
    not_discarded = [
        transformation_function(row, utils.table) for row in utils.table
    ] * 3
    not_discarded = [row for row in not_discarded if row is not None]
    self.assertEqual(len(result), len(not_discarded))
    for expected_row, row in zip(not_discarded, result):
        self.assertEqual(expected_row, dict(row._asdict()))
import rows

extract_links = rows.plugins.html.extract_links
extract_text = rows.plugins.html.extract_text

# Download the WNPP page.
url = "http://wnpp.debian.net/"
response = requests.get(url)
html = response.content

# Parse the 11th table on the page, keeping each cell's raw HTML so the
# anchor tags are still available to extract_links below.
packages = rows.import_from_html(BytesIO(html), index=10, preserve_html=True)


def transform(row, table):
    """Collect the links from the "project" cell, then strip HTML from
    every textual field."""
    data = row._asdict()
    data["links"] = " ".join(extract_links(row.project))
    for key in data:
        if isinstance(data[key], six.text_type):
            data[key] = extract_text(data[key])
    return data


new_fields = packages.fields.copy()
new_fields["links"] = rows.fields.TextField
packages = rows.transform(new_fields, transform, packages)
rows.export_to_csv(packages, "debian-wnpp.csv")
# Fetch the list of Brazilian municipalities from Portuguese Wikipedia.
city_list_url = 'https://pt.wikipedia.org/wiki/Lista_de_munic%C3%ADpios_do_Brasil'
response = requests.get(city_list_url)
html = response.content

# Every municipality is an <li> under a table cell: capture the visible
# text ("name") and the article href ("link").
cities = rows.import_from_xpath(
    BytesIO(html),
    rows_xpath='//table/tr/td/ul/li',
    fields_xpath=OrderedDict([('name', './/text()'), ('link', './/a/@href')]),
)

# "City Name (UF)" -> ("City Name", "UF")
regexp_city_state = re.compile(r'(.*) \(([A-Z]{2})\)')


def transform(row, table):
    """Expand "link" to an absolute URL and split "state" out of "name"."""
    data = row._asdict()
    data['link'] = urlparse.urljoin('https://pt.wikipedia.org', data['link'])
    name, state = regexp_city_state.findall(data['name'])[0]
    data['name'] = name
    data['state'] = state
    return data


new_fields = OrderedDict([
    ('name', cities.fields['name']),
    ('state', rows.fields.TextField),  # new field, parsed out of "name"
    ('link', cities.fields['link']),
])
cities = rows.transform(new_fields, transform, cities)
rows.export_to_csv(cities, 'brazilian-cities.csv')
# Download the list of Brazilian municipalities from Portuguese Wikipedia.
city_list_url = "https://pt.wikipedia.org/wiki/Lista_de_munic%C3%ADpios_do_Brasil"
response = requests.get(city_list_url)
html = response.content

# Every municipality is an <li> under a table cell: capture the visible
# text ("name") and the article href ("link").
cities = rows.import_from_xpath(
    BytesIO(html),
    rows_xpath="//table/tr/td/ul/li",
    fields_xpath=OrderedDict([("name", ".//text()"), ("link", ".//a/@href")]),
)

# "City Name (UF)" -> ("City Name", "UF")
regexp_city_state = re.compile(r"(.*) \(([A-Z]{2})\)")


def transform(row, table):
    """Expand "link" to an absolute URL and split "state" out of "name"."""
    data = row._asdict()
    name, state = regexp_city_state.findall(data["name"])[0]
    data["name"] = name
    data["state"] = state
    data["link"] = urljoin("https://pt.wikipedia.org", data["link"])
    return data


new_fields = OrderedDict([
    ("name", cities.fields["name"]),
    ("state", rows.fields.TextField),  # new field, parsed out of "name"
    ("link", cities.fields["link"]),
])
cities = rows.transform(new_fields, transform, cities)
rows.export_to_csv(cities, "brazilian-cities.csv")