Exemple #1
0
    def test_transform_feature(self):
        """Check ``rows.transform`` over three copies of the fixture table.

        The row function drops rows whose ``percent_column`` is missing or
        below the threshold and adds a ``meta`` text to the others. The
        result must expose the extended field mapping and exactly the rows
        that the function kept.
        """
        threshold = 0.1269

        def transformation_function(row, table):
            """Return ``row`` as a dict with a ``meta`` key, or ``None``."""
            percent = row.percent_column
            if percent is None or percent < threshold:
                return None  # discard this row

            converted = row._asdict()
            # Flatten the table metadata into one "key => value, ..." string
            pairs = []
            for key, value in table._meta.items():
                pairs.append('{} => {}'.format(key, value))
            converted['meta'] = ', '.join(pairs)
            return converted

        expected_fields = utils.table.fields.copy()
        expected_fields['meta'] = rows.fields.TextField

        sources = [utils.table for _ in range(3)]
        result = rows.transform(expected_fields, transformation_function,
                                *sources)
        self.assertEqual(result.fields, expected_fields)

        # Apply the function by hand once per row, keep what survives and
        # repeat that sequence for each of the three input tables.
        converted_rows = (transformation_function(row, utils.table)
                          for row in utils.table)
        kept_once = [item for item in converted_rows if item is not None]
        expected_rows = kept_once * 3
        self.assertEqual(len(result), len(expected_rows))

        for expected, actual in zip(expected_rows, result):
            self.assertEqual(expected, dict(actual._asdict()))
Exemple #2
0
def search_router_database(query):
    """Search the router database and return the matches as a table.

    ``query`` is posted as the ``criteria`` of a ``routerList`` action to
    ``URL_ROUTER_SEARCH``. The HTML answer is imported with ``rows`` and
    then passed through ``transform_row``. The resulting table has an
    integer ``id`` field followed by every name from ``FIELD_NAMES`` that
    is present in the imported table.
    """
    payload = {'action': 'routerList', 'criteria': query, 'site': 'drupal'}
    response = requests.post(URL_ROUTER_SEARCH, data=payload)
    table = rows.import_from_html(
        BytesIO(response.content),
        encoding=response.encoding,
        properties=True,
    )

    # "id" always comes first; the remaining fields keep FIELD_NAMES order
    # and reuse the types detected while importing.
    available = table.fields
    selected = [('id', rows.fields.IntegerField)]
    selected.extend(
        (name, available[name]) for name in FIELD_NAMES if name in available
    )
    fields = OrderedDict(selected)

    return rows.transform(fields, transform_row, table)
Exemple #3
0
def router_images(router_id):
    """Return a table describing the image files listed for a router.

    The detail page HTML comes from ``_router_detail(router_id)``. It is
    imported with the cell HTML preserved so that the link inside the
    ``filename`` cell can be read. Each row becomes plain text plus an
    absolute ``url`` built against ``URL_ROUTER_SEARCH``.
    """
    schema = (
        ('date', rows.fields.DateField),
        ('filename', rows.fields.TextField),
        ('url', rows.fields.TextField),
        ('size', rows.fields.TextField),
        ('description', rows.fields.TextField),
    )
    fields = OrderedDict()
    for field_name, field_type in schema:
        fields[field_name] = field_type

    def build_image_row(row, table):
        """Convert one HTML-preserving row into the final plain-text row."""
        anchor = tag_to_dict(row.filename)
        quoted_href = url_quote(anchor['href'])
        full_url = url_join(URL_ROUTER_SEARCH, quoted_href)

        converted = {}
        converted['date'] = extract_text(row.date)
        converted['description'] = extract_text(row.description)
        converted['filename'] = anchor['text']
        converted['size'] = extract_text(row.size)
        converted['url'] = full_url
        return converted

    page = _router_detail(router_id)
    table = rows.import_from_html(
        BytesIO(page),
        index=1,
        preserve_html=True,
    )
    return rows.transform(fields, build_image_row, table)
Exemple #4
0
    def test_transform_feature(self):
        """Check ``rows.transform`` over three copies of the fixture table.

        The transformation drops rows whose ``percent_column`` is missing or
        below 0.1269 and adds a ``meta`` text field to the remaining ones.
        The result must expose the extended field mapping and exactly the
        rows that were kept.
        """

        def transformation_function(row, table):
            """Return ``row`` as a dict with a ``meta`` key, or ``None``."""
            # Test for a missing value before comparing: on Python 3
            # ``None < 0.1269`` raises TypeError, whereas Python 2 evaluated
            # it to True (which also discarded the row).
            if row.percent_column is None or row.percent_column < 0.1269:
                return None  # discard this row

            new = row._asdict()
            # Flatten the table metadata into one "key => value, ..." string
            new['meta'] = ', '.join('{} => {}'.format(key, value)
                                    for key, value in table._meta.items())
            return new

        fields = utils.table.fields.copy()
        fields.update({'meta': rows.fields.UnicodeField})
        tables = [utils.table] * 3
        result = rows.transform(fields, transformation_function, *tables)
        self.assertEqual(result.fields, fields)

        # Expected rows: apply the same function by hand, repeated once per
        # input table, then drop the discarded (None) entries.
        not_discarded = [transformation_function(row, utils.table)
                         for row in utils.table] * 3
        not_discarded = [row for row in not_discarded if row is not None]
        self.assertEqual(len(result), len(not_discarded))

        for expected_row, row in zip(not_discarded, result):
            self.assertEqual(expected_row, dict(row._asdict()))
Exemple #5
0
import rows

# Short aliases for the helpers provided by the rows HTML plugin
extract_text = rows.plugins.html.extract_text
extract_links = rows.plugins.html.extract_links

# Download the page
url = "http://wnpp.debian.net/"
response = requests.get(url)
html = response.content

# Build the table while keeping the HTML markup of every cell, so that the
# links can still be extracted afterwards
packages = rows.import_from_html(
    BytesIO(html),
    index=10,
    preserve_html=True,
)


def transform(row, table):
    """Add "links" taken from the "project" field and strip HTML from text.

    Returns the row as a dict. ``extract_text`` is applied to every text
    value, including the newly added "links" string.
    """
    record = row._asdict()
    record["links"] = " ".join(extract_links(row.project))

    # Pick the text-valued keys first, then clean them in the same order
    text_keys = [
        name
        for name, content in record.items()
        if isinstance(content, six.text_type)
    ]
    for name in text_keys:
        record[name] = extract_text(record[name])
    return record


# The output table keeps every imported field and gains a text "links" field
new_fields = packages.fields.copy()
new_fields.update({"links": rows.fields.TextField})
packages = rows.transform(new_fields, transform, packages)

# Write the cleaned table to disk
rows.export_to_csv(packages, "debian-wnpp.csv")
# Download the list of Brazilian municipalities from Portuguese Wikipedia
city_list_url = 'https://pt.wikipedia.org/wiki/Lista_de_munic%C3%ADpios_do_Brasil'
response = requests.get(city_list_url)
html = response.content

# Rows are the elements matched by rows_xpath; the field expressions are
# written as relative paths (".//")
cities = rows.import_from_xpath(
    BytesIO(html),
    rows_xpath='//table/tr/td/ul/li',
    fields_xpath=OrderedDict((
        ('name', './/text()'),
        ('link', './/a/@href'),
    )),
)

# Two groups: the text before " (XX)" and the two capital letters inside
# the parentheses
regexp_city_state = re.compile(r'(.*) \(([A-Z]{2})\)')


def transform(row, table):
    """Make "link" an absolute URL and split "name" into "name" and "state".

    Returns the row as a dict. A name that ``regexp_city_state`` does not
    match raises IndexError, because the first match is taken
    unconditionally.
    """
    record = row._asdict()
    record['link'] = urlparse.urljoin('https://pt.wikipedia.org', record['link'])

    # The first match carries both captured groups: (city name, state code)
    city, state = regexp_city_state.findall(record['name'])[0]
    record['name'] = city
    record['state'] = state
    return record


# Output columns, in order; "name" and "link" reuse the imported field types
new_fields = OrderedDict([
    ('name', cities.fields['name']),
    ('state', rows.fields.TextField),  # new field
    ('link', cities.fields['link']),
])
cities = rows.transform(new_fields, transform, cities)
rows.export_to_csv(cities, 'brazilian-cities.csv')
# Download the list of Brazilian municipalities from Portuguese Wikipedia
city_list_url = "https://pt.wikipedia.org/wiki/Lista_de_munic%C3%ADpios_do_Brasil"
response = requests.get(city_list_url)
html = response.content

# Rows are the elements matched by rows_xpath; the field expressions are
# written as relative paths (".//")
cities = rows.import_from_xpath(
    BytesIO(html),
    rows_xpath="//table/tr/td/ul/li",
    fields_xpath=OrderedDict(
        [
            ("name", ".//text()"),
            ("link", ".//a/@href"),
        ]
    ),
)

# Two groups: the text before " (XX)" and the two capital letters inside
# the parentheses
regexp_city_state = re.compile(r"(.*) \(([A-Z]{2})\)")


def transform(row, table):
    """Make "link" an absolute URL and split "name" into "name" and "state".

    Returns the row as a dict. A name that ``regexp_city_state`` does not
    match raises IndexError, because the first match is taken
    unconditionally.
    """
    record = row._asdict()
    record["link"] = urljoin("https://pt.wikipedia.org", record["link"])

    # The first match carries both captured groups: (city name, state code)
    matches = regexp_city_state.findall(record["name"])
    record["name"], record["state"] = matches[0]
    return record


# Output columns, in order; "name" and "link" reuse the imported field types
new_fields = OrderedDict(
    (
        ("name", cities.fields["name"]),
        ("state", rows.fields.TextField),  # new field
        ("link", cities.fields["link"]),
    )
)
cities = rows.transform(new_fields, transform, cities)
rows.export_to_csv(cities, "brazilian-cities.csv")