Example #1
0
    def test_entrypoints(self):
        """Check which source class get_generator() returns for three kinds of input.

        A list, a generator object and the resolved target of a CSV URL are
        each passed to ``get_generator`` and the type of the result is asserted.
        """
        from rowgenerators.generator.iterator import IteratorSource
        from rowgenerators.generator.generator import GeneratorSource
        from rowgenerators.generator.csv import CsvSource

        def single_none():
            yield None

        csv_url = 'http://public.source.civicknowledge.com/example.com/sources/unicode-utf8.csv'

        # Plain list input
        from_list = get_generator([])
        self.assertIsInstance(from_list, IteratorSource)

        # Generator object input
        from_generator = get_generator(single_none())
        self.assertIsInstance(from_generator, GeneratorSource)

        # Resolved target of a remote CSV URL
        csv_target = parse_app_url(csv_url).get_resource().get_target()
        from_csv = get_generator(csv_target)
        self.assertIsInstance(from_csv, CsvSource)
Example #2
0
    def test_zip_file(self):
        """Build a generator from a metapack ZIP URL string and print all of its rows."""
        package_url = 'metapack+http://library.metatab.org/example.com-simple_example-2017-us-1.zip#metadata.csv'

        rows = list(get_generator(package_url))

        print(rows)
Example #3
0
    def test_fixed(self):
        """Print the first rows of a fixed-width file read through a Table definition.

        A four-column ``Table`` is built, a ``fixed+file:`` URL is parsed, and
        the generator returned for that URL and table is iterated for at most
        ten rows. Nothing is asserted; the intermediate objects are printed.
        """
        from itertools import islice

        from rowgenerators import Table

        # (name, type, width) for each column, in file order
        column_specs = (
            ('id', int, 6),
            ('uuid', str, 34),
            ('int', int, 3),
            ('float', float, 14),
        )

        table = Table()
        for name, datatype, width in column_specs:
            table.add_column(name, datatype, width)

        print(str(table))

        # The parser itself is not used below; the call is kept so that parser
        # construction is still exercised.
        table.make_fw_row_parser()

        fixed_url = parse_app_url(
            'fixed+file:/Volumes/Storage/Downloads/test_data/fixed/simple-example.txt'
        )

        print(fixed_url.get_resource())
        print(fixed_url.get_resource().get_target())

        row_source = get_generator(fixed_url, table=table)

        print(type(row_source))

        for record in islice(row_source, 10):
            print(record)
Example #4
0
    def test_program(self):
        """Run the ``rowgen.py`` script as a 'program' source and check what it reports.

        The rows produced by the generator are indexed by ``"<type>-<k>"`` and
        two of the resulting values are asserted.
        """
        program_url = parse_app_url(script_path('rowgen.py'))
        program_url.scheme_extension = 'program'

        # Every entry carries the same value, 'a'; key order is preserved.
        env = dict.fromkeys(
            ('--long-arg', '-s', 'ENV_VAR', 'prop1', 'prop2'), 'a')

        source = get_generator(program_url,
                               working_dir=dirname(program_url.path),
                               env=env)

        print(type(source))

        seen = {}

        for record in source.iter_rp:
            key = '-'.join((record['type'], record['k']))
            seen[key] = record.v
            print(record)

        self.assertEqual('a', seen['prop-prop1'])
        self.assertEqual('{"prop1": "a", "prop2": "a"}',
                         seen['env-PROPERTIES'])
Example #5
0
    def run_row_intuit(url, cache):
        """Intuit the row structure of ``url``, trying several text encodings.

        For each candidate encoding the URL is parsed, the encoding is set on
        it, and up to 5000 rows are read and handed to ``RowIntuiter``. The
        first encoding that reads without an encoding error wins.

        :param url: Reference to the row source; passed to ``parse_app_url``.
        :param cache: Not used by this function; kept so existing callers
            keep working.
        :return: Tuple of (encoding name, result of ``RowIntuiter().run()``).
        :raises Exception: If no candidate encoding can read the source.
        """
        # Imported locally so the block does not rely on file-level imports
        # for the names it newly needs.
        from rowgenerators import TextEncodingError, parse_app_url

        for encoding in ('ascii', 'utf8', 'latin1'):
            try:
                u = parse_app_url(url)
                u.encoding = encoding

                rows = list(islice(get_generator(str(u)), 5000))
                return encoding, RowIntuiter().run(rows)
            except (TextEncodingError, UnicodeEncodeError):
                # This encoding does not fit the source; try the next one.
                continue

        raise Exception('Failed to convert with any encoding')
Example #6
0
    def geo(self):
        """Return the ``geo`` attribute of a dataframe built from the URL's geo source.

        Only works when ``self._url`` is a ``CensusReporterURL``: its ``geo``
        URL is resolved to a target, rows are read from it, and a
        ``MetatabDataFrame`` is built with the first row as column names.

        :raises PublicDataException: If ``self._url`` is not a ``CensusReporterURL``.
        """
        from publicdata.censusreporter.url import CensusReporterURL
        from rowgenerators import get_generator
        from itertools import islice
        from metapack.jupyter.pandas import MetatabDataFrame

        if not isinstance(self._url, CensusReporterURL):
            raise PublicDataException(
                "Dataframe doesn't have a CensusReporterURL, so can't find geo source"
            )

        target = self._url.geo.get_resource().get_target()
        row_source = get_generator(target)

        # Each islice() call asks the source for a fresh iterator.
        # NOTE(review): presumably the source restarts on every iteration, so
        # the first call yields the header row and the second skips it — confirm.
        headers = next(islice(row_source, 0, 1))
        body = list(islice(row_source, 1, None))

        frame = MetatabDataFrame(body,
                                 columns=headers,
                                 metatab_resource=self)

        return frame.geo
Example #7
0
    def generator(self):
        """Return a row generator for the target of this object's resource."""
        from rowgenerators import get_generator

        # HACK: the target used to be unwrapped before use:
        #   target = self.get_resource().get_target().inner
        resource = self.get_resource()

        return get_generator(resource.get_target())
Example #8
0
    def get_row_generator(self, ref, cache=None):
        """Return a row generator for a reference.

        :param ref: Reference handed unchanged to ``rowgenerators.get_generator``.
        :param cache: Not used by this method; kept so existing callers keep
            working.
        :return: The object returned by ``get_generator(ref)``.
        :raises GenerateError: If ``get_generator`` returns a falsy value.
        """
        from rowgenerators import get_generator

        g = get_generator(ref)

        if not g:
            raise GenerateError("Cant figure out how to generate rows from {} ref: {}".format(type(ref), ref))

        return g
Example #9
0
    def generator(self):
        """
        Build a row generator for this URL.

        The ``rowgenerators`` package is imported on call, so it must be
        installed for this method to work.

        :return: The object returned by ``rowgenerators.get_generator``.
        """

        from rowgenerators import get_generator

        resolved = self.get_resource().get_target()

        # get_target() is applied once more to the already-resolved target,
        # and this URL is passed along as the source URL.
        return get_generator(resolved.get_target(), source_url=self)
Example #10
0
    def load_resource(self):
        """Bulk-insert the rows from ``self.source_url`` once, then mark this object loaded.

        Does nothing when ``self.loaded`` is already true.
        """

        from rowgenerators import parse_app_url, get_generator

        if self.loaded:
            return

        target = parse_app_url(self.source_url).get_resource().get_target()
        row_source = get_generator(target)

        # Session this object is attached to
        session = inspect(self).session

        session.bulk_insert_mappings(self.mapper, row_source.iter_dict)

        self.loaded = True
Example #11
0
    def test_sources(self):
        """Check generator class and row count for every entry in ``sources.csv``.

        Each row of the CSV supplies a ``url``, the expected generator class
        name (``gen_class``) and the expected number of rows (``n_rows``).
        Rows with an empty ``url_class`` are skipped.
        """
        from csv import DictReader

        with open(data_path('sources.csv')) as f:
            for e in DictReader(f):

                if not e['url_class']:
                    print()
                    continue

                u = parse_app_url(e['url'])
                r = u.get_resource()
                t = r.get_target()

                g = get_generator(t)

                # assertEqual, not the deprecated assertEquals alias, which
                # no longer exists in Python 3.12+.
                self.assertEqual(e['gen_class'], g.__class__.__name__)

                self.assertEqual(int(e['n_rows']), len(list(g)))
Example #12
0
def run_row_intuit(path, cache):
    """Intuit the row structure of ``path``, trying several text encodings in turn.

    :param path: Reference to the row source; parsed with ``parse_app_url``.
    :param cache: Passed through to ``get_generator``.
    :return: Tuple of (encoding name, result of ``RowIntuiter().run()``) for
        the first encoding that reads without an encoding error.
    :raises RowIntuitError: If every candidate encoding fails.
    """
    from tableintuit import RowIntuiter
    from itertools import islice
    from rowgenerators import TextEncodingError, get_generator

    candidate_encodings = ('ascii', 'utf8', 'latin1')

    for encoding in candidate_encodings:
        try:
            app_url = parse_app_url(path)
            app_url.encoding = encoding

            source = get_generator(url=str(app_url), cache=cache)

            # At most the first 5000 rows are sampled.
            sample = list(islice(source, 5000))

            # The intuiter receives its own copy of the sample.
            return encoding, RowIntuiter().run(list(sample))
        except (TextEncodingError, UnicodeEncodeError):
            # Encoding did not fit; move on to the next candidate.
            continue

    raise RowIntuitError('Failed to convert with any encoding')
Example #13
0
    def test_geo(self):
        """Resolve a remote shapefile URL and check the generator built from it.

        Asserts the types of the resource and target URLs, the tail of the
        target URL string, the generator class, its ``columns`` and
        ``headers``, and that iterating it yields 42 rows.
        """

        from rowgenerators.generator.shapefile import ShapefileSource
        from rowgenerators.appurl.shapefile import ShapefileUrl

        us = 'shape+http://s3.amazonaws.com/public.source.civicknowledge.com/sangis.org/Subregional_Areas_2010.zip'
        u = parse_app_url(us)

        r = u.get_resource()

        self.assertIsInstance(r, ShapefileUrl)

        t = r.get_target()

        self.assertIsInstance(t, ShapefileUrl)

        self.assertTrue(
            str(t).endswith(
                'public.source.civicknowledge.com/sangis.org/Subregional_Areas_2010.zip#SRA2010tiger.shp'
            ))

        g = get_generator(t)

        self.assertIsInstance(g, ShapefileSource)

        self.assertEqual([{
            'name': 'id',
            'type': 'int'
        }, {
            'name': 'SRA',
            'type': 'int'
        }, {
            'name': 'NAME',
            'type': 'str'
        }, {
            'name': 'geometry',
            'type': 'geometry_type'
        }], g.columns)
        self.assertEqual(['id', 'SRA', 'NAME', 'geometry'], g.headers)

        # assertEqual, not the deprecated assertEquals alias, which no longer
        # exists in Python 3.12+.
        self.assertEqual(42, len(list(g)))
Example #14
0
    def test_geo(self):
        """Resolve a remote shapefile URL, print each stage, and check the row count.

        The resource URL, target URL, generator type, ``columns`` and
        ``headers`` are printed; the only assertion is that iterating the
        generator yields 42 rows.
        """

        us = 'shape+http://s3.amazonaws.com/public.source.civicknowledge.com/sangis.org/Subregional_Areas_2010.zip'
        u = parse_app_url(us)

        r = u.get_resource()

        print(type(r), r)

        t = r.get_target()

        print(type(t), t)

        g = get_generator(t)

        print(type(g))

        print(g.columns)
        print(g.headers)

        # assertEqual, not the deprecated assertEquals alias, which no longer
        # exists in Python 3.12+.
        self.assertEqual(42, len(list(g)))
Example #15
0
    def test_resolve_resource_urls(self):
        """Test how resources are resolved in packages.

        Target files listed in ``packages.csv`` may be given as:
            - a name, for Excel and CSV packages
            - a path, for ZIP and filesystem packages
            - a web URL, for any kind of package

        For each record the target is resolved against the package URL, the
        resolved URL is compared with the expected one, and a generator built
        from it is expected to yield 101 rows. The ``resolve_error`` and
        ``generate_error`` columns mark records where the corresponding step
        is expected to fail.
        """
        with open(test_data('packages.csv')) as f:
            # Counting starts at 2; the number is only used in the failure
            # message of the containment check below.
            for line_no, rec in enumerate(DictReader(f), 2):

                pkg_url = MetapackPackageUrl(rec['url'],
                                             downloader=Downloader())

                try:
                    resolved = pkg_url.resolve_url(rec['target_file'])
                    self.assertFalse(bool(rec['resolve_error']))
                except ResourceError:
                    # Resolution failed: that must have been expected.
                    self.assertTrue(bool(rec['resolve_error']))
                    continue
                except DownloadError:
                    raise

                # Containment rather than equality: the resolved URL can hold
                # a local filesystem path, which depends on where the test runs.
                resolved_text = str(resolved)
                self.assertTrue(
                    rec['resolved_url'] in resolved_text,
                    (line_no, rec['resolved_url'], resolved_text))

                try:
                    target = resolved.get_resource().get_target()
                    row_count = len(list(get_generator(target)))

                    self.assertEqual(101, row_count)
                    self.assertFalse(bool(rec['generate_error']))
                except DownloadError:
                    raise
                except RowGeneratorError:
                    # Generation failed: that must have been expected.
                    self.assertTrue(bool(rec['generate_error']))
Example #16
0
    def test_notebook_url(self):
        """Execute a test notebook and read back one of the CSV files it writes.

        The notebook URL must parse to a ``JupyterNotebookUrl``. The notebook
        is executed into ``/tmp/nbtest``, the two expected CSV files must
        exist, and the rows of ``dfa.csv`` are printed.

        The test is reported as skipped when an ``ImportError`` is raised
        anywhere in the body, which covers missing optional dependencies.
        """

        try:
            from metapack.appurl import JupyterNotebookUrl
            from metapack.jupyter.exec import execute_notebook
            from os.path import exists

            u = parse_app_url(test_data('notebooks', 'GenerateDataTest.ipynb'))

            self.assertIsInstance(u, JupyterNotebookUrl)

            execute_notebook(u.path, '/tmp/nbtest', ['dfa', 'dfb'], True)

            self.assertTrue(exists('/tmp/nbtest/dfa.csv'))
            self.assertTrue(exists('/tmp/nbtest/dfb.csv'))

            g = get_generator(parse_app_url('/tmp/nbtest/dfa.csv'))

            print(list(g))
        except ImportError:
            # skipTest() raises SkipTest so the runner records a skip;
            # unittest.skip() only builds a decorator and would let the test
            # count as passed.
            self.skipTest("Missing pandas or jupyter client")
Example #17
0
    def generate_terms(self, ref, root, file_type=None):
        """A generator that yields term objects, handling includes and argument
        children.

        Rows are read from ``ref``; each non-blank, non-comment row becomes a
        term, followed by one child term per non-empty argument value.
        'include' and 'declare' terms are yielded and then expanded by
        recursing into the referenced document.

        :param ref: Either a ``Source`` instance, which is iterated directly,
            or a reference that is passed to ``get_generator`` and whose
            ``path`` attribute is used as the file name of the terms.
        :param root: Initial value of the "last section"; it is re-yielded
            after an include until a 'section' term replaces it, and it is
            passed on to recursive calls.
        :param file_type: Passed to every term constructed here as ``file_type``.
        :raises IncludeError: On an include loop, or when an included or
            declared document cannot be read. The message is extended with
            the path of the current reference.

        """

        last_section = root
        t = None

        if isinstance(ref, Source):
            row_gen = ref
            ref_path = row_gen.__class__.__name__
        else:
            row_gen = get_generator(ref)
            ref_path = ref.path

        try:
            # line_n is 1-based and is recorded on each term as its row
            for line_n, row in enumerate(row_gen, 1):

                # Skip empty rows, rows with a blank first cell, and rows whose
                # first cell starts with '#'
                if not row or not row[0] or not row[0].strip() or row[0].strip(
                ).startswith('#'):
                    continue

                tt = Term(
                    row[0], None
                )  # Just to get the qualified name constructed property

                term_class = self.get_term_class(tt.join_lc)

                # Column 0 is the term name, column 1 the value (if present),
                # remaining columns are the term's arguments
                t = term_class(tt.join_lc,
                               row[1] if len(row) > 1 else '',
                               row[2:] if len(row) > 2 else [],
                               row=line_n,
                               col=1,
                               file_name=ref_path,
                               file_type=file_type,
                               doc=self.doc)

                # Why did we remove comments from values? It strips out Markdown
                #if t.value and str(t.value).startswith('#'): # Comments are ignored
                #    continue

                if t.term_is('include') or t.term_is('declare'):

                    # Resolve the referenced document relative to the directory
                    # part of the current reference's path
                    if t.term_is('include'):
                        resolved = self.find_include_doc(
                            dirname(ref_path), t.value.strip())
                    else:
                        resolved = self.find_declare_doc(
                            dirname(ref_path), t.value.strip())

                    # A document that refers to itself would recurse forever
                    if row_gen.ref == resolved:
                        raise IncludeError(
                            "Include loop for '{}' ".format(resolved))

                    yield t

                    try:

                        sub_gen = get_generator(
                            resolved.get_resource().get_target())

                        # NOTE(review): this loop rebinds ``t``, so the
                        # ``e.term = t`` assignments in the handlers below may
                        # refer to the last term yielded from the sub-document
                        # rather than to the include/declare term — confirm
                        # that this is intended.
                        for t in self.generate_terms(
                                sub_gen, root, file_type=t.record_term_lc):
                            yield t

                        if last_section:
                            yield last_section  # Re-assert the last section

                    except IncludeError as e:
                        e.term = t
                        raise

                    except (OSError, FileNotFoundError, GenerateError,
                            DownloadError) as e:
                        # Rewrap read and download failures as IncludeError
                        e = IncludeError("Failed to Include; {}".format(e))
                        e.term = t
                        raise e

                    continue  # Already yielded the include/declare term, and includes can't have children

                elif t.term_is('section'):

                    # If there is already a section in the document, emit the existing section,
                    # rather than a new one.
                    try:
                        last_section = self.doc[t.name]
                        t = last_section

                    except (KeyError,
                            TypeError):  # TypeError -> self.doc is None
                        last_section = t

                yield t

                # Yield any child terms, from the term row arguments
                if not t.term_is('section') and not t.term_is('header'):
                    for col, value in enumerate(t.args, 0):
                        # Blank argument values produce no child term
                        if str(value).strip():

                            # Child terms are named "<parent record term>.<argument index>"
                            term_name = t.record_term_lc + '.' + str(col)

                            term_class = self.get_term_class(term_name)

                            yield term_class(
                                term_name,
                                str(value),
                                [],
                                row=line_n,
                                col=col +
                                2,  # The 0th argument starts in col 2
                                file_name=ref_path,
                                file_type=file_type,
                                parent=t)  #,
                            #doc=None,
                            #section=last_section)
        except IncludeError as e:
            # Re-raise with the current reference's path appended, keeping the
            # offending term if one was recorded
            exc = IncludeError(str(e) + "; in '{}' ".format(ref_path))
            exc.term = e.term if hasattr(e, 'term') else None
            raise exc
Example #18
0
    def get_generator(self, cache=None, working_dir=None):
        """Build a row generator for this object with ``rowgenerators.get_generator``.

        :param cache: Passed through as the second positional argument.
        :param working_dir: Passed through as the ``working_dir`` keyword.
        :return: The object returned by ``rowgenerators.get_generator``.
        """
        # Aliased so the call below does not read like recursion into this method.
        from rowgenerators import get_generator as build_generator

        return build_generator(self, cache, working_dir=working_dir)